Re: [PATCH v2] kvm: nVMX: fix entry with pending interrupt if APICv is enabled

2018-10-04 Thread Nikita Leshenko
On Wed, 2018-10-03 at 17:26 +0200, Paolo Bonzini wrote:
> Commit b5861e5cf2fcf83031ea3e26b0a69d887adf7d21 introduced a check on
> the interrupt-window and NMI-window CPU execution controls in order to
> inject an external interrupt vmexit before the first guest instruction
> executes.  However, when APIC virtualization is enabled the host does not
> need a vmexit in order to inject an interrupt at the next interrupt window;
> instead, it just places the interrupt vector in RVI and the processor will
> inject it as soon as possible.  Therefore, on machines with APICv it is
> not enough to check the CPU execution controls: the same scenario can also
> happen if RVI>0.
> 
> Fixes: b5861e5cf2fcf83031ea3e26b0a69d887adf7d21
> Cc: Nikita Leshchenko 
> Cc: Sean Christopherson 
> Cc: Liran Alon 
> Cc: Radim Krčmář 
> Signed-off-by: Paolo Bonzini 

Reviewed-by: Nikita Leshenko 

> ---
>  arch/x86/kvm/vmx.c | 38 ++
>  1 file changed, 26 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 6ef2d5b139b9..c7ae8ea87bc4 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -6162,6 +6162,11 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
>   nested_mark_vmcs12_pages_dirty(vcpu);
>  }
>  
> +static u8 vmx_get_rvi(void)
> +{
> + return vmcs_read16(GUEST_INTR_STATUS) & 0xff;
> +}
> +
>  static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
>  {
>   struct vcpu_vmx *vmx = to_vmx(vcpu);
> @@ -6174,7 +6179,7 @@ static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
>   WARN_ON_ONCE(!vmx->nested.virtual_apic_page))
>   return false;
>  
> - rvi = vmcs_read16(GUEST_INTR_STATUS) & 0xff;
> + rvi = vmx_get_rvi();
>  
>   vapic_page = kmap(vmx->nested.virtual_apic_page);
>   vppr = *((u32 *)(vapic_page + APIC_PROCPRI));
> @@ -10349,6 +10354,14 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
>   return max_irr;
>  }
>  
> +static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu)
> +{
> + u8 rvi = vmx_get_rvi();
> + u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI);
> +
> + return ((rvi & 0xf0) > (vppr & 0xf0));
> +}
> +
>  static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
>  {
>   if (!kvm_vcpu_apicv_active(vcpu))
> @@ -12593,10 +12606,13 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual)
>   struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
>   bool from_vmentry = !!exit_qual;
>   u32 dummy_exit_qual;
> - u32 vmcs01_cpu_exec_ctrl;
> + bool evaluate_pending_interrupts;
>   int r = 0;
>  
> - vmcs01_cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
> + evaluate_pending_interrupts = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
> + (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING);
> + if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
> + evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
>  
>   enter_guest_mode(vcpu);
>  
> @@ -12644,16 +12660,14 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual)
>* to L1 or delivered directly to L2 (e.g. In case L1 don't
>* intercept EXTERNAL_INTERRUPT).
>*
> -  * Usually this would be handled by L0 requesting a
> -  * IRQ/NMI window by setting VMCS accordingly. However,
> -  * this setting was done on VMCS01 and now VMCS02 is active
> -  * instead. Thus, we force L0 to perform pending event
> -  * evaluation by requesting a KVM_REQ_EVENT.
> -  */
> - if (vmcs01_cpu_exec_ctrl &
> - (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING)) {
> +  * Usually this would be handled by the processor noticing an
> +  * IRQ/NMI window request, or checking RVI during evaluation of
> +  * pending virtual interrupts.  However, this setting was done
> +  * on VMCS01 and now VMCS02 is active instead. Thus, we force L0
> +  * to perform pending event evaluation by requesting a KVM_REQ_EVENT.
> +  */
> + if (unlikely(evaluate_pending_interrupts))
>   kvm_make_request(KVM_REQ_EVENT, vcpu);
> - }
>  
>   /*
>* Note no nested_vmx_succeed or nested_vmx_fail here. At this point



Re: [PATCH] kvm: nVMX: fix entry with pending interrupt if APICv is enabled

2018-10-03 Thread Nikita Leshenko
On Wed, 2018-10-03 at 13:47 +0200, Paolo Bonzini wrote:
> Commit b5861e5cf2fcf83031ea3e26b0a69d887adf7d21 introduced a check on
> the interrupt-window and NMI-window CPU execution controls in order to
> inject an external interrupt vmexit before the first guest instruction
> executes.  However, when APIC virtualization is enabled the host does not
> need a vmexit in order to inject an interrupt at the next interrupt window;
> instead, it just places the interrupt vector in RVI and the processor will
> inject it as soon as possible.  Therefore, on machines with APICv it is
> not enough to check the CPU execution controls: the same scenario can also
> happen if RVI>0.
> 
> Fixes: b5861e5cf2fcf83031ea3e26b0a69d887adf7d21
> Cc: Nikita Leshchenko 
> Cc: Sean Christopherson 
> Cc: Liran Alon 
> Cc: Radim Krčmář 
> Signed-off-by: Paolo Bonzini 
> ---
>  arch/x86/kvm/vmx.c | 16 +++-
>  1 file changed, 11 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 6ef2d5b139b9..c0c7689f0049 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -10280,6 +10280,11 @@ static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
>   }
>  }
>  
> +static u8 vmx_get_rvi(void)
> +{
> + return vmcs_read16(GUEST_INTR_STATUS) & 0xff;
> +}
> +
>  static void vmx_set_rvi(int vector)
>  {
>   u16 status;
> @@ -12593,10 +12598,13 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual)
>   struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
>   bool from_vmentry = !!exit_qual;
>   u32 dummy_exit_qual;
> - u32 vmcs01_cpu_exec_ctrl;
> + bool evaluate_pending_interrupts;
>   int r = 0;
>  
> - vmcs01_cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
> + evaluate_pending_interrupts = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
> + (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING);
> + if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
> + evaluate_pending_interrupts |= vmx_get_rvi() > 0;

You should check for RVI > VPPR, similarly to how it is done in
vmx_guest_apic_has_interrupt().
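For illustration, only the priority class (the upper four bits) of each
register matters for deliverability; roughly, with a hypothetical helper
that is not part of the patch:

	/* Sketch only: an interrupt pending in RVI is deliverable only if
	 * its priority class exceeds the processor-priority class in VPPR. */
	static bool apicv_interrupt_deliverable(u8 rvi, u8 vppr)
	{
		return (rvi & 0xf0) > (vppr & 0xf0);
	}

	/* Example: RVI = 0x31 (class 3) with VPPR = 0x40 (class 4) means the
	 * pending vector is masked by PPR, so no event evaluation is needed. */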

Also, now that you introduced vmx_get_rvi(), it could be nice to use it
in vmx_guest_apic_has_interrupt() as well.

Apart from that, looks good.

Reviewed-by: Nikita Leshenko 

Thanks,
Nikita

>  
>   enter_guest_mode(vcpu);
>  
> @@ -12650,10 +12658,8 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual)
>* instead. Thus, we force L0 to perform pending event
>* evaluation by requesting a KVM_REQ_EVENT.
>*/
> - if (vmcs01_cpu_exec_ctrl &
> - (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING)) {
> + if (evaluate_pending_interrupts)
>   kvm_make_request(KVM_REQ_EVENT, vcpu);
> - }
>  
>   /*
>* Note no nested_vmx_succeed or nested_vmx_fail here. At this point



Re: [RFC][PATCH 3/5] [PATCH 3/5] kvm-ept-idle: HVA indexed EPT read

2018-09-04 Thread Nikita Leshenko
On 1 Sep 2018, at 13:28, Fengguang Wu  wrote:
> +static ssize_t ept_idle_read(struct file *file, char *buf,
> +  size_t count, loff_t *ppos)
> +{
> + struct task_struct *task = file->private_data;
> + struct ept_idle_ctrl *eic;
> + unsigned long hva_start = *ppos << BITMAP_BYTE2PVA_SHIFT;
> + unsigned long hva_end = hva_start + (count << BITMAP_BYTE2PVA_SHIFT);
> + int ret;
> +
> + if (*ppos % IDLE_BITMAP_CHUNK_SIZE ||
> + count % IDLE_BITMAP_CHUNK_SIZE)
> + return -EINVAL;
> +
> + eic = kzalloc(sizeof(*eic), GFP_KERNEL);
> + if (!eic)
> + return -EBUSY;
> +
> + eic->buf = buf;
> + eic->buf_size = count;
> + eic->kvm = task_kvm(task);
> + if (!eic->kvm) {
> + ret = -EINVAL;
> + goto out_free;
> + }
I think you need to increment the refcount while using kvm,
otherwise kvm can be destroyed from another thread while you're
walking it.
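For example, something along these lines (a sketch only, assuming
kvm_get_kvm()/kvm_put_kvm() are usable from this module to pin the VM):

	eic->kvm = task_kvm(task);
	if (!eic->kvm) {
		ret = -EINVAL;
		goto out_free;
	}
	/* Hold a reference so the VM cannot be destroyed during the walk. */
	kvm_get_kvm(eic->kvm);

	ret = ept_idle_walk_hva_range(eic, hva_start, hva_end);

	/* Drop the reference before returning. */
	kvm_put_kvm(eic->kvm);
	if (ret)
		goto out_free;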

-Nikita
> +
> + ret = ept_idle_walk_hva_range(eic, hva_start, hva_end);
> + if (ret)
> + goto out_free;
> +
> + ret = eic->bytes_copied;
> + *ppos += ret;
> +out_free:
> + kfree(eic);
> +
> + return ret;
> +}


Re: [RFC][PATCH 1/5] [PATCH 1/5] kvm: register in task_struct

2018-09-04 Thread Nikita Leshenko
On 4 Sep 2018, at 2:46, Fengguang Wu  wrote:
> 
> Here it goes:
> 
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 99ce070e7dcb..27c5446f3deb 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -27,6 +27,7 @@ typedef int vm_fault_t;
> struct address_space;
> struct mem_cgroup;
> struct hmm;
> +struct kvm;
> /*
> * Each physical page in the system has a struct page associated with
> @@ -489,10 +490,19 @@ struct mm_struct {
>   /* HMM needs to track a few things per mm */
>   struct hmm *hmm;
> #endif
> +#if IS_ENABLED(CONFIG_KVM)
> + struct kvm *kvm;
> +#endif
> } __randomize_layout;
> extern struct mm_struct init_mm;
> +#if IS_ENABLED(CONFIG_KVM)
> +static inline struct kvm *mm_kvm(struct mm_struct *mm) { return mm->kvm; }
> +#else
> +static inline struct kvm *mm_kvm(struct mm_struct *mm) { return NULL; }
> +#endif
> +
> static inline void mm_init_cpumask(struct mm_struct *mm)
> {
> #ifdef CONFIG_CPUMASK_OFFSTACK
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 0c483720de8d..dca6156a7b35 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -3892,7 +3892,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
>   if (type == KVM_EVENT_CREATE_VM) {
>   add_uevent_var(env, "EVENT=create");
>   kvm->userspace_pid = task_pid_nr(current);
> - current->kvm = kvm;
> + current->mm->kvm = kvm;
I think you also need to reset kvm to NULL once the VM is
destroyed, otherwise it would point to dangling memory.
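For example (a sketch only; if this uevent can run outside the creating
process, kvm_destroy_vm() may be a better place to clear the pointer):

	} else if (type == KVM_EVENT_DESTROY_VM) {
		add_uevent_var(env, "EVENT=destroy");
		/* Drop the back-pointer so it never dangles. */
		if (current->mm && current->mm->kvm == kvm)
			current->mm->kvm = NULL;
	}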

-Nikita
>   } else if (type == KVM_EVENT_DESTROY_VM) {
>   add_uevent_var(env, "EVENT=destroy");
>   }



Re: [RFC][PATCH 1/5] [PATCH 1/5] kvm: register in task_struct

2018-09-03 Thread Nikita Leshenko
On September 2, 2018 5:21:15 AM, fengguang...@intel.com wrote:
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 8b47507faab5..0c483720de8d 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -3892,6 +3892,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
>   if (type == KVM_EVENT_CREATE_VM) {
>   add_uevent_var(env, "EVENT=create");
>   kvm->userspace_pid = task_pid_nr(current);
> + current->kvm = kvm;

Is it OK to store `kvm` on the task_struct? What if the thread that
originally created the VM exits? From the documentation it seems
like a VM is associated with an address space and not a specific
thread, so maybe it should be stored on mm_struct?

From Documentation/virtual/kvm/api.txt:
   Only run VM ioctls from the same process (address space) that was used
   to create the VM.

-Nikita
>   } else if (type == KVM_EVENT_DESTROY_VM) {
>   add_uevent_var(env, "EVENT=destroy");
>   }
> -- 
> 2.15.0
> 
> 
> 


Re: [PATCH 2/2] KVM: VMX: Add Force Emulation Prefix for "emulate the next instruction"

2018-03-27 Thread Nikita Leshenko
What you are essentially trying to do is create a PV interface to access
the x86 emulator.
Why not use a simple hypercall (VMCALL) to accomplish this instead of
inventing yet another PV method?

Something like “KVM_HC_EMULATE_NEXT_INSTRUCTION” in kvm_emulate_hypercall
should do the trick (however it needs to be placed before the check for
CPL>0 so that user mode code can test the emulator too).
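Roughly (a sketch only; KVM_HC_EMULATE_NEXT_INSTRUCTION is a made-up
constant and the exact emulator entry point may differ):

	int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
	{
		unsigned long nr = kvm_register_read(vcpu, VCPU_REGS_RAX);

		/* Handled before the CPL check so guest user mode can
		 * exercise the emulator as well. */
		if (nr == KVM_HC_EMULATE_NEXT_INSTRUCTION)
			return emulate_instruction(vcpu, 0) == EMULATE_DONE;

		/* ... existing CPL check and hypercall dispatch ... */
	}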

Nikita

> On 27 Mar 2018, at 11:26, Liran Alon  wrote:
> 
> 
> - pbonz...@redhat.com wrote:
> 
>> On 27/03/2018 09:52, Liran Alon wrote:
>>> In addition, I think this module parameter should be in kvm module
>>> (not kvm_intel) and you should add similar logic to kvm_amd module
>> (SVM)
>> 
>> If you can move handle_ud to x86.c, then it makes sense to have the
>> module parameter in the kvm module.  I haven't checked.
> 
> I don't see a reason why you couldn't do that.
> 
>> 
>> Otherwise you would have to EXPORT_SYMBOL_GPL the variable; in the
> 
> This is what I did for enable_vmware_backdoor module parameter.
> I think this is what should be done in this case as-well.
> 
>> end
>> it's just a debugging tool, so it'd be simpler to just add it
>> separately
>> to kvm_intel and kvm_amd.
> 
> I agree it's just a debugging tool. But no reason for it to be used 
> differently
> when running tests on Intel CPU vs. AMD CPU.
> I think the effort to fix this is low.
> 
>> 
>> Paolo
> 
> 



Re: [PATCH] KVM: lapic: stop advertising DIRECTED_EOI when in-kernel IOAPIC is in use

2018-02-09 Thread Nikita Leshenko
The patch looks correct; however, I’m confused about why you consider
this to be a bug in the guest rather than a bug in KVM.

The spec for x2APIC states:
"The support for Directed EOI capability can be detected by means of
bit 24 in the Local APIC Version Register” (Intel’s x2APIC spec, 2.5.1
Directed EOI)
It seems to me that Windows did the right thing by testing for the
presence of directed EOI feature rather than implying it exists by
testing a version number. KVM did the wrong thing by advertising a
feature it doesn’t support.
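A guest following the spec could check along these lines (a sketch, using
Linux's APIC_LVR and APIC_LVR_DIRECTED_EOI definitions) before disabling
EOI broadcast in SPIV:

	/* Sketch only: true iff bit 24 of the Local APIC Version Register
	 * (Directed EOI support) is set. */
	static bool lapic_has_directed_eoi(void)
	{
		return apic_read(APIC_LVR) & APIC_LVR_DIRECTED_EOI;
	}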

Therefore I think that you should change the comment to something like
“KVM’s in-kernel IOAPIC doesn’t support Directed EOI register, so don’t
advertise this capability in the LAPIC Version Register.” instead of
talking about buggy guests, as it may confuse future readers of this
code.

Thanks,
Nikita
> On 9 Feb 2018, at 15:01, Vitaly Kuznetsov  wrote:
> 
> Devices which use level-triggered interrupts under Windows 2016 with
> Hyper-V role enabled don't work: Windows disables EOI broadcast in SPIV
> unconditionally. Our in-kernel IOAPIC implementation emulates an old IOAPIC
> version which has no EOI register so EOI never happens.
> 
> The issue was discovered and discussed a while ago:
> https://www.spinics.net/lists/kvm/msg148098.html
> 
> While this is a guest OS bug (it should check that IOAPIC has the required
> capabilities before disabling EOI broadcast) we can workaround it in KVM:
> advertising DIRECTED_EOI with in-kernel IOAPIC makes little sense anyway.
> 
> Signed-off-by: Vitaly Kuznetsov 
> ---
> - Radim's suggestion was to disable DIRECTED_EOI unconditionally but I'm not
>  that radical :-) In theory, we may have multiple IOAPICs in userspace in
>  future and DIRECTED_EOI can be leveraged.
> ---
> arch/x86/kvm/lapic.c | 10 +-
> 1 file changed, 9 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 924ac8ce9d50..5339287fee63 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -321,8 +321,16 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu)
>   if (!lapic_in_kernel(vcpu))
>   return;
> 
> + /*
> +  * KVM emulates 82093AA datasheet (with in-kernel IOAPIC implementation)
> +  * which doesn't have EOI register; Some buggy OSes (e.g. Windows with
> +  * Hyper-V role) disable EOI broadcast in lapic not checking for IOAPIC
> +  * version first and level-triggered interrupts never get EOIed in
> +  * IOAPIC.
> +  */
>   feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0);
> - if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31))))
> + if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31))) &&
> + !ioapic_in_kernel(vcpu->kvm))
>   v |= APIC_LVR_DIRECTED_EOI;
>   kvm_lapic_set_reg(apic, APIC_LVR, v);
> }
> -- 
> 2.14.3
> 


