Re: [PATCH v2] KVM: nVMX: Fix read/write to MSR_IA32_FEATURE_CONTROL

2013-07-07 Thread Gleb Natapov
On Fri, Jul 05, 2013 at 11:13:17PM +0800, Arthur Chunqi Li wrote:
 Fix read/write to IA32_FEATURE_CONTROL MSR in nested environment.
 
 This patch simulate this MSR in nested_vmx and the default value is
 0x0. BIOS should set it to 0x5 before VMXON. After setting the lock
 bit, write to it will cause #GP(0).
 
 Signed-off-by: Arthur Chunqi Li yzt...@gmail.com
The patch is heavily based on Nadav's one. You need to preserve
authorship. Add a From: line and Nadav's Signed-off-by.

 ---
  arch/x86/kvm/vmx.c |   25 +
  arch/x86/kvm/x86.c |3 ++-
  2 files changed, 23 insertions(+), 5 deletions(-)
 
 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
 index 260a919..5e3d44e 100644
 --- a/arch/x86/kvm/vmx.c
 +++ b/arch/x86/kvm/vmx.c
 @@ -373,6 +373,7 @@ struct nested_vmx {
* we must keep them pinned while L2 runs.
*/
   struct page *apic_access_page;
 + u64 msr_ia32_feature_control;
  };
  
  #define POSTED_INTR_ON  0
 @@ -2277,8 +2278,11 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 
 msr_index, u64 *pdata)
  
   switch (msr_index) {
   case MSR_IA32_FEATURE_CONTROL:
 - *pdata = 0;
 - break;
 + if (nested_vmx_allowed(vcpu)){
 + *pdata = to_vmx(vcpu)-nested.msr_ia32_feature_control;
 + break;
 + }
 + return 0;
   case MSR_IA32_VMX_BASIC:
   /*
* This MSR reports some information about VMX support. We
 @@ -2356,9 +2360,13 @@ static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 
 msr_index, u64 data)
   if (!nested_vmx_allowed(vcpu))
   return 0;
  
 - if (msr_index == MSR_IA32_FEATURE_CONTROL)
 - /* TODO: the right thing. */
 + if (msr_index == MSR_IA32_FEATURE_CONTROL){
 + if (to_vmx(vcpu)-nested.msr_ia32_feature_control
 +  FEATURE_CONTROL_LOCKED)
 + return 0;
 + to_vmx(vcpu)-nested.msr_ia32_feature_control = data;
You need to allow setting it from userspace, otherwise reset will not
clear it.

   return 1;
 + }
   /*
* No need to treat VMX capability MSRs specially: If we don't handle
* them, handle_wrmsr will #GP(0), which is correct (they are readonly)
 @@ -5595,6 +5603,15 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
   skip_emulated_instruction(vcpu);
   return 1;
   }
 +
 +#define VMXON_NEEDED_FEATURES \
Avi asked to use const u64 here.

 +   (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
 + if ((vmx-nested.msr_ia32_feature_control  VMXON_NEEDED_FEATURES)
 + != VMXON_NEEDED_FEATURES) {
 + kvm_inject_gp(vcpu, 0);
 + return 1;
 + }
 +
   if (enable_shadow_vmcs) {
   shadow_vmcs = alloc_vmcs();
   if (!shadow_vmcs)
 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
 index e8ba99c..2d4eb8e 100644
 --- a/arch/x86/kvm/x86.c
 +++ b/arch/x86/kvm/x86.c
 @@ -850,7 +850,8 @@ static u32 msrs_to_save[] = {
  #ifdef CONFIG_X86_64
   MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
  #endif
 - MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
 + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
 + MSR_IA32_FEATURE_CONTROL
  };
  
  static unsigned num_msrs_to_save;
 -- 
 1.7.9.5

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2] KVM: nVMX: Fix read/write to MSR_IA32_FEATURE_CONTROL

2013-07-07 Thread Arthur Chunqi Li
On Sun, Jul 7, 2013 at 2:59 PM, Gleb Natapov g...@redhat.com wrote:
 On Fri, Jul 05, 2013 at 11:13:17PM +0800, Arthur Chunqi Li wrote:
 Fix read/write to IA32_FEATURE_CONTROL MSR in nested environment.

 This patch simulate this MSR in nested_vmx and the default value is
 0x0. BIOS should set it to 0x5 before VMXON. After setting the lock
 bit, write to it will cause #GP(0).

 Signed-off-by: Arthur Chunqi Li yzt...@gmail.com
 The patch is heavily based on Nadav's one. You need to preserve
 authorship. Add From: and Nadav's Sign-off-by.

 ---
  arch/x86/kvm/vmx.c |   25 +
  arch/x86/kvm/x86.c |3 ++-
  2 files changed, 23 insertions(+), 5 deletions(-)

 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
 index 260a919..5e3d44e 100644
 --- a/arch/x86/kvm/vmx.c
 +++ b/arch/x86/kvm/vmx.c
 @@ -373,6 +373,7 @@ struct nested_vmx {
* we must keep them pinned while L2 runs.
*/
   struct page *apic_access_page;
 + u64 msr_ia32_feature_control;
  };

  #define POSTED_INTR_ON  0
 @@ -2277,8 +2278,11 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 
 msr_index, u64 *pdata)

   switch (msr_index) {
   case MSR_IA32_FEATURE_CONTROL:
 - *pdata = 0;
 - break;
 + if (nested_vmx_allowed(vcpu)){
 + *pdata = to_vmx(vcpu)-nested.msr_ia32_feature_control;
 + break;
 + }
 + return 0;
   case MSR_IA32_VMX_BASIC:
   /*
* This MSR reports some information about VMX support. We
 @@ -2356,9 +2360,13 @@ static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 
 msr_index, u64 data)
   if (!nested_vmx_allowed(vcpu))
   return 0;

 - if (msr_index == MSR_IA32_FEATURE_CONTROL)
 - /* TODO: the right thing. */
 + if (msr_index == MSR_IA32_FEATURE_CONTROL){
 + if (to_vmx(vcpu)-nested.msr_ia32_feature_control
 +  FEATURE_CONTROL_LOCKED)
 + return 0;
 + to_vmx(vcpu)-nested.msr_ia32_feature_control = data;
 You need to allow setting it from userspace, otherwise reset will not
 clear it.
As far as I remember, if the lock bit of IA32_FEATURE_CONTROL is set, this
MSR cannot be set or reset. So what do you mean by setting it from
userspace?

   return 1;
 + }
   /*
* No need to treat VMX capability MSRs specially: If we don't handle
* them, handle_wrmsr will #GP(0), which is correct (they are readonly)
 @@ -5595,6 +5603,15 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
   skip_emulated_instruction(vcpu);
   return 1;
   }
 +
 +#define VMXON_NEEDED_FEATURES \
 Avi asked to use const u64 here.

 +   (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
 + if ((vmx-nested.msr_ia32_feature_control  VMXON_NEEDED_FEATURES)
 + != VMXON_NEEDED_FEATURES) {
 + kvm_inject_gp(vcpu, 0);
 + return 1;
 + }
 +
   if (enable_shadow_vmcs) {
   shadow_vmcs = alloc_vmcs();
   if (!shadow_vmcs)
 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
 index e8ba99c..2d4eb8e 100644
 --- a/arch/x86/kvm/x86.c
 +++ b/arch/x86/kvm/x86.c
 @@ -850,7 +850,8 @@ static u32 msrs_to_save[] = {
  #ifdef CONFIG_X86_64
   MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
  #endif
 - MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
 + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
 + MSR_IA32_FEATURE_CONTROL
  };

  static unsigned num_msrs_to_save;
 --
 1.7.9.5

 --
 Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2] KVM: nVMX: Fix read/write to MSR_IA32_FEATURE_CONTROL

2013-07-07 Thread Gleb Natapov
On Sun, Jul 07, 2013 at 03:16:08PM +0800, Arthur Chunqi Li wrote:
 On Sun, Jul 7, 2013 at 2:59 PM, Gleb Natapov g...@redhat.com wrote:
  On Fri, Jul 05, 2013 at 11:13:17PM +0800, Arthur Chunqi Li wrote:
  Fix read/write to IA32_FEATURE_CONTROL MSR in nested environment.
 
  This patch simulate this MSR in nested_vmx and the default value is
  0x0. BIOS should set it to 0x5 before VMXON. After setting the lock
  bit, write to it will cause #GP(0).
 
  Signed-off-by: Arthur Chunqi Li yzt...@gmail.com
  The patch is heavily based on Nadav's one. You need to preserve
  authorship. Add From: and Nadav's Sign-off-by.
 
  ---
   arch/x86/kvm/vmx.c |   25 +
   arch/x86/kvm/x86.c |3 ++-
   2 files changed, 23 insertions(+), 5 deletions(-)
 
  diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
  index 260a919..5e3d44e 100644
  --- a/arch/x86/kvm/vmx.c
  +++ b/arch/x86/kvm/vmx.c
  @@ -373,6 +373,7 @@ struct nested_vmx {
 * we must keep them pinned while L2 runs.
 */
struct page *apic_access_page;
  + u64 msr_ia32_feature_control;
   };
 
   #define POSTED_INTR_ON  0
  @@ -2277,8 +2278,11 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, 
  u32 msr_index, u64 *pdata)
 
switch (msr_index) {
case MSR_IA32_FEATURE_CONTROL:
  - *pdata = 0;
  - break;
  + if (nested_vmx_allowed(vcpu)){
  + *pdata = 
  to_vmx(vcpu)-nested.msr_ia32_feature_control;
  + break;
  + }
  + return 0;
case MSR_IA32_VMX_BASIC:
/*
 * This MSR reports some information about VMX support. We
  @@ -2356,9 +2360,13 @@ static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, 
  u32 msr_index, u64 data)
if (!nested_vmx_allowed(vcpu))
return 0;
 
  - if (msr_index == MSR_IA32_FEATURE_CONTROL)
  - /* TODO: the right thing. */
  + if (msr_index == MSR_IA32_FEATURE_CONTROL){
  + if (to_vmx(vcpu)-nested.msr_ia32_feature_control
  +  FEATURE_CONTROL_LOCKED)
  + return 0;
  + to_vmx(vcpu)-nested.msr_ia32_feature_control = data;
  You need to allow setting it from userspace, otherwise reset will not
  clear it.
 As to my memory, if the lock bit of IA32_FEATURE_CONTROL is set, this
 MSR cannot be set or reset. So what do you mean setting it from
 userspace?
It can be reset by power cycling the CPU obviously, so when userspace emulates
a power-cycle reset it has to be able to zero the MSR. BTW you need to write
a corresponding QEMU patch to handle this MSR during reset and migration.

 
return 1;
  + }
/*
 * No need to treat VMX capability MSRs specially: If we don't handle
 * them, handle_wrmsr will #GP(0), which is correct (they are 
  readonly)
  @@ -5595,6 +5603,15 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
skip_emulated_instruction(vcpu);
return 1;
}
  +
  +#define VMXON_NEEDED_FEATURES \
  Avi asked to use const u64 here.
 
  +   (FEATURE_CONTROL_LOCKED | 
  FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
  + if ((vmx-nested.msr_ia32_feature_control  VMXON_NEEDED_FEATURES)
  + != VMXON_NEEDED_FEATURES) {
  + kvm_inject_gp(vcpu, 0);
  + return 1;
  + }
  +
if (enable_shadow_vmcs) {
shadow_vmcs = alloc_vmcs();
if (!shadow_vmcs)
  diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
  index e8ba99c..2d4eb8e 100644
  --- a/arch/x86/kvm/x86.c
  +++ b/arch/x86/kvm/x86.c
  @@ -850,7 +850,8 @@ static u32 msrs_to_save[] = {
   #ifdef CONFIG_X86_64
MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
   #endif
  - MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
  + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
  + MSR_IA32_FEATURE_CONTROL
   };
 
   static unsigned num_msrs_to_save;
  --
  1.7.9.5
 
  --
  Gleb.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH] uio: uio_pci_generic: Add support for MSI interrupts

2013-07-07 Thread Michael S. Tsirkin
On Thu, Jul 04, 2013 at 09:35:00AM -0700, Guenter Roeck wrote:
 On Thu, Jul 04, 2013 at 05:34:12PM +0300, Michael S. Tsirkin wrote:
  On Thu, Jul 04, 2013 at 07:25:23AM -0700, Guenter Roeck wrote:
   On Thu, Jul 04, 2013 at 10:20:23AM +0300, Michael S. Tsirkin wrote:
On Thu, Jun 27, 2013 at 10:00:52AM -0700, Guenter Roeck wrote:
 On Thu, Jun 27, 2013 at 10:45:01AM +0300, Michael S. Tsirkin wrote:
  On Wed, Jun 26, 2013 at 03:30:23PM -0700, Guenter Roeck wrote:
   Enable support for MSI interrupts if the device supports it.
   Since MSI interrupts are edge triggered, it is no longer 
   necessary to
   disable interrupts in the kernel and re-enable them from 
   user-space.
   Instead, clearing the interrupt condition in the user space 
   application
   automatically re-enables the interrupt.
   
   Signed-off-by: Guenter Roeck li...@roeck-us.net
   ---
   An open question is if we can just do this unconditionally
   or if there should be some flag to enable it. A module parameter, 
   maybe ?
  
  NACK
  
  UIO is for devices that don't do memory writes.
  Anything that can do writes must be protected by an IOMMU
  and/or have a secure kernel driver, not a UIO stub.
  
  MSI is done by memory writes so if userspace
  controls the device it can trick it to write
  anywhere in memory.
  
 Just out of curiosity: Since MSI support is mandatory for all PCIE 
 devices,
 isn't that possible anyway, even if MSI is not enabled by the kernel ?
 All one would need to do is to enable MSI from user space; after all,
 the chip configuration space is writable.
 
 Thanks,
 Guenter

If a device has capability to do writes, sure. So don't do this then :)

   Not an option. I need to use MSI.
   
   Not that it matters anymore - turns out it was better writing a specific 
   driver
   for my devices anyway; I needed to be able to disable chip interrupts 
   before
   unloading the driver. But why is it then a reason to NACK this patch ?
  
  There seem to be two cases - either you can't access the device -
  and the uio driver is not useful - or you can, and it's not safe.
  In both cases the patch does not seem to bring about anything
  except user confusion ...
  
 Sounds like claiming that supporting MSI would cause some kind of confusion.
 Not sure if I can follow that logic.
 
 Actually, it simplifies the user space code a lot. Since MSI interrupts are 
 edge
 triggered and queued, it is not necessary to disable the interrupts, and all
 user space has to do is to remove the interrupt reason. Works quite nicely for
 our devices.
 
 As I said, I don't really care too much anymore if this patch is rejected, but
 the reasons for the rejection are kind of weak.
 
   Besides, doesn't one have to be root anyway to perform such activities,
   which could then be more easily accomplished by writing into /dev/mem ?
   
   Thanks,
   Guenter
  
  root might not be able to write into /dev/mem.
  
 I don't really want to try, but at least it is marked rw for root.
 
 Guenter

Yes but mechanisms such as selinux can still block it.

-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Bug 60505] Heavy network traffic triggers vhost_net lockup

2013-07-07 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=60505

Michael S. Tsirkin m.s.tsir...@gmail.com changed:

   What|Removed |Added

 CC||m.s.tsir...@gmail.com

--- Comment #2 from Michael S. Tsirkin m.s.tsir...@gmail.com ---
does this still trigger if you disable zerocopy tx?

-- 
You are receiving this mail because:
You are watching the assignee of the bug.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2] KVM: nVMX: Fix read/write to MSR_IA32_FEATURE_CONTROL

2013-07-07 Thread Arthur Chunqi Li
On Sun, Jul 7, 2013 at 3:20 PM, Gleb Natapov g...@redhat.com wrote:
 On Sun, Jul 07, 2013 at 03:16:08PM +0800, Arthur Chunqi Li wrote:
 On Sun, Jul 7, 2013 at 2:59 PM, Gleb Natapov g...@redhat.com wrote:
  On Fri, Jul 05, 2013 at 11:13:17PM +0800, Arthur Chunqi Li wrote:
  Fix read/write to IA32_FEATURE_CONTROL MSR in nested environment.
 
  This patch simulate this MSR in nested_vmx and the default value is
  0x0. BIOS should set it to 0x5 before VMXON. After setting the lock
  bit, write to it will cause #GP(0).
 
  Signed-off-by: Arthur Chunqi Li yzt...@gmail.com
  The patch is heavily based on Nadav's one. You need to preserve
  authorship. Add From: and Nadav's Sign-off-by.
 
  ---
   arch/x86/kvm/vmx.c |   25 +
   arch/x86/kvm/x86.c |3 ++-
   2 files changed, 23 insertions(+), 5 deletions(-)
 
  diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
  index 260a919..5e3d44e 100644
  --- a/arch/x86/kvm/vmx.c
  +++ b/arch/x86/kvm/vmx.c
  @@ -373,6 +373,7 @@ struct nested_vmx {
 * we must keep them pinned while L2 runs.
 */
struct page *apic_access_page;
  + u64 msr_ia32_feature_control;
   };
 
   #define POSTED_INTR_ON  0
  @@ -2277,8 +2278,11 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, 
  u32 msr_index, u64 *pdata)
 
switch (msr_index) {
case MSR_IA32_FEATURE_CONTROL:
  - *pdata = 0;
  - break;
  + if (nested_vmx_allowed(vcpu)){
  + *pdata = 
  to_vmx(vcpu)-nested.msr_ia32_feature_control;
  + break;
  + }
  + return 0;
case MSR_IA32_VMX_BASIC:
/*
 * This MSR reports some information about VMX support. We
  @@ -2356,9 +2360,13 @@ static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, 
  u32 msr_index, u64 data)
if (!nested_vmx_allowed(vcpu))
return 0;
 
  - if (msr_index == MSR_IA32_FEATURE_CONTROL)
  - /* TODO: the right thing. */
  + if (msr_index == MSR_IA32_FEATURE_CONTROL){
  + if (to_vmx(vcpu)-nested.msr_ia32_feature_control
  +  FEATURE_CONTROL_LOCKED)
  + return 0;
  + to_vmx(vcpu)-nested.msr_ia32_feature_control = data;
  You need to allow setting it from userspace, otherwise reset will not
  clear it.
 As to my memory, if the lock bit of IA32_FEATURE_CONTROL is set, this
 MSR cannot be set or reset. So what do you mean setting it from
 userspace?
 It can be reset by power cycling CPU obviously, so when userspace emulates
 power cycle reset it has to be able to zero the MSR. BTW you need to write
 corespondent QEMU patch to handle this MSR during reset and migration.
What is the current call trace when QEMU does system_reset? Will it
trigger vmx_vcpu_reset() in vmx.c?

 
return 1;
  + }
/*
 * No need to treat VMX capability MSRs specially: If we don't 
  handle
 * them, handle_wrmsr will #GP(0), which is correct (they are 
  readonly)
  @@ -5595,6 +5603,15 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
skip_emulated_instruction(vcpu);
return 1;
}
  +
  +#define VMXON_NEEDED_FEATURES \
  Avi asked to use const u64 here.
 
  +   (FEATURE_CONTROL_LOCKED | 
  FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
  + if ((vmx-nested.msr_ia32_feature_control  VMXON_NEEDED_FEATURES)
  + != VMXON_NEEDED_FEATURES) {
  + kvm_inject_gp(vcpu, 0);
  + return 1;
  + }
  +
if (enable_shadow_vmcs) {
shadow_vmcs = alloc_vmcs();
if (!shadow_vmcs)
  diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
  index e8ba99c..2d4eb8e 100644
  --- a/arch/x86/kvm/x86.c
  +++ b/arch/x86/kvm/x86.c
  @@ -850,7 +850,8 @@ static u32 msrs_to_save[] = {
   #ifdef CONFIG_X86_64
MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
   #endif
  - MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
  + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
  + MSR_IA32_FEATURE_CONTROL
   };
 
   static unsigned num_msrs_to_save;
  --
  1.7.9.5
 
  --
  Gleb.

 --
 Gleb.



--
Arthur Chunqi Li
Department of Computer Science
School of EECS
Peking University
Beijing, China
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2] KVM: nVMX: Fix read/write to MSR_IA32_FEATURE_CONTROL

2013-07-07 Thread Gleb Natapov
On Sun, Jul 07, 2013 at 04:47:05PM +0800, Arthur Chunqi Li wrote:
 On Sun, Jul 7, 2013 at 3:20 PM, Gleb Natapov g...@redhat.com wrote:
  On Sun, Jul 07, 2013 at 03:16:08PM +0800, Arthur Chunqi Li wrote:
  On Sun, Jul 7, 2013 at 2:59 PM, Gleb Natapov g...@redhat.com wrote:
   On Fri, Jul 05, 2013 at 11:13:17PM +0800, Arthur Chunqi Li wrote:
   Fix read/write to IA32_FEATURE_CONTROL MSR in nested environment.
  
   This patch simulate this MSR in nested_vmx and the default value is
   0x0. BIOS should set it to 0x5 before VMXON. After setting the lock
   bit, write to it will cause #GP(0).
  
   Signed-off-by: Arthur Chunqi Li yzt...@gmail.com
   The patch is heavily based on Nadav's one. You need to preserve
   authorship. Add From: and Nadav's Sign-off-by.
  
   ---
arch/x86/kvm/vmx.c |   25 +
arch/x86/kvm/x86.c |3 ++-
2 files changed, 23 insertions(+), 5 deletions(-)
  
   diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
   index 260a919..5e3d44e 100644
   --- a/arch/x86/kvm/vmx.c
   +++ b/arch/x86/kvm/vmx.c
   @@ -373,6 +373,7 @@ struct nested_vmx {
  * we must keep them pinned while L2 runs.
  */
 struct page *apic_access_page;
   + u64 msr_ia32_feature_control;
};
  
#define POSTED_INTR_ON  0
   @@ -2277,8 +2278,11 @@ static int vmx_get_vmx_msr(struct kvm_vcpu 
   *vcpu, u32 msr_index, u64 *pdata)
  
 switch (msr_index) {
 case MSR_IA32_FEATURE_CONTROL:
   - *pdata = 0;
   - break;
   + if (nested_vmx_allowed(vcpu)){
   + *pdata = 
   to_vmx(vcpu)-nested.msr_ia32_feature_control;
   + break;
   + }
   + return 0;
 case MSR_IA32_VMX_BASIC:
 /*
  * This MSR reports some information about VMX support. We
   @@ -2356,9 +2360,13 @@ static int vmx_set_vmx_msr(struct kvm_vcpu 
   *vcpu, u32 msr_index, u64 data)
 if (!nested_vmx_allowed(vcpu))
 return 0;
  
   - if (msr_index == MSR_IA32_FEATURE_CONTROL)
   - /* TODO: the right thing. */
   + if (msr_index == MSR_IA32_FEATURE_CONTROL){
   + if (to_vmx(vcpu)-nested.msr_ia32_feature_control
   +  FEATURE_CONTROL_LOCKED)
   + return 0;
   + to_vmx(vcpu)-nested.msr_ia32_feature_control = data;
   You need to allow setting it from userspace, otherwise reset will not
   clear it.
  As to my memory, if the lock bit of IA32_FEATURE_CONTROL is set, this
  MSR cannot be set or reset. So what do you mean setting it from
  userspace?
  It can be reset by power cycling CPU obviously, so when userspace emulates
  power cycle reset it has to be able to zero the MSR. BTW you need to write
  corespondent QEMU patch to handle this MSR during reset and migration.
 What is the current call trace when QEMU do system_reset? Will it
 trigger vmx_vcpu_reset() in vmx.c?
It will not. QEMU inits vcpu state in userspace and writes it into the
kernel with the set_regs/set_sregs ioctls.

 
  
 return 1;
   + }
 /*
  * No need to treat VMX capability MSRs specially: If we don't 
   handle
  * them, handle_wrmsr will #GP(0), which is correct (they are 
   readonly)
   @@ -5595,6 +5603,15 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 skip_emulated_instruction(vcpu);
 return 1;
 }
   +
   +#define VMXON_NEEDED_FEATURES \
   Avi asked to use const u64 here.
  
   +   (FEATURE_CONTROL_LOCKED | 
   FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
   + if ((vmx-nested.msr_ia32_feature_control  VMXON_NEEDED_FEATURES)
   + != VMXON_NEEDED_FEATURES) {
   + kvm_inject_gp(vcpu, 0);
   + return 1;
   + }
   +
 if (enable_shadow_vmcs) {
 shadow_vmcs = alloc_vmcs();
 if (!shadow_vmcs)
   diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
   index e8ba99c..2d4eb8e 100644
   --- a/arch/x86/kvm/x86.c
   +++ b/arch/x86/kvm/x86.c
   @@ -850,7 +850,8 @@ static u32 msrs_to_save[] = {
#ifdef CONFIG_X86_64
 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
   - MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
   + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
   + MSR_IA32_FEATURE_CONTROL
};
  
static unsigned num_msrs_to_save;
   --
   1.7.9.5
  
   --
   Gleb.
 
  --
  Gleb.
 
 
 
 --
 Arthur Chunqi Li
 Department of Computer Science
 School of EECS
 Peking University
 Beijing, China

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2] KVM: nVMX: Fix read/write to MSR_IA32_FEATURE_CONTROL

2013-07-07 Thread Arthur Chunqi Li
On Sun, Jul 7, 2013 at 4:50 PM, Gleb Natapov g...@redhat.com wrote:
 On Sun, Jul 07, 2013 at 04:47:05PM +0800, Arthur Chunqi Li wrote:
 On Sun, Jul 7, 2013 at 3:20 PM, Gleb Natapov g...@redhat.com wrote:
  On Sun, Jul 07, 2013 at 03:16:08PM +0800, Arthur Chunqi Li wrote:
  On Sun, Jul 7, 2013 at 2:59 PM, Gleb Natapov g...@redhat.com wrote:
   On Fri, Jul 05, 2013 at 11:13:17PM +0800, Arthur Chunqi Li wrote:
   Fix read/write to IA32_FEATURE_CONTROL MSR in nested environment.
  
   This patch simulate this MSR in nested_vmx and the default value is
   0x0. BIOS should set it to 0x5 before VMXON. After setting the lock
   bit, write to it will cause #GP(0).
  
   Signed-off-by: Arthur Chunqi Li yzt...@gmail.com
   The patch is heavily based on Nadav's one. You need to preserve
   authorship. Add From: and Nadav's Sign-off-by.
  
   ---
arch/x86/kvm/vmx.c |   25 +
arch/x86/kvm/x86.c |3 ++-
2 files changed, 23 insertions(+), 5 deletions(-)
  
   diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
   index 260a919..5e3d44e 100644
   --- a/arch/x86/kvm/vmx.c
   +++ b/arch/x86/kvm/vmx.c
   @@ -373,6 +373,7 @@ struct nested_vmx {
  * we must keep them pinned while L2 runs.
  */
 struct page *apic_access_page;
   + u64 msr_ia32_feature_control;
};
  
#define POSTED_INTR_ON  0
   @@ -2277,8 +2278,11 @@ static int vmx_get_vmx_msr(struct kvm_vcpu 
   *vcpu, u32 msr_index, u64 *pdata)
  
 switch (msr_index) {
 case MSR_IA32_FEATURE_CONTROL:
   - *pdata = 0;
   - break;
   + if (nested_vmx_allowed(vcpu)){
   + *pdata = 
   to_vmx(vcpu)-nested.msr_ia32_feature_control;
   + break;
   + }
   + return 0;
 case MSR_IA32_VMX_BASIC:
 /*
  * This MSR reports some information about VMX support. 
   We
   @@ -2356,9 +2360,13 @@ static int vmx_set_vmx_msr(struct kvm_vcpu 
   *vcpu, u32 msr_index, u64 data)
 if (!nested_vmx_allowed(vcpu))
 return 0;
  
   - if (msr_index == MSR_IA32_FEATURE_CONTROL)
   - /* TODO: the right thing. */
   + if (msr_index == MSR_IA32_FEATURE_CONTROL){
   + if (to_vmx(vcpu)-nested.msr_ia32_feature_control
   +  FEATURE_CONTROL_LOCKED)
   + return 0;
   + to_vmx(vcpu)-nested.msr_ia32_feature_control = data;
   You need to allow setting it from userspace, otherwise reset will not
   clear it.
  As to my memory, if the lock bit of IA32_FEATURE_CONTROL is set, this
  MSR cannot be set or reset. So what do you mean setting it from
  userspace?
  It can be reset by power cycling CPU obviously, so when userspace emulates
  power cycle reset it has to be able to zero the MSR. BTW you need to write
  corespondent QEMU patch to handle this MSR during reset and migration.
 What is the current call trace when QEMU do system_reset? Will it
 trigger vmx_vcpu_reset() in vmx.c?
 It will not. QEMU inits vcpu state in userspace and write it into the
 kernel with set_regs/set_srges ioctls.
So how will this achieve isolation and protection? I mean, if QEMU can
reset this MSR on reboot, can a user in the guest OS also reset it? Actually,
after the lock bit is set, the MSR cannot be written until the next reboot.

 
  
 return 1;
   + }
 /*
  * No need to treat VMX capability MSRs specially: If we don't 
   handle
  * them, handle_wrmsr will #GP(0), which is correct (they are 
   readonly)
   @@ -5595,6 +5603,15 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 skip_emulated_instruction(vcpu);
 return 1;
 }
   +
   +#define VMXON_NEEDED_FEATURES \
   Avi asked to use const u64 here.
  
   +   (FEATURE_CONTROL_LOCKED | 
   FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
   + if ((vmx-nested.msr_ia32_feature_control  
   VMXON_NEEDED_FEATURES)
   + != VMXON_NEEDED_FEATURES) {
   + kvm_inject_gp(vcpu, 0);
   + return 1;
   + }
   +
 if (enable_shadow_vmcs) {
 shadow_vmcs = alloc_vmcs();
 if (!shadow_vmcs)
   diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
   index e8ba99c..2d4eb8e 100644
   --- a/arch/x86/kvm/x86.c
   +++ b/arch/x86/kvm/x86.c
   @@ -850,7 +850,8 @@ static u32 msrs_to_save[] = {
#ifdef CONFIG_X86_64
 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
   - MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
   + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
   + MSR_IA32_FEATURE_CONTROL
};
  
static unsigned num_msrs_to_save;
   --
   1.7.9.5
  
   --
   Gleb.
 
  --
  Gleb.



 --
 Arthur Chunqi Li
 Department of Computer Science
 School of EECS
 Peking University
 Beijing, China

 --
 Gleb.
--

Re: [PATCH v2] KVM: nVMX: Fix read/write to MSR_IA32_FEATURE_CONTROL

2013-07-07 Thread Gleb Natapov
On Sun, Jul 07, 2013 at 05:17:00PM +0800, Arthur Chunqi Li wrote:
 On Sun, Jul 7, 2013 at 4:50 PM, Gleb Natapov g...@redhat.com wrote:
  On Sun, Jul 07, 2013 at 04:47:05PM +0800, Arthur Chunqi Li wrote:
  On Sun, Jul 7, 2013 at 3:20 PM, Gleb Natapov g...@redhat.com wrote:
   On Sun, Jul 07, 2013 at 03:16:08PM +0800, Arthur Chunqi Li wrote:
   On Sun, Jul 7, 2013 at 2:59 PM, Gleb Natapov g...@redhat.com wrote:
On Fri, Jul 05, 2013 at 11:13:17PM +0800, Arthur Chunqi Li wrote:
Fix read/write to IA32_FEATURE_CONTROL MSR in nested environment.
   
This patch simulate this MSR in nested_vmx and the default value is
0x0. BIOS should set it to 0x5 before VMXON. After setting the lock
bit, write to it will cause #GP(0).
   
Signed-off-by: Arthur Chunqi Li yzt...@gmail.com
The patch is heavily based on Nadav's one. You need to preserve
authorship. Add From: and Nadav's Sign-off-by.
   
---
 arch/x86/kvm/vmx.c |   25 +
 arch/x86/kvm/x86.c |3 ++-
 2 files changed, 23 insertions(+), 5 deletions(-)
   
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 260a919..5e3d44e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -373,6 +373,7 @@ struct nested_vmx {
   * we must keep them pinned while L2 runs.
   */
  struct page *apic_access_page;
+ u64 msr_ia32_feature_control;
 };
   
 #define POSTED_INTR_ON  0
@@ -2277,8 +2278,11 @@ static int vmx_get_vmx_msr(struct kvm_vcpu 
*vcpu, u32 msr_index, u64 *pdata)
   
  switch (msr_index) {
  case MSR_IA32_FEATURE_CONTROL:
- *pdata = 0;
- break;
+ if (nested_vmx_allowed(vcpu)){
+ *pdata = 
to_vmx(vcpu)-nested.msr_ia32_feature_control;
+ break;
+ }
+ return 0;
  case MSR_IA32_VMX_BASIC:
  /*
   * This MSR reports some information about VMX 
support. We
@@ -2356,9 +2360,13 @@ static int vmx_set_vmx_msr(struct kvm_vcpu 
*vcpu, u32 msr_index, u64 data)
  if (!nested_vmx_allowed(vcpu))
  return 0;
   
- if (msr_index == MSR_IA32_FEATURE_CONTROL)
- /* TODO: the right thing. */
+ if (msr_index == MSR_IA32_FEATURE_CONTROL){
+ if (to_vmx(vcpu)-nested.msr_ia32_feature_control
+  FEATURE_CONTROL_LOCKED)
+ return 0;
+ to_vmx(vcpu)-nested.msr_ia32_feature_control = data;
You need to allow setting it from userspace, otherwise reset will not
clear it.
   As to my memory, if the lock bit of IA32_FEATURE_CONTROL is set, this
   MSR cannot be set or reset. So what do you mean setting it from
   userspace?
   It can be reset by power cycling CPU obviously, so when userspace 
   emulates
   power cycle reset it has to be able to zero the MSR. BTW you need to 
   write
   corespondent QEMU patch to handle this MSR during reset and migration.
  What is the current call trace when QEMU do system_reset? Will it
  trigger vmx_vcpu_reset() in vmx.c?
  It will not. QEMU inits vcpu state in userspace and write it into the
  kernel with set_regs/set_srges ioctls.
 So how will this achieve isolation and protection? I mean if QEMU can
 reset this MSR when reboot, can user in OS also reset it? Actually,
 after setting to lock bit, it cannot be set until next reboot.
There is a way to tell if msr write is done by QEMU or a guest OS.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/4] PF: Move architecture specifics to the backends

2013-07-07 Thread Gleb Natapov
On Fri, Jul 05, 2013 at 10:55:52PM +0200, Dominik Dingel wrote:
 Current common codes uses PAGE_OFFSET to indicate a bad host virtual address.
 As this check won't work on architectures that don't map kernel and user 
 memory
 into the same address space (e.g. s390), it is moved into architcture specific
 code.
 
 Signed-off-by: Dominik Dingel din...@linux.vnet.ibm.com
 ---
  arch/arm/include/asm/kvm_host.h |  8 
  arch/ia64/include/asm/kvm_host.h|  3 +++
  arch/mips/include/asm/kvm_host.h|  6 ++
  arch/powerpc/include/asm/kvm_host.h |  8 
  arch/s390/include/asm/kvm_host.h| 12 
  arch/x86/include/asm/kvm_host.h |  8 
  include/linux/kvm_host.h|  8 
  7 files changed, 45 insertions(+), 8 deletions(-)
 
 diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
 index 7d22517..557c2a1 100644
 --- a/arch/arm/include/asm/kvm_host.h
 +++ b/arch/arm/include/asm/kvm_host.h
 @@ -74,6 +74,14 @@ struct kvm_arch {
   struct vgic_distvgic;
  };
  
 +#define KVM_HVA_ERR_BAD  (PAGE_OFFSET)
 +#define KVM_HVA_ERR_RO_BAD   (PAGE_OFFSET + PAGE_SIZE)
 +
 +static inline bool kvm_is_error_hva(unsigned long addr)
 +{
 + return addr = PAGE_OFFSET;
 +}
 +
Instead of changing every arch I prefer to add
#ifndef KVM_HVA_ERR_BAD
#endif
around this code in the common header and define different version for
s390 only.

A question below.

  #define KVM_NR_MEM_OBJS 40
  
  /*
 diff --git a/arch/ia64/include/asm/kvm_host.h 
 b/arch/ia64/include/asm/kvm_host.h
 index 989dd3f..d3afa6f 100644
 --- a/arch/ia64/include/asm/kvm_host.h
 +++ b/arch/ia64/include/asm/kvm_host.h
 @@ -486,6 +486,9 @@ struct kvm_arch {
   unsigned long irq_states[KVM_IOAPIC_NUM_PINS];
  };
  
 +#define KVM_HVA_ERR_BAD  (PAGE_OFFSET)
 +#define KVM_HVA_ERR_RO_BAD   (PAGE_OFFSET + PAGE_SIZE)
 +
  union cpuid3_t {
   u64 value;
   struct {
 diff --git a/arch/mips/include/asm/kvm_host.h 
 b/arch/mips/include/asm/kvm_host.h
 index 4d6fa0b..3a0a3f7 100644
 --- a/arch/mips/include/asm/kvm_host.h
 +++ b/arch/mips/include/asm/kvm_host.h
 @@ -34,7 +34,13 @@
  #define KVM_NR_PAGE_SIZES1
  #define KVM_PAGES_PER_HPAGE(x)   1
  
 +#define KVM_HVA_ERR_BAD  (PAGE_OFFSET)
 +#define KVM_HVA_ERR_RO_BAD   (PAGE_OFFSET + PAGE_SIZE)
  
 +static inline bool kvm_is_error_hva(unsigned long addr)
 +{
 + return addr = PAGE_OFFSET;
 +}
  
  /* Special address that contains the comm page, used for reducing # of traps 
 */
  #define KVM_GUEST_COMMPAGE_ADDR 0x0
 diff --git a/arch/powerpc/include/asm/kvm_host.h 
 b/arch/powerpc/include/asm/kvm_host.h
 index af326cd..be5d7f4 100644
 --- a/arch/powerpc/include/asm/kvm_host.h
 +++ b/arch/powerpc/include/asm/kvm_host.h
 @@ -273,6 +273,14 @@ struct kvm_arch {
  #endif
  };
  
 +#define KVM_HVA_ERR_BAD  (PAGE_OFFSET)
 +#define KVM_HVA_ERR_RO_BAD   (PAGE_OFFSET + PAGE_SIZE)
 +
 +static inline bool kvm_is_error_hva(unsigned long addr)
 +{
 + return addr = PAGE_OFFSET;
 +}
 +
  /*
   * Struct for a virtual core.
   * Note: entry_exit_count combines an entry count in the bottom 8 bits
 diff --git a/arch/s390/include/asm/kvm_host.h 
 b/arch/s390/include/asm/kvm_host.h
 index 3238d40..152 100644
 --- a/arch/s390/include/asm/kvm_host.h
 +++ b/arch/s390/include/asm/kvm_host.h
 @@ -274,6 +274,18 @@ struct kvm_arch{
   int css_support;
  };
  
 +#define KVM_HVA_ERR_BAD  (-1UL)
 +#define KVM_HVA_ERR_RO_BAD   (-1UL)
 +
 +static inline bool kvm_is_error_hva(unsigned long addr)
 +{
 + /*
 +  * on s390, this check is not needed as kernel and user memory
 +  * is not mapped into the same address space
 +  */
 + return false;
 +}
 +
Can gfn_to_hva() ever return error hva on S390? Currently error hva is
returned if gfn is outside of any slot or slot is invalid.

  extern int sie64a(struct kvm_s390_sie_block *, u64 *);
  extern char sie_exit;
  #endif
 diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
 index f87f7fc..07e8570 100644
 --- a/arch/x86/include/asm/kvm_host.h
 +++ b/arch/x86/include/asm/kvm_host.h
 @@ -96,6 +96,14 @@
  
  #define ASYNC_PF_PER_VCPU 64
  
 +#define KVM_HVA_ERR_BAD  (PAGE_OFFSET)
 +#define KVM_HVA_ERR_RO_BAD   (PAGE_OFFSET + PAGE_SIZE)
 +
 +static inline bool kvm_is_error_hva(unsigned long addr)
 +{
 + return addr = PAGE_OFFSET;
 +}
 +
  struct kvm_vcpu;
  struct kvm;
  struct kvm_async_pf;
 diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
 index a63d83e..210f493 100644
 --- a/include/linux/kvm_host.h
 +++ b/include/linux/kvm_host.h
 @@ -85,14 +85,6 @@ static inline bool is_noslot_pfn(pfn_t pfn)
   return pfn == KVM_PFN_NOSLOT;
  }
  
 -#define KVM_HVA_ERR_BAD  (PAGE_OFFSET)
 -#define KVM_HVA_ERR_RO_BAD   (PAGE_OFFSET + PAGE_SIZE)
 -
 -static inline bool kvm_is_error_hva(unsigned long addr)
 -{
 - return addr = PAGE_OFFSET;
 -}
 -
 

Re: [PATCH 3/4] PF: Provide additional direct page notification

2013-07-07 Thread Gleb Natapov
On Fri, Jul 05, 2013 at 10:55:53PM +0200, Dominik Dingel wrote:
 By setting a Kconfig option, the architecture can control when
 guest notifications will be presented by the apf backend.
 So there is the default batch mechanism, working as before, where the vcpu 
 thread
 should pull in this information. On the other hand there is now the direct
 mechanism, this will directly push the information to the guest.
 
 Still the vcpu thread should call check_completion to cleanup leftovers,
 that leaves most of the common code untouched.
 
 Signed-off-by: Dominik Dingel din...@linux.vnet.ibm.com
 ---
  arch/x86/kvm/mmu.c   |  2 +-
  include/linux/kvm_host.h |  2 +-
  virt/kvm/Kconfig |  4 
  virt/kvm/async_pf.c  | 22 +++---
  4 files changed, 25 insertions(+), 5 deletions(-)
 
 diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
 index 0d094da..b8632e9 100644
 --- a/arch/x86/kvm/mmu.c
 +++ b/arch/x86/kvm/mmu.c
 @@ -3343,7 +3343,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu 
 *vcpu, gva_t gva, gfn_t gfn)
   arch.direct_map = vcpu-arch.mmu.direct_map;
   arch.cr3 = vcpu-arch.mmu.get_cr3(vcpu);
  
 - return kvm_setup_async_pf(vcpu, gva, gfn, arch);
 + return kvm_setup_async_pf(vcpu, gva, gfn_to_hva(vcpu-kvm, gfn), arch);
  }
  
  static bool can_do_async_pf(struct kvm_vcpu *vcpu)
 diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
 index 210f493..969d575 100644
 --- a/include/linux/kvm_host.h
 +++ b/include/linux/kvm_host.h
 @@ -175,7 +175,7 @@ struct kvm_async_pf {
  
  void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu);
  void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu);
 -int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
 +int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva,
  struct kvm_arch_async_pf *arch);
  int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
  #endif
 diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
 index 779262f..715e6b5 100644
 --- a/virt/kvm/Kconfig
 +++ b/virt/kvm/Kconfig
 @@ -22,6 +22,10 @@ config KVM_MMIO
  config KVM_ASYNC_PF
 bool
  
 +# Toggle to switch between direct notification and batch job
 +config KVM_ASYNC_PF_DIRECT
 +   bool
 +
  config HAVE_KVM_MSI
 bool
  
 diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
 index ea475cd..b8df37a 100644
 --- a/virt/kvm/async_pf.c
 +++ b/virt/kvm/async_pf.c
 @@ -28,6 +28,21 @@
  #include async_pf.h
  #include trace/events/kvm.h
  
 +static inline void kvm_async_page_direct_present(struct kvm_vcpu *vcpu,
 +  struct kvm_async_pf *work)
 +{
 +#ifdef CONFIG_KVM_ASYNC_PF_DIRECT
 + kvm_arch_async_page_present(vcpu, work);
 +#endif
 +}
 +static inline void kvm_async_page_batch_present(struct kvm_vcpu *vcpu,
 + struct kvm_async_pf *work)
 +{
 +#ifndef CONFIG_KVM_ASYNC_PF_DIRECT
 + kvm_arch_async_page_present(vcpu, work);
 +#endif
 +}
 +
I would call them kvm_async_page_present_(async|sync)().

Hmm, too much sync in each function name, but I still think it is
better.

  static struct kmem_cache *async_pf_cache;
  
  int kvm_async_pf_init(void)
 @@ -70,6 +85,7 @@ static void async_pf_execute(struct work_struct *work)
   down_read(mm-mmap_sem);
   get_user_pages(current, mm, addr, 1, 1, 0, page, NULL);
   up_read(mm-mmap_sem);
 + kvm_async_page_direct_present(vcpu, apf);
   unuse_mm(mm);
  
   spin_lock(vcpu-async_pf.lock);
 @@ -134,7 +150,7 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
  
   if (work-page)
   kvm_arch_async_page_ready(vcpu, work);
 - kvm_arch_async_page_present(vcpu, work);
 + kvm_async_page_batch_present(vcpu, work);
  
   list_del(work-queue);
   vcpu-async_pf.queued--;
 @@ -144,7 +160,7 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
   }
  }
  
 -int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
 +int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva,
  struct kvm_arch_async_pf *arch)
  {
   struct kvm_async_pf *work;
 @@ -166,7 +182,7 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, 
 gfn_t gfn,
   work-done = false;
   work-vcpu = vcpu;
   work-gva = gva;
 - work-addr = gfn_to_hva(vcpu-kvm, gfn);
 + work-addr = hva;
   work-arch = *arch;
   work-mm = current-mm;
   atomic_inc(work-mm-mm_count);
 -- 
 1.8.2.2

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Bug 60518] Heavy network traffic between guest and host triggers kernel oops

2013-07-07 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=60518

--- Comment #2 from Bart Van Assche bvanass...@acm.org ---
Note: so far I haven't been able to trigger this issue with kernel 3.8.12 nor
with kernel 3.10.

-- 
You are receiving this mail because:
You are watching the assignee of the bug.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCHv3] vhost-net: fix use-after-free in vhost_net_flush

2013-07-07 Thread Michael S. Tsirkin
vhost_net_ubuf_put_and_wait has a confusing name:
it will actually also free its argument.
Thus since commit 1280c27f8e29acf4af2da914e80ec27c3dbd5c01
vhost-net: flush outstanding DMAs on memory change
vhost_net_flush tries to use the argument after passing it
to vhost_net_ubuf_put_and_wait, this results
in use after free.
To fix, don't free the argument in vhost_net_ubuf_put_and_wait,
add a new API for callers that want to free ubufs.

Acked-by: Asias He as...@redhat.com
Acked-by: Jason Wang jasow...@redhat.com
Signed-off-by: Michael S. Tsirkin m...@redhat.com
---

Please review, and queue for 3.11 and stable.

Changes from v2:
- rebased to net/master
Changes from v1:
- tweak commit message

 drivers/vhost/net.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index f80d3dd..8ca5ac7 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -150,6 +150,11 @@ static void vhost_net_ubuf_put_and_wait(struct 
vhost_net_ubuf_ref *ubufs)
 {
kref_put(ubufs-kref, vhost_net_zerocopy_done_signal);
wait_event(ubufs-wait, !atomic_read(ubufs-kref.refcount));
+}
+
+static void vhost_net_ubuf_put_wait_and_free(struct vhost_net_ubuf_ref *ubufs)
+{
+   vhost_net_ubuf_put_and_wait(ubufs);
kfree(ubufs);
 }
 
@@ -948,7 +953,7 @@ static long vhost_net_set_backend(struct vhost_net *n, 
unsigned index, int fd)
mutex_unlock(vq-mutex);
 
if (oldubufs) {
-   vhost_net_ubuf_put_and_wait(oldubufs);
+   vhost_net_ubuf_put_wait_and_free(oldubufs);
mutex_lock(vq-mutex);
vhost_zerocopy_signal_used(n, vq);
mutex_unlock(vq-mutex);
@@ -966,7 +971,7 @@ err_used:
rcu_assign_pointer(vq-private_data, oldsock);
vhost_net_enable_vq(n, vq);
if (ubufs)
-   vhost_net_ubuf_put_and_wait(ubufs);
+   vhost_net_ubuf_put_wait_and_free(ubufs);
 err_ubufs:
fput(sock-file);
 err_vq:
-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Bug 60505] Heavy network traffic triggers vhost_net lockup

2013-07-07 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=60505

--- Comment #3 from Michael S. Tsirkin m.s.tsir...@gmail.com ---
Also.
I just posted a patch fixing a bug in this function.
[PATCHv3] vhost-net: fix use-after-free in vhost_net_flush
could you please try with this patch applied?

-- 
You are receiving this mail because:
You are watching the assignee of the bug.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] vhost: Avoid that vhost_work_flush() locks up

2013-07-07 Thread Michael S. Tsirkin
On Fri, Jul 05, 2013 at 10:36:46AM +0200, Bart Van Assche wrote:
 Wake up work-done waiters even if the TIF_NEED_RESCHED task flag
 has been set. This patch fixes a regression introduced in commit
 d550dda (kernel v3.4).
 
 Signed-off-by: Bart Van Assche bvanass...@acm.org
 Reference: https://bugzilla.kernel.org/show_bug.cgi?id=60505
 Cc: Michael S. Tsirkin m...@redhat.com
 Cc: Asias He as...@redhat.com
 Cc: Nadav Har'El n...@math.technion.ac.il
 Cc: Abel Gordon ab...@il.ibm.com
 Cc: sta...@vger.kernel.org # v3.4+

I just posted a patch fixing a bug in this function.
[PATCHv3] vhost-net: fix use-after-free in vhost_net_flush
could you please try with this patch applied?

 ---
  drivers/vhost/vhost.c |   10 +-
  1 file changed, 9 insertions(+), 1 deletion(-)
 
 diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
 index 60aa5ad..cd544ae 100644
 --- a/drivers/vhost/vhost.c
 +++ b/drivers/vhost/vhost.c
 @@ -227,8 +227,16 @@ static int vhost_worker(void *data)
   if (work) {
   __set_current_state(TASK_RUNNING);
   work-fn(work);
 - if (need_resched())
 + if (need_resched()) {
 + spin_lock_irq(dev-work_lock);
 + work-done_seq = seq;
 + if (work-flushing)
 + wake_up_all(work-done);
 + spin_unlock_irq(dev-work_lock);
 +
 + work = NULL;
   schedule();
 + }
   } else
   schedule();
  
 -- 
 1.7.10.4
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 03/11] vhost: Make vhost a separate module

2013-07-07 Thread Michael S. Tsirkin
On Mon, May 06, 2013 at 08:10:03PM +0800, Asias He wrote:
 On Mon, May 06, 2013 at 01:03:42PM +0300, Michael S. Tsirkin wrote:
  On Mon, May 06, 2013 at 04:38:21PM +0800, Asias He wrote:
   Currently, vhost-net and vhost-scsi are sharing the vhost core code.
   However, vhost-scsi shares the code by including the vhost.c file
   directly.
   
   Making vhost a separate module makes it is easier to share code with
   other vhost devices.
   
   Signed-off-by: Asias He as...@redhat.com
  
  Also this will break test.c, right? Let's fix it in the same
  commit too.
 
 I will fix it up and remove the useless 'return'.

Don't see v3 anywhere?

   ---
drivers/vhost/Kconfig  |  8 
drivers/vhost/Makefile |  3 ++-
drivers/vhost/scsi.c   |  1 -
drivers/vhost/vhost.c  | 51 
   +-
drivers/vhost/vhost.h  |  2 ++
5 files changed, 62 insertions(+), 3 deletions(-)
   
   diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
   index 8b9226d..017a1e8 100644
   --- a/drivers/vhost/Kconfig
   +++ b/drivers/vhost/Kconfig
   @@ -1,6 +1,7 @@
config VHOST_NET
 tristate Host kernel accelerator for virtio net
 depends on NET  EVENTFD  (TUN || !TUN)  (MACVTAP || !MACVTAP)
   + select VHOST
 select VHOST_RING
 ---help---
   This kernel module can be loaded in host kernel to accelerate
   @@ -13,6 +14,7 @@ config VHOST_NET
config VHOST_SCSI
 tristate VHOST_SCSI TCM fabric driver
 depends on TARGET_CORE  EVENTFD  m
   + select VHOST
 select VHOST_RING
 default n
 ---help---
   @@ -24,3 +26,9 @@ config VHOST_RING
 ---help---
   This option is selected by any driver which needs to access
   the host side of a virtio ring.
   +
   +config VHOST
   + tristate
   + ---help---
   +   This option is selected by any driver which needs to access
   +   the core of vhost.
   diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
   index 654e9afb..e0441c3 100644
   --- a/drivers/vhost/Makefile
   +++ b/drivers/vhost/Makefile
   @@ -1,7 +1,8 @@
obj-$(CONFIG_VHOST_NET) += vhost_net.o
   -vhost_net-y := vhost.o net.o
   +vhost_net-y := net.o

obj-$(CONFIG_VHOST_SCSI) += vhost_scsi.o
vhost_scsi-y := scsi.o

obj-$(CONFIG_VHOST_RING) += vringh.o
   +obj-$(CONFIG_VHOST)  += vhost.o
   diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
   index 5179f7a..2dcb94a 100644
   --- a/drivers/vhost/scsi.c
   +++ b/drivers/vhost/scsi.c
   @@ -49,7 +49,6 @@
#include linux/llist.h
#include linux/bitmap.h

   -#include vhost.c
#include vhost.h

#define TCM_VHOST_VERSION  v0.1
   diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
   index de9441a..e406d5f 100644
   --- a/drivers/vhost/vhost.c
   +++ b/drivers/vhost/vhost.c
   @@ -25,6 +25,7 @@
#include linux/slab.h
#include linux/kthread.h
#include linux/cgroup.h
   +#include linux/module.h

#include vhost.h

   @@ -66,6 +67,7 @@ void vhost_work_init(struct vhost_work *work, 
   vhost_work_fn_t fn)
 work-flushing = 0;
 work-queue_seq = work-done_seq = 0;
}
   +EXPORT_SYMBOL_GPL(vhost_work_init);

/* Init poll structure */
void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
   @@ -79,6 +81,7 @@ void vhost_poll_init(struct vhost_poll *poll, 
   vhost_work_fn_t fn,

 vhost_work_init(poll-work, fn);
}
   +EXPORT_SYMBOL_GPL(vhost_poll_init);

/* Start polling a file. We add ourselves to file's wait queue. The 
   caller must
 * keep a reference to a file until after vhost_poll_stop is called. */
   @@ -101,6 +104,7 @@ int vhost_poll_start(struct vhost_poll *poll, struct 
   file *file)

 return ret;
}
   +EXPORT_SYMBOL_GPL(vhost_poll_start);

/* Stop polling a file. After this function returns, it becomes safe to 
   drop the
 * file reference. You must also flush afterwards. */
   @@ -111,6 +115,7 @@ void vhost_poll_stop(struct vhost_poll *poll)
 poll-wqh = NULL;
 }
}
   +EXPORT_SYMBOL_GPL(vhost_poll_stop);

static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work 
   *work,
 unsigned seq)
   @@ -123,7 +128,7 @@ static bool vhost_work_seq_done(struct vhost_dev 
   *dev, struct vhost_work *work,
 return left = 0;
}

   -static void vhost_work_flush(struct vhost_dev *dev, struct vhost_work 
   *work)
   +void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work)
{
 unsigned seq;
 int flushing;
   @@ -138,6 +143,7 @@ static void vhost_work_flush(struct vhost_dev *dev, 
   struct vhost_work *work)
 spin_unlock_irq(dev-work_lock);
 BUG_ON(flushing  0);
}
   +EXPORT_SYMBOL_GPL(vhost_work_flush);

/* Flush any work that has been scheduled. When calling this, don't hold 
   any
 * locks that are also used by the callback. */
   @@ -145,6 +151,7 @@ void vhost_poll_flush(struct 

[PATCH] virtio: include asm/barrier explicitly

2013-07-07 Thread Michael S. Tsirkin
virtio_ring.h uses mb() and friends, make
it pull in asm/barrier.h itself, not rely
on other headers to do it.

Signed-off-by: Michael S. Tsirkin m...@redhat.com
---
 include/linux/virtio_ring.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
index ca3ad41..b300787 100644
--- a/include/linux/virtio_ring.h
+++ b/include/linux/virtio_ring.h
@@ -1,6 +1,7 @@
 #ifndef _LINUX_VIRTIO_RING_H
 #define _LINUX_VIRTIO_RING_H
 
+#include asm/barrier.h
 #include linux/irqreturn.h
 #include uapi/linux/virtio_ring.h
 
-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] virtio: include asm/barrier explicitly

2013-07-07 Thread Michael S. Tsirkin
On Sun, Jul 07, 2013 at 05:20:19PM +0300, Michael S. Tsirkin wrote:
 virtio_ring.h uses mb() and friends, make
 it pull in asm/barrier.h itself, not rely
 on other headers to do it.
 
 Signed-off-by: Michael S. Tsirkin m...@redhat.com

In particular this fixes warnings during test build:
[linux]$ make -C tools/virtio/ 
make: Entering directory `/home/mst/scm/linux/tools/virtio'
cc -g -O2 -Wall -I. -I ../../usr/include/ -Wno-pointer-sign
-fno-strict-overflow -fno-strict-aliasing -fno-common -MMD
-U_FORTIFY_SOURCE   -c -o virtio_test.o virtio_test.c
In file included from ./linux/virtio_ring.h:1:0,
 from ../../usr/include/linux/vhost.h:17,
 from virtio_test.c:14:
./linux/../../../include/linux/virtio_ring.h: In function ‘virtio_mb’:
./linux/../../../include/linux/virtio_ring.h:50:2: warning: implicit
declaration of function ‘mb’ [-Wimplicit-function-declaration]


 ---
  include/linux/virtio_ring.h | 1 +
  1 file changed, 1 insertion(+)
 
 diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
 index ca3ad41..b300787 100644
 --- a/include/linux/virtio_ring.h
 +++ b/include/linux/virtio_ring.h
 @@ -1,6 +1,7 @@
  #ifndef _LINUX_VIRTIO_RING_H
  #define _LINUX_VIRTIO_RING_H
  
 +#include asm/barrier.h
  #include linux/irqreturn.h
  #include uapi/linux/virtio_ring.h
  
 -- 
 MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] tools/virtio: move module license stub to module.h

2013-07-07 Thread Michael S. Tsirkin
This fixes build for the vringh test:
[linux]$ make -C tools/virtio/
make: Entering directory `/home/mst/scm/linux/tools/virtio'
cc -g -O2 -Wall -I. -I ../../usr/include/ -Wno-pointer-sign
-fno-strict-overflow -fno-strict-aliasing -fno-common -MMD
-U_FORTIFY_SOURCE   -c -o vringh.o ../../drivers/vhost/vringh.c
../../drivers/vhost/vringh.c:1010:16: error: expected declaration
specifiers or ‘...’ before string constant

Signed-off-by: Michael S. Tsirkin m...@redhat.com
---
 tools/virtio/linux/module.h | 5 +
 tools/virtio/linux/virtio.h | 3 ---
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/tools/virtio/linux/module.h b/tools/virtio/linux/module.h
index 3039a7e..28ce95a 100644
--- a/tools/virtio/linux/module.h
+++ b/tools/virtio/linux/module.h
@@ -1 +1,6 @@
 #include linux/export.h
+
+#define MODULE_LICENSE(__MODULE_LICENSE_value) \
+   static __attribute__((unused)) const char *__MODULE_LICENSE_name = \
+   __MODULE_LICENSE_value
+
diff --git a/tools/virtio/linux/virtio.h b/tools/virtio/linux/virtio.h
index cd80183..8447830 100644
--- a/tools/virtio/linux/virtio.h
+++ b/tools/virtio/linux/virtio.h
@@ -45,9 +45,6 @@ struct virtqueue {
void *priv;
 };
 
-#define MODULE_LICENSE(__MODULE_LICENSE_value) \
-   const char *__MODULE_LICENSE_name = __MODULE_LICENSE_value
-
 /* Interfaces exported by virtio_ring. */
 int virtqueue_add_sgs(struct virtqueue *vq,
  struct scatterlist *sgs[],
-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH] uio: uio_pci_generic: Add support for MSI interrupts

2013-07-07 Thread Guenter Roeck
On Sun, Jul 07, 2013 at 11:15:11AM +0300, Michael S. Tsirkin wrote:
 On Thu, Jul 04, 2013 at 09:35:00AM -0700, Guenter Roeck wrote:
  On Thu, Jul 04, 2013 at 05:34:12PM +0300, Michael S. Tsirkin wrote:
   On Thu, Jul 04, 2013 at 07:25:23AM -0700, Guenter Roeck wrote:
On Thu, Jul 04, 2013 at 10:20:23AM +0300, Michael S. Tsirkin wrote:
 On Thu, Jun 27, 2013 at 10:00:52AM -0700, Guenter Roeck wrote:
  On Thu, Jun 27, 2013 at 10:45:01AM +0300, Michael S. Tsirkin wrote:
   On Wed, Jun 26, 2013 at 03:30:23PM -0700, Guenter Roeck wrote:
Enable support for MSI interrupts if the device supports it.
Since MSI interrupts are edge triggered, it is no longer 
necessary to
disable interrupts in the kernel and re-enable them from 
user-space.
Instead, clearing the interrupt condition in the user space 
application
automatically re-enables the interrupt.

Signed-off-by: Guenter Roeck li...@roeck-us.net
---
An open question is if we can just do this unconditionally
or if there should be some flag to enable it. A module 
parameter, maybe ?
   
   NACK
   
   UIO is for devices that don't do memory writes.
   Anything that can do writes must be protected by an IOMMU
   and/or have a secure kernel driver, not a UIO stub.
   
   MSI is done by memory writes so if userspace
   controls the device it can trick it to write
   anywhere in memory.
   
  Just out of curiosity: Since MSI support is mandatory for all PCIE 
  devices,
  isn't that possible anyway, even if MSI is not enabled by the 
  kernel ?
  All one would need to do is to enable MSI from user space; after 
  all,
  the chip configuration space is writable.
  
  Thanks,
  Guenter
 
 If a device has capability to do writes, sure. So don't do this then 
 :)
 
Not an option. I need to use MSI.

Not that it matters anymore - turns out it was better writing a 
specific driver
for my devices anyway; I needed to be able to disable chip interrupts 
before
unloading the driver. But why is it then a reason to NACK this patch ?
   
   There seem to be two cases - either you can't access the device -
   and the uio driver is not useful - or you can, and it's not safe.
   In both cases the patch does not seem to bring about anything
   except user confusion ...
   
  Sounds like claiming that supporting MSI would cause some kind of confusion.
  Not sure if I can follow that logic.
  
  Actually, it simplifies the user space code a lot. Since MSI interrupts are 
  edge
  triggered and queued, it is not necessary to disable the interrupts, and all
  user space has to do is to remove the interrupt reason. Works quite nicely 
  for
  our devices.
  
  As I said, I don't really care too much anymore if this patch is rejected, 
  but
  the reasons for the rejection are kind of weak.
  
Besides, doesn't one have to be root anyway to perform such activities,
which could then be more easily accomplished by writing into /dev/mem ?

Thanks,
Guenter
   
   root might not be able to write into /dev/mem.
   
  I don't really want to try, but at least it is marked rw for root.
  
  Guenter
 
 Yes but mechanisms such as selinux can still block it.
 
I hope it can also prevent the use of the the MSI trick on PCIe devices.
After all, the user space mechanisms to write into configuration space
are available for all devices, not just for those with uio drivers.

Guenter
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 03/11] vhost: Make vhost a separate module

2013-07-07 Thread Michael S. Tsirkin
On Sun, Jul 07, 2013 at 02:37:10PM +0300, Michael S. Tsirkin wrote:
 On Mon, May 06, 2013 at 08:10:03PM +0800, Asias He wrote:
  On Mon, May 06, 2013 at 01:03:42PM +0300, Michael S. Tsirkin wrote:
   On Mon, May 06, 2013 at 04:38:21PM +0800, Asias He wrote:
Currently, vhost-net and vhost-scsi are sharing the vhost core code.
However, vhost-scsi shares the code by including the vhost.c file
directly.

Making vhost a separate module makes it is easier to share code with
other vhost devices.

Signed-off-by: Asias He as...@redhat.com
   
   Also this will break test.c, right? Let's fix it in the same
   commit too.
  
  I will fix it up and remove the useless 'return'.
 
 Don't see v3 anywhere?

I did these tweaks, you can see the result on the vhost
branch in my tree.

---
 drivers/vhost/Kconfig  |  8 
 drivers/vhost/Makefile |  3 ++-
 drivers/vhost/scsi.c   |  1 -
 drivers/vhost/vhost.c  | 51 
+-
 drivers/vhost/vhost.h  |  2 ++
 5 files changed, 62 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index 8b9226d..017a1e8 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -1,6 +1,7 @@
 config VHOST_NET
tristate Host kernel accelerator for virtio net
depends on NET  EVENTFD  (TUN || !TUN)  (MACVTAP || 
!MACVTAP)
+   select VHOST
select VHOST_RING
---help---
  This kernel module can be loaded in host kernel to accelerate
@@ -13,6 +14,7 @@ config VHOST_NET
 config VHOST_SCSI
tristate VHOST_SCSI TCM fabric driver
depends on TARGET_CORE  EVENTFD  m
+   select VHOST
select VHOST_RING
default n
---help---
@@ -24,3 +26,9 @@ config VHOST_RING
---help---
  This option is selected by any driver which needs to access
  the host side of a virtio ring.
+
+config VHOST
+   tristate
+   ---help---
+ This option is selected by any driver which needs to access
+ the core of vhost.
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index 654e9afb..e0441c3 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -1,7 +1,8 @@
 obj-$(CONFIG_VHOST_NET) += vhost_net.o
-vhost_net-y := vhost.o net.o
+vhost_net-y := net.o
 
 obj-$(CONFIG_VHOST_SCSI) += vhost_scsi.o
 vhost_scsi-y := scsi.o
 
 obj-$(CONFIG_VHOST_RING) += vringh.o
+obj-$(CONFIG_VHOST)+= vhost.o
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 5179f7a..2dcb94a 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -49,7 +49,6 @@
 #include linux/llist.h
 #include linux/bitmap.h
 
-#include vhost.c
 #include vhost.h
 
 #define TCM_VHOST_VERSION  v0.1
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index de9441a..e406d5f 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -25,6 +25,7 @@
 #include linux/slab.h
 #include linux/kthread.h
 #include linux/cgroup.h
+#include linux/module.h
 
 #include vhost.h
 
@@ -66,6 +67,7 @@ void vhost_work_init(struct vhost_work *work, 
vhost_work_fn_t fn)
work-flushing = 0;
work-queue_seq = work-done_seq = 0;
 }
+EXPORT_SYMBOL_GPL(vhost_work_init);
 
 /* Init poll structure */
 void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
@@ -79,6 +81,7 @@ void vhost_poll_init(struct vhost_poll *poll, 
vhost_work_fn_t fn,
 
vhost_work_init(poll-work, fn);
 }
+EXPORT_SYMBOL_GPL(vhost_poll_init);
 
 /* Start polling a file. We add ourselves to file's wait queue. The 
caller must
  * keep a reference to a file until after vhost_poll_stop is called. */
@@ -101,6 +104,7 @@ int vhost_poll_start(struct vhost_poll *poll, 
struct file *file)
 
return ret;
 }
+EXPORT_SYMBOL_GPL(vhost_poll_start);
 
 /* Stop polling a file. After this function returns, it becomes safe 
to drop the
  * file reference. You must also flush afterwards. */
@@ -111,6 +115,7 @@ void vhost_poll_stop(struct vhost_poll *poll)
poll-wqh = NULL;
}
 }
+EXPORT_SYMBOL_GPL(vhost_poll_stop);
 
 static bool vhost_work_seq_done(struct vhost_dev *dev, struct 
vhost_work *work,
unsigned seq)
@@ -123,7 +128,7 @@ static bool vhost_work_seq_done(struct vhost_dev 
*dev, struct vhost_work *work,
return left = 0;
 }
 
-static void vhost_work_flush(struct vhost_dev *dev, struct vhost_work 
*work)
+void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work)
   

Re: [RFC PATCH] uio: uio_pci_generic: Add support for MSI interrupts

2013-07-07 Thread Michael S. Tsirkin
On Sun, Jul 07, 2013 at 07:39:01AM -0700, Guenter Roeck wrote:
 On Sun, Jul 07, 2013 at 11:15:11AM +0300, Michael S. Tsirkin wrote:
  On Thu, Jul 04, 2013 at 09:35:00AM -0700, Guenter Roeck wrote:
   On Thu, Jul 04, 2013 at 05:34:12PM +0300, Michael S. Tsirkin wrote:
On Thu, Jul 04, 2013 at 07:25:23AM -0700, Guenter Roeck wrote:
 On Thu, Jul 04, 2013 at 10:20:23AM +0300, Michael S. Tsirkin wrote:
  On Thu, Jun 27, 2013 at 10:00:52AM -0700, Guenter Roeck wrote:
   On Thu, Jun 27, 2013 at 10:45:01AM +0300, Michael S. Tsirkin 
   wrote:
On Wed, Jun 26, 2013 at 03:30:23PM -0700, Guenter Roeck wrote:
 Enable support for MSI interrupts if the device supports it.
 Since MSI interrupts are edge triggered, it is no longer 
 necessary to
 disable interrupts in the kernel and re-enable them from 
 user-space.
 Instead, clearing the interrupt condition in the user space 
 application
 automatically re-enables the interrupt.
 
 Signed-off-by: Guenter Roeck li...@roeck-us.net
 ---
 An open question is if we can just do this unconditionally
 or if there should be some flag to enable it. A module 
 parameter, maybe ?

NACK

UIO is for devices that don't do memory writes.
Anything that can do writes must be protected by an IOMMU
and/or have a secure kernel driver, not a UIO stub.

MSI is done by memory writes so if userspace
controls the device it can trick it to write
anywhere in memory.

   Just out of curiosity: Since MSI support is mandatory for all 
   PCIE devices,
   isn't that possible anyway, even if MSI is not enabled by the 
   kernel ?
   All one would need to do is to enable MSI from user space; after 
   all,
   the chip configuration space is writable.
   
   Thanks,
   Guenter
  
  If a device has capability to do writes, sure. So don't do this 
  then :)
  
 Not an option. I need to use MSI.
 
 Not that it matters anymore - turns out it was better writing a 
 specific driver
 for my devices anyway; I needed to be able to disable chip interrupts 
 before
 unloading the driver. But why is it then a reason to NACK this patch ?

There seem to be two cases - either you can't access the device -
and the uio driver is not useful - or you can, and it's not safe.
In both cases the patch does not seem to bring about anything
except user confusion ...

   Sounds like claiming that supporting MSI would cause some kind of 
   confusion.
   Not sure if I can follow that logic.
   
   Actually, it simplifies the user space code a lot. Since MSI interrupts 
   are edge
   triggered and queued, it is not necessary to disable the interrupts, and 
   all
   user space has to do is to remove the interrupt reason. Works quite 
   nicely for
   our devices.
   
   As I said, I don't really care too much anymore if this patch is 
   rejected, but
   the reasons for the rejection are kind of weak.
   
 Besides, doesn't one have to be root anyway to perform such 
 activities,
 which could then be more easily accomplished by writing into /dev/mem 
 ?
 
 Thanks,
 Guenter

root might not be able to write into /dev/mem.

   I don't really want to try, but at least it is marked rw for root.
   
   Guenter
  
  Yes but mechanisms such as selinux can still block it.
  
 I hope it can also prevent the use of the the MSI trick on PCIe devices.
 After all, the user space mechanisms to write into configuration space
 are available for all devices, not just for those with uio drivers.
 
 Guenter

Yes, selinux can also prevent access to sysfs and procfs for any process.

-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3] KVM: nVMX: Fix read/write to MSR_IA32_FEATURE_CONTROL

2013-07-07 Thread Arthur Chunqi Li
Fix read/write to IA32_FEATURE_CONTROL MSR in nested environment.

This patch simulates this MSR in nested_vmx; the default value is
0x0. BIOS should set it to 0x5 before VMXON. After the lock bit is
set, writes to it will cause #GP(0).

Another QEMU patch is also needed to handle emulation of reset
and migration. A vCPU reset should clear this MSR, and migration
should preserve its value.

This patch is based on Nadav's previous commit.
http://permalink.gmane.org/gmane.comp.emulators.kvm.devel/88478

Signed-off-by: Nadav Har'El nyh at il.ibm.com
Signed-off-by: Arthur Chunqi Li yzt...@gmail.com
---
 arch/x86/kvm/vmx.c |   32 ++--
 arch/x86/kvm/x86.c |3 ++-
 2 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a7e1855..a64efd0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -373,6 +373,7 @@ struct nested_vmx {
 * we must keep them pinned while L2 runs.
 */
struct page *apic_access_page;
+   u64 msr_ia32_feature_control;
 };
 
 #define POSTED_INTR_ON  0
@@ -2282,8 +2283,11 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 
msr_index, u64 *pdata)
 
switch (msr_index) {
case MSR_IA32_FEATURE_CONTROL:
-   *pdata = 0;
-   break;
+   if (nested_vmx_allowed(vcpu)){
+   *pdata = to_vmx(vcpu)-nested.msr_ia32_feature_control;
+   break;
+   }
+   return 0;
case MSR_IA32_VMX_BASIC:
/*
 * This MSR reports some information about VMX support. We
@@ -2356,14 +2360,21 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 
msr_index, u64 *pdata)
return 1;
 }
 
-static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
+   u32 msr_index = msr_info-index;
+   u64 data = msr_info-data;
+   bool host_initialized = msr_info-host_initiated;
if (!nested_vmx_allowed(vcpu))
return 0;
 
-   if (msr_index == MSR_IA32_FEATURE_CONTROL)
-   /* TODO: the right thing. */
+   if (msr_index == MSR_IA32_FEATURE_CONTROL){
+   if (!host_initialized  
to_vmx(vcpu)-nested.msr_ia32_feature_control
+FEATURE_CONTROL_LOCKED)
+   return 0;
+   to_vmx(vcpu)-nested.msr_ia32_feature_control = data;
return 1;
+   }
/*
 * No need to treat VMX capability MSRs specially: If we don't handle
 * them, handle_wrmsr will #GP(0), which is correct (they are readonly)
@@ -2494,7 +2505,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct 
msr_data *msr_info)
return 1;
/* Otherwise falls through */
default:
-   if (vmx_set_vmx_msr(vcpu, msr_index, data))
+   if (vmx_set_vmx_msr(vcpu, msr_info))
break;
msr = find_msr_entry(vmx, msr_index);
if (msr) {
@@ -5576,6 +5587,8 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
struct kvm_segment cs;
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs *shadow_vmcs;
+   const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
+   | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
 
/* The Intel VMX Instruction Reference lists a bunch of bits that
 * are prerequisite to running VMXON, most notably cr4.VMXE must be
@@ -5604,6 +5617,13 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
skip_emulated_instruction(vcpu);
return 1;
}
+
+   if ((vmx-nested.msr_ia32_feature_control  VMXON_NEEDED_FEATURES)
+   != VMXON_NEEDED_FEATURES) {
+   kvm_inject_gp(vcpu, 0);
+   return 1;
+   }
+
if (enable_shadow_vmcs) {
shadow_vmcs = alloc_vmcs();
if (!shadow_vmcs)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d21bce5..cff77c4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -850,7 +850,8 @@ static u32 msrs_to_save[] = {
 #ifdef CONFIG_X86_64
MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 #endif
-   MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
+   MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
+   MSR_IA32_FEATURE_CONTROL
 };
 
 static unsigned num_msrs_to_save;
-- 
1.7.9.5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[[Qemu-devel] [PATCH]] nVMX: Initialize IA32_FEATURE_CONTROL MSR in reset and migration

2013-07-07 Thread Arthur Chunqi Li
The recent KVM patch adds IA32_FEATURE_CONTROL support. QEMU needs
to clear this MSR when resetting the vCPU and keep its value across
migration. This patch adds this feature.

Signed-off-by: Arthur Chunqi Li yzt...@gmail.com
---
 target-i386/cpu.h |2 ++
 target-i386/kvm.c |4 
 target-i386/machine.c |   22 ++
 3 files changed, 28 insertions(+)

diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index 62e3547..a418e17 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -301,6 +301,7 @@
 #define MSR_IA32_APICBASE_BSP   (18)
 #define MSR_IA32_APICBASE_ENABLE(111)
 #define MSR_IA32_APICBASE_BASE  (0xf12)
+#define MSR_IA32_FEATURE_CONTROL0x003a
 #define MSR_TSC_ADJUST  0x003b
 #define MSR_IA32_TSCDEADLINE0x6e0
 
@@ -813,6 +814,7 @@ typedef struct CPUX86State {
 
 uint64_t mcg_status;
 uint64_t msr_ia32_misc_enable;
+uint64_t msr_ia32_feature_control;
 
 /* exception/interrupt handling */
 int error_code;
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 39f4fbb..3cb2161 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -1122,6 +1122,7 @@ static int kvm_put_msrs(X86CPU *cpu, int level)
 if (hyperv_vapic_recommended()) {
 kvm_msr_entry_set(msrs[n++], HV_X64_MSR_APIC_ASSIST_PAGE, 0);
 }
+kvm_msr_entry_set(msrs[n++], MSR_IA32_FEATURE_CONTROL, 
env-msr_ia32_feature_control);
 }
 if (env-mcg_cap) {
 int i;
@@ -1346,6 +1347,7 @@ static int kvm_get_msrs(X86CPU *cpu)
 if (has_msr_misc_enable) {
 msrs[n++].index = MSR_IA32_MISC_ENABLE;
 }
+msrs[n++].index = MSR_IA32_FEATURE_CONTROL;
 
 if (!env-tsc_valid) {
 msrs[n++].index = MSR_IA32_TSC;
@@ -1444,6 +1446,8 @@ static int kvm_get_msrs(X86CPU *cpu)
 case MSR_IA32_MISC_ENABLE:
 env-msr_ia32_misc_enable = msrs[i].data;
 break;
+case MSR_IA32_FEATURE_CONTROL:
+env-msr_ia32_feature_control = msrs[i].data;
 default:
 if (msrs[i].index = MSR_MC0_CTL 
 msrs[i].index  MSR_MC0_CTL + (env-mcg_cap  0xff) * 4) {
diff --git a/target-i386/machine.c b/target-i386/machine.c
index 3659db9..94ca914 100644
--- a/target-i386/machine.c
+++ b/target-i386/machine.c
@@ -399,6 +399,14 @@ static bool misc_enable_needed(void *opaque)
 return env-msr_ia32_misc_enable != MSR_IA32_MISC_ENABLE_DEFAULT;
 }
 
+static bool feature_control_needed(void *opaque)
+{
+X86CPU *cpu = opaque;
+CPUX86State *env = cpu-env;
+
+return env-msr_ia32_feature_control != 0;
+}
+
 static const VMStateDescription vmstate_msr_ia32_misc_enable = {
 .name = cpu/msr_ia32_misc_enable,
 .version_id = 1,
@@ -410,6 +418,17 @@ static const VMStateDescription 
vmstate_msr_ia32_misc_enable = {
 }
 };
 
+static const VMStateDescription vmstate_msr_ia32_feature_control = {
+.name = cpu/msr_ia32_feature_control,
+.version_id = 1,
+.minimum_version_id = 1,
+.minimum_version_id_old = 1,
+.fields  = (VMStateField []) {
+VMSTATE_UINT64(env.msr_ia32_feature_control, X86CPU),
+VMSTATE_END_OF_LIST()
+}
+};
+
 const VMStateDescription vmstate_x86_cpu = {
 .name = cpu,
 .version_id = 12,
@@ -535,6 +554,9 @@ const VMStateDescription vmstate_x86_cpu = {
 }, {
 .vmsd = vmstate_msr_ia32_misc_enable,
 .needed = misc_enable_needed,
+}, {
+.vmsd = vmstate_msr_ia32_feature_control,
+.needed = feature_control_needed,
 } , {
 /* empty */
 }
-- 
1.7.9.5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] kvm tools: fix boot of guests with more than 4gb of ram

2013-07-07 Thread Pekka Enberg
On Mon, Jun 24, 2013 at 1:06 PM, Michael Tokarev m...@tls.msk.ru wrote:
 24.06.2013 05:23, Sasha Levin wrote:
   queue   = p9dev-vqs[vq];
   queue-pfn  = pfn;
 - p   = guest_flat_to_host(kvm, queue-pfn * page_size);
 + p   = guest_flat_to_host(kvm, (u64)queue-pfn * page_size);

 Maybe it's worth to use a common function for this,
 something like guest_queue_to_host(kvm, queue) ?

Sasha, care to do something like this and resend the patch?
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2] kvm tools: fix boot of guests with more than 4gb of ram

2013-07-07 Thread Sasha Levin
Commit kvm tools: virtio: remove hardcoded assumptions
about guest page size has introduced a bug that prevented
guests with more than 4gb of ram from booting.

The issue is that 'pfn' is a 32-bit integer, so multiplying
it by the page size to get the actual page address will cause an
overflow if the pfn refers to a memory area above 4gb.

Signed-off-by: Sasha Levin sasha.le...@oracle.com
---
 tools/kvm/include/kvm/virtio.h | 6 ++
 tools/kvm/virtio/9p.c  | 2 +-
 tools/kvm/virtio/balloon.c | 2 +-
 tools/kvm/virtio/blk.c | 2 +-
 tools/kvm/virtio/console.c | 2 +-
 tools/kvm/virtio/net.c | 2 +-
 tools/kvm/virtio/rng.c | 2 +-
 tools/kvm/virtio/scsi.c| 2 +-
 8 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/tools/kvm/include/kvm/virtio.h b/tools/kvm/include/kvm/virtio.h
index 269ea4a..820b94a 100644
--- a/tools/kvm/include/kvm/virtio.h
+++ b/tools/kvm/include/kvm/virtio.h
@@ -90,4 +90,10 @@ int virtio_init(struct kvm *kvm, void *dev, struct 
virtio_device *vdev,
struct virtio_ops *ops, enum virtio_trans trans,
int device_id, int subsys_id, int class);
 int virtio_compat_add_message(const char *device, const char *config);
+
+static inline void *virtio_get_vq(struct kvm *kvm, u32 pfn, u32 page_size)
+{
+   return guest_flat_to_host(kvm, (u64)pfn * page_size);
+}
+
 #endif /* KVM__VIRTIO_H */
diff --git a/tools/kvm/virtio/9p.c b/tools/kvm/virtio/9p.c
index b7a5723..127b13a 100644
--- a/tools/kvm/virtio/9p.c
+++ b/tools/kvm/virtio/9p.c
@@ -1267,7 +1267,7 @@ static int init_vq(struct kvm *kvm, void *dev, u32 vq, 
u32 page_size, u32 align,
 
queue   = p9dev-vqs[vq];
queue-pfn  = pfn;
-   p   = guest_flat_to_host(kvm, queue-pfn * page_size);
+   p   = virtio_get_vq(kvm, queue-pfn, page_size);
job = p9dev-jobs[vq];
 
vring_init(queue-vring, VIRTQUEUE_NUM, p, align);
diff --git a/tools/kvm/virtio/balloon.c b/tools/kvm/virtio/balloon.c
index d1b64fa..4863535 100644
--- a/tools/kvm/virtio/balloon.c
+++ b/tools/kvm/virtio/balloon.c
@@ -204,7 +204,7 @@ static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 
page_size, u32 align,
 
queue   = bdev-vqs[vq];
queue-pfn  = pfn;
-   p   = guest_flat_to_host(kvm, queue-pfn * page_size);
+   p   = virtio_get_vq(kvm, queue-pfn, page_size);
 
thread_pool__init_job(bdev-jobs[vq], kvm, virtio_bln_do_io, queue);
vring_init(queue-vring, VIRTIO_BLN_QUEUE_SIZE, p, align);
diff --git a/tools/kvm/virtio/blk.c b/tools/kvm/virtio/blk.c
index 44ac44b..ab18716 100644
--- a/tools/kvm/virtio/blk.c
+++ b/tools/kvm/virtio/blk.c
@@ -167,7 +167,7 @@ static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 
page_size, u32 align,
 
queue   = bdev-vqs[vq];
queue-pfn  = pfn;
-   p   = guest_flat_to_host(kvm, queue-pfn * page_size);
+   p   = virtio_get_vq(kvm, queue-pfn, page_size);
 
vring_init(queue-vring, VIRTIO_BLK_QUEUE_SIZE, p, align);
 
diff --git a/tools/kvm/virtio/console.c b/tools/kvm/virtio/console.c
index 1f88a4b..83c58bf 100644
--- a/tools/kvm/virtio/console.c
+++ b/tools/kvm/virtio/console.c
@@ -137,7 +137,7 @@ static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 
page_size, u32 align,
 
queue   = cdev.vqs[vq];
queue-pfn  = pfn;
-   p   = guest_flat_to_host(kvm, queue-pfn * page_size);
+   p   = virtio_get_vq(kvm, queue-pfn, page_size);
 
vring_init(queue-vring, VIRTIO_CONSOLE_QUEUE_SIZE, p, align);
 
diff --git a/tools/kvm/virtio/net.c b/tools/kvm/virtio/net.c
index 7855cfc..298903c 100644
--- a/tools/kvm/virtio/net.c
+++ b/tools/kvm/virtio/net.c
@@ -410,7 +410,7 @@ static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 
page_size, u32 align,
 
queue   = ndev-vqs[vq];
queue-pfn  = pfn;
-   p   = guest_flat_to_host(kvm, queue-pfn * page_size);
+   p   = virtio_get_vq(kvm, queue-pfn, page_size);
 
vring_init(queue-vring, VIRTIO_NET_QUEUE_SIZE, p, align);
 
diff --git a/tools/kvm/virtio/rng.c b/tools/kvm/virtio/rng.c
index 2ce8afd..394de71 100644
--- a/tools/kvm/virtio/rng.c
+++ b/tools/kvm/virtio/rng.c
@@ -98,7 +98,7 @@ static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 
page_size, u32 align,
 
queue   = rdev-vqs[vq];
queue-pfn  = pfn;
-   p   = guest_flat_to_host(kvm, queue-pfn * page_size);
+   p   = virtio_get_vq(kvm, queue-pfn, page_size);
 
job = rdev-jobs[vq];
 
diff --git a/tools/kvm/virtio/scsi.c b/tools/kvm/virtio/scsi.c
index 05b2dc6..8da2420 100644
--- a/tools/kvm/virtio/scsi.c
+++ b/tools/kvm/virtio/scsi.c
@@ -63,7 +63,7 @@ static int init_vq(struct kvm *kvm, void *dev, u32 vq, u32 
page_size, u32 align,
 
queue   = 

[PATCH qom-cpu v2 10/42] kvm: Change kvm_{insert,remove}_breakpoint() argument to CPUState

2013-07-07 Thread Andreas Färber
CPUArchState is no longer directly used since converting CPU loops to
CPUState.

Prepares for changing GDBState::c_cpu to CPUState.

Signed-off-by: Andreas Färber afaer...@suse.de
---
 gdbstub.c| 12 
 include/sysemu/kvm.h |  4 ++--
 kvm-all.c| 10 --
 kvm-stub.c   |  4 ++--
 4 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/gdbstub.c b/gdbstub.c
index 17da380..b77cd3e 100644
--- a/gdbstub.c
+++ b/gdbstub.c
@@ -1954,8 +1954,10 @@ static int gdb_breakpoint_insert(target_ulong addr, 
target_ulong len, int type)
 CPUArchState *env;
 int err = 0;
 
-if (kvm_enabled())
-return kvm_insert_breakpoint(gdbserver_state-c_cpu, addr, len, type);
+if (kvm_enabled()) {
+return kvm_insert_breakpoint(ENV_GET_CPU(gdbserver_state-c_cpu),
+ addr, len, type);
+}
 
 switch (type) {
 case GDB_BREAKPOINT_SW:
@@ -1991,8 +1993,10 @@ static int gdb_breakpoint_remove(target_ulong addr, 
target_ulong len, int type)
 CPUArchState *env;
 int err = 0;
 
-if (kvm_enabled())
-return kvm_remove_breakpoint(gdbserver_state-c_cpu, addr, len, type);
+if (kvm_enabled()) {
+return kvm_remove_breakpoint(ENV_GET_CPU(gdbserver_state-c_cpu),
+ addr, len, type);
+}
 
 switch (type) {
 case GDB_BREAKPOINT_SW:
diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index 1e08a85..f8ac448 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -169,9 +169,9 @@ void *kvm_arch_ram_alloc(ram_addr_t size);
 void kvm_setup_guest_memory(void *start, size_t size);
 void kvm_flush_coalesced_mmio_buffer(void);
 
-int kvm_insert_breakpoint(CPUArchState *env, target_ulong addr,
+int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
   target_ulong len, int type);
-int kvm_remove_breakpoint(CPUArchState *env, target_ulong addr,
+int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
   target_ulong len, int type);
 void kvm_remove_all_breakpoints(CPUState *cpu);
 int kvm_update_guest_debug(CPUArchState *env, unsigned long reinject_trap);
diff --git a/kvm-all.c b/kvm-all.c
index 0c16a44..2263c48 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -1903,10 +1903,9 @@ int kvm_update_guest_debug(CPUArchState *env, unsigned 
long reinject_trap)
 return data.err;
 }
 
-int kvm_insert_breakpoint(CPUArchState *env, target_ulong addr,
+int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
   target_ulong len, int type)
 {
-CPUState *cpu = ENV_GET_CPU(env);
 struct kvm_sw_breakpoint *bp;
 int err;
 
@@ -1949,10 +1948,9 @@ int kvm_insert_breakpoint(CPUArchState *env, 
target_ulong addr,
 return 0;
 }
 
-int kvm_remove_breakpoint(CPUArchState *env, target_ulong addr,
+int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
   target_ulong len, int type)
 {
-CPUState *cpu = ENV_GET_CPU(env);
 struct kvm_sw_breakpoint *bp;
 int err;
 
@@ -2025,13 +2023,13 @@ int kvm_update_guest_debug(CPUArchState *env, unsigned 
long reinject_trap)
 return -EINVAL;
 }
 
-int kvm_insert_breakpoint(CPUArchState *env, target_ulong addr,
+int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
   target_ulong len, int type)
 {
 return -EINVAL;
 }
 
-int kvm_remove_breakpoint(CPUArchState *env, target_ulong addr,
+int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
   target_ulong len, int type)
 {
 return -EINVAL;
diff --git a/kvm-stub.c b/kvm-stub.c
index 370c837..7b2233a 100644
--- a/kvm-stub.c
+++ b/kvm-stub.c
@@ -83,13 +83,13 @@ int kvm_update_guest_debug(CPUArchState *env, unsigned long 
reinject_trap)
 return -ENOSYS;
 }
 
-int kvm_insert_breakpoint(CPUArchState *env, target_ulong addr,
+int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
   target_ulong len, int type)
 {
 return -EINVAL;
 }
 
-int kvm_remove_breakpoint(CPUArchState *env, target_ulong addr,
+int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
   target_ulong len, int type)
 {
 return -EINVAL;
-- 
1.8.1.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH qom-cpu v2 16/42] exec: Change cpu_memory_rw_debug() argument to CPUState

2013-07-07 Thread Andreas Färber
Propagate X86CPU in kvmvapic for simplicity.

Signed-off-by: Andreas Färber afaer...@suse.de
---
 cpus.c  |  4 +--
 disas.c |  4 +--
 exec.c  |  6 ++--
 gdbstub.c   |  2 +-
 hw/i386/kvmvapic.c  | 72 +++--
 include/exec/cpu-all.h  |  3 +-
 include/exec/softmmu-semi.h | 18 +++-
 monitor.c   |  2 +-
 target-arm/arm-semi.c   |  2 +-
 target-i386/helper.c|  8 +++--
 target-i386/kvm.c   | 14 -
 target-sparc/mmu_helper.c   |  5 ++--
 target-xtensa/xtensa-semi.c | 10 +++
 13 files changed, 77 insertions(+), 73 deletions(-)

diff --git a/cpus.c b/cpus.c
index 8a4f395..1ce6816 100644
--- a/cpus.c
+++ b/cpus.c
@@ -1247,7 +1247,6 @@ void qmp_memsave(int64_t addr, int64_t size, const char 
*filename,
 {
 FILE *f;
 uint32_t l;
-CPUArchState *env;
 CPUState *cpu;
 uint8_t buf[1024];
 
@@ -1261,7 +1260,6 @@ void qmp_memsave(int64_t addr, int64_t size, const char 
*filename,
   a CPU number);
 return;
 }
-env = cpu-env_ptr;
 
 f = fopen(filename, wb);
 if (!f) {
@@ -1273,7 +1271,7 @@ void qmp_memsave(int64_t addr, int64_t size, const char 
*filename,
 l = sizeof(buf);
 if (l  size)
 l = size;
-cpu_memory_rw_debug(env, addr, buf, l, 0);
+cpu_memory_rw_debug(cpu, addr, buf, l, 0);
 if (fwrite(buf, 1, l, f) != l) {
 error_set(errp, QERR_IO_ERROR);
 goto exit;
diff --git a/disas.c b/disas.c
index e51127e..71007fb 100644
--- a/disas.c
+++ b/disas.c
@@ -39,7 +39,7 @@ target_read_memory (bfd_vma memaddr,
 {
 CPUDebug *s = container_of(info, CPUDebug, info);
 
-cpu_memory_rw_debug(s-env, memaddr, myaddr, length, 0);
+cpu_memory_rw_debug(ENV_GET_CPU(s-env), memaddr, myaddr, length, 0);
 return 0;
 }
 
@@ -392,7 +392,7 @@ monitor_read_memory (bfd_vma memaddr, bfd_byte *myaddr, int 
length,
 if (monitor_disas_is_physical) {
 cpu_physical_memory_read(memaddr, myaddr, length);
 } else {
-cpu_memory_rw_debug(s-env, memaddr,myaddr, length, 0);
+cpu_memory_rw_debug(ENV_GET_CPU(s-env), memaddr, myaddr, length, 0);
 }
 return 0;
 }
diff --git a/exec.c b/exec.c
index a768aea..f3fe6e6 100644
--- a/exec.c
+++ b/exec.c
@@ -1840,7 +1840,7 @@ MemoryRegion *get_system_io(void)
 
 /* physical memory access (slow version, mainly for debug) */
 #if defined(CONFIG_USER_ONLY)
-int cpu_memory_rw_debug(CPUArchState *env, target_ulong addr,
+int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
 uint8_t *buf, int len, int is_write)
 {
 int l, flags;
@@ -2574,7 +2574,7 @@ void stq_be_phys(hwaddr addr, uint64_t val)
 }
 
 /* virtual memory access for debug (includes writing to ROM) */
-int cpu_memory_rw_debug(CPUArchState *env, target_ulong addr,
+int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
 uint8_t *buf, int len, int is_write)
 {
 int l;
@@ -2583,7 +2583,7 @@ int cpu_memory_rw_debug(CPUArchState *env, target_ulong 
addr,
 
 while (len  0) {
 page = addr  TARGET_PAGE_MASK;
-phys_addr = cpu_get_phys_page_debug(ENV_GET_CPU(env), page);
+phys_addr = cpu_get_phys_page_debug(cpu, page);
 /* if no physical page mapped, return an error */
 if (phys_addr == -1)
 return -1;
diff --git a/gdbstub.c b/gdbstub.c
index cee9c13..43ecc0d 100644
--- a/gdbstub.c
+++ b/gdbstub.c
@@ -46,7 +46,7 @@
 static inline int target_memory_rw_debug(CPUArchState *env, target_ulong addr,
  uint8_t *buf, int len, int is_write)
 {
-return cpu_memory_rw_debug(env, addr, buf, len, is_write);
+return cpu_memory_rw_debug(ENV_GET_CPU(env), addr, buf, len, is_write);
 }
 #else
 /* target_memory_rw_debug() defined in cpu.h */
diff --git a/hw/i386/kvmvapic.c b/hw/i386/kvmvapic.c
index 224601f..035d0fe 100644
--- a/hw/i386/kvmvapic.c
+++ b/hw/i386/kvmvapic.c
@@ -188,9 +188,10 @@ static bool opcode_matches(uint8_t *opcode, const 
TPRInstruction *instr)
  modrm_reg(opcode[1]) == instr-modrm_reg);
 }
 
-static int evaluate_tpr_instruction(VAPICROMState *s, CPUX86State *env,
+static int evaluate_tpr_instruction(VAPICROMState *s, X86CPU *cpu,
 target_ulong *pip, TPRAccess access)
 {
+CPUState *cs = CPU(cpu);
 const TPRInstruction *instr;
 target_ulong ip = *pip;
 uint8_t opcode[2];
@@ -211,7 +212,7 @@ static int evaluate_tpr_instruction(VAPICROMState *s, 
CPUX86State *env,
  * RSP, used by the patched instruction, is zero, so the guest gets a
  * double fault and dies.
  */
-if (env-regs[R_ESP] == 0) {
+if (cpu-env.regs[R_ESP] == 0) {
 return -1;
 }
 
@@ -226,7 +227,7 @@ static int evaluate_tpr_instruction(VAPICROMState *s, 
CPUX86State *env,
 if 

[PATCH qom-cpu v2 07/42] cpu: Move singlestep_enabled field from CPU_COMMON to CPUState

2013-07-07 Thread Andreas Färber
Prepares for changing cpu_single_step() argument to CPUState.

Acked-by: Michael Walle mich...@walle.cc (for lm32)
Signed-off-by: Andreas Färber afaer...@suse.de
---
 cpu-exec.c|  2 +-
 cpus.c|  2 +-
 exec.c| 10 ++
 include/exec/cpu-defs.h   |  1 -
 include/qom/cpu.h |  2 ++
 kvm-all.c |  2 +-
 target-alpha/translate.c  |  3 ++-
 target-arm/translate.c|  7 ---
 target-cris/translate.c   |  7 ---
 target-i386/kvm.c |  6 --
 target-i386/translate.c   |  5 +++--
 target-lm32/translate.c   |  7 ---
 target-m68k/translate.c   |  7 ---
 target-microblaze/translate.c |  8 +---
 target-mips/translate.c   | 11 +++
 target-moxie/translate.c  |  5 +++--
 target-openrisc/translate.c   |  7 ---
 target-ppc/translate.c|  8 +---
 target-s390x/translate.c  |  5 +++--
 target-sh4/translate.c|  8 +---
 target-sparc/translate.c  |  3 ++-
 target-unicore32/translate.c  |  7 ---
 target-xtensa/translate.c |  7 ---
 23 files changed, 78 insertions(+), 52 deletions(-)

diff --git a/cpu-exec.c b/cpu-exec.c
index 3fccb86..301be28 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -297,7 +297,7 @@ int cpu_exec(CPUArchState *env)
 for(;;) {
 interrupt_request = cpu-interrupt_request;
 if (unlikely(interrupt_request)) {
-if (unlikely(env-singlestep_enabled  SSTEP_NOIRQ)) {
+if (unlikely(cpu-singlestep_enabled  SSTEP_NOIRQ)) {
 /* Mask out external interrupts for this step. */
 interrupt_request = ~CPU_INTERRUPT_SSTEP_MASK;
 }
diff --git a/cpus.c b/cpus.c
index f141428..8a4f395 100644
--- a/cpus.c
+++ b/cpus.c
@@ -1148,7 +1148,7 @@ static void tcg_exec_all(void)
 CPUArchState *env = cpu-env_ptr;
 
 qemu_clock_enable(vm_clock,
-  (env-singlestep_enabled  SSTEP_NOTIMER) == 0);
+  (cpu-singlestep_enabled  SSTEP_NOTIMER) == 0);
 
 if (cpu_can_run(cpu)) {
 r = tcg_cpu_exec(env);
diff --git a/exec.c b/exec.c
index f01e3b6..ae6eb24 100644
--- a/exec.c
+++ b/exec.c
@@ -588,11 +588,13 @@ void cpu_breakpoint_remove_all(CPUArchState *env, int 
mask)
 void cpu_single_step(CPUArchState *env, int enabled)
 {
 #if defined(TARGET_HAS_ICE)
-if (env-singlestep_enabled != enabled) {
-env-singlestep_enabled = enabled;
-if (kvm_enabled())
+CPUState *cpu = ENV_GET_CPU(env);
+
+if (cpu-singlestep_enabled != enabled) {
+cpu-singlestep_enabled = enabled;
+if (kvm_enabled()) {
 kvm_update_guest_debug(env, 0);
-else {
+} else {
 /* must flush all the translated code to avoid inconsistencies */
 /* XXX: only flush what is necessary */
 tb_flush(env);
diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
index 39094b3..12b1ca7 100644
--- a/include/exec/cpu-defs.h
+++ b/include/exec/cpu-defs.h
@@ -170,7 +170,6 @@ typedef struct CPUWatchpoint {
 /* from this point: preserved by CPU reset */   \
 /* ice debug support */ \
 QTAILQ_HEAD(breakpoints_head, CPUBreakpoint) breakpoints;\
-int singlestep_enabled; \
 \
 QTAILQ_HEAD(watchpoints_head, CPUWatchpoint) watchpoints;\
 CPUWatchpoint *watchpoint_hit;  \
diff --git a/include/qom/cpu.h b/include/qom/cpu.h
index 152dad5..136482c 100644
--- a/include/qom/cpu.h
+++ b/include/qom/cpu.h
@@ -129,6 +129,7 @@ struct kvm_run;
  * @stopped: Indicates the CPU has been artificially stopped.
  * @tcg_exit_req: Set to force TCG to stop executing linked TBs for this
  *   CPU and return to its top level loop.
+ * @singlestep_enabled: Flags for single-stepping.
  * @env_ptr: Pointer to subclass-specific CPUArchState field.
  * @current_tb: Currently executing TB.
  * @next_cpu: Next CPU sharing TB cache.
@@ -161,6 +162,7 @@ struct CPUState {
 volatile sig_atomic_t exit_request;
 volatile sig_atomic_t tcg_exit_req;
 uint32_t interrupt_request;
+int singlestep_enabled;
 
 void *env_ptr; /* CPUArchState */
 struct TranslationBlock *current_tb;
diff --git a/kvm-all.c b/kvm-all.c
index c130705..0c16a44 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -1893,7 +1893,7 @@ int kvm_update_guest_debug(CPUArchState *env, unsigned 
long reinject_trap)
 
 data.dbg.control = reinject_trap;
 
-if (env-singlestep_enabled) {
+if (cpu-singlestep_enabled) {
 data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
 }
 kvm_arch_update_guest_debug(cpu, 

KVM call agenda for 2013-07-09

2013-07-07 Thread Juan Quintela

Hi

Please, send any topic that you are interested in covering.

Thanks, Juan.

PD.  If you want to attend and you don't have the call details,
  contact me.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] KVM call agenda for 2013-07-09

2013-07-07 Thread Andreas Färber
Am 08.07.2013 01:10, schrieb Juan Quintela:
 Please, send any topic that you are interested in covering.

Static qdev vs. dynamic QOM properties and -global
(Igor's topic from two weeks ago that we couldn't cover any more)
= needed for X86CPU subclasses (hot-add, topology, NUMA, ...)

Thanks,
Andreas

-- 
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany
GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer; HRB 16746 AG Nürnberg
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH qom-cpu v9] target-i386: Move hyperv_* static globals to X86CPU

2013-07-07 Thread Andreas Färber
From: Igor Mammedov imamm...@redhat.com

- since hyperv_* helper functions are used only in target-i386/kvm.c
  move them there as static helpers

Requested-by: Eduardo Habkost ehabk...@redhat.com
Signed-off-by: Igor Mammedov imamm...@redhat.com
Signed-off-by: Andreas Färber afaer...@suse.de
---
 v8 (imammedo) - v9:
 * Use X86CPU instead of CPUX86State (only used in KVM)
 * Changed helper functions to X86CPU argument
 * Moved field initialization to QOM instance_init
 * Fixed subject (not today's CPUState)

 target-i386/Makefile.objs |  2 +-
 target-i386/cpu-qom.h |  4 +++
 target-i386/cpu.c | 16 
 target-i386/cpu.h |  4 +++
 target-i386/hyperv.c  | 64 ---
 target-i386/hyperv.h  | 45 -
 target-i386/kvm.c | 36 ++
 7 files changed, 46 insertions(+), 125 deletions(-)
 delete mode 100644 target-i386/hyperv.c
 delete mode 100644 target-i386/hyperv.h

diff --git a/target-i386/Makefile.objs b/target-i386/Makefile.objs
index c1d4f05..887dca7 100644
--- a/target-i386/Makefile.objs
+++ b/target-i386/Makefile.objs
@@ -2,7 +2,7 @@ obj-y += translate.o helper.o cpu.o
 obj-y += excp_helper.o fpu_helper.o cc_helper.o int_helper.o svm_helper.o
 obj-y += smm_helper.o misc_helper.o mem_helper.o seg_helper.o
 obj-$(CONFIG_SOFTMMU) += machine.o arch_memory_mapping.o arch_dump.o
-obj-$(CONFIG_KVM) += kvm.o hyperv.o
+obj-$(CONFIG_KVM) += kvm.o
 obj-$(CONFIG_NO_KVM) += kvm-stub.o
 obj-$(CONFIG_LINUX_USER) += ioport-user.o
 obj-$(CONFIG_BSD_USER) += ioport-user.o
diff --git a/target-i386/cpu-qom.h b/target-i386/cpu-qom.h
index 7e55e5f..18f08b8 100644
--- a/target-i386/cpu-qom.h
+++ b/target-i386/cpu-qom.h
@@ -66,6 +66,10 @@ typedef struct X86CPU {
 
 CPUX86State env;
 
+bool hyperv_vapic;
+bool hyperv_relaxed_timing;
+int hyperv_spinlock_attempts;
+
 /* Features that were filtered out because of missing host capabilities */
 uint32_t filtered_features[FEATURE_WORDS];
 } X86CPU;
diff --git a/target-i386/cpu.c b/target-i386/cpu.c
index e3f75a8..14e9c7e 100644
--- a/target-i386/cpu.c
+++ b/target-i386/cpu.c
@@ -35,8 +35,6 @@
 #include qapi/visitor.h
 #include sysemu/arch_init.h
 
-#include hyperv.h
-
 #include hw/hw.h
 #if defined(CONFIG_KVM)
 #include linux/kvm_para.h
@@ -1587,12 +1585,19 @@ static void cpu_x86_parse_featurestr(X86CPU *cpu, char 
*features, Error **errp)
 object_property_parse(OBJECT(cpu), num, tsc-frequency, errp);
 } else if (!strcmp(featurestr, hv-spinlocks)) {
 char *err;
+const int min = 0xFFF;
 numvalue = strtoul(val, err, 0);
 if (!*val || *err) {
 error_setg(errp, bad numerical value %s, val);
 goto out;
 }
-hyperv_set_spinlock_retries(numvalue);
+if (numvalue < min) {
+fprintf(stderr, "hv-spinlocks value shall always be >= 
0x%x"
+", fixup will be removed in future versions\n",
+min);
+numvalue = min;
+}
+cpu-hyperv_spinlock_attempts = numvalue;
 } else {
 error_setg(errp, unrecognized feature %s, featurestr);
 goto out;
@@ -1602,9 +1607,9 @@ static void cpu_x86_parse_featurestr(X86CPU *cpu, char 
*features, Error **errp)
 } else if (!strcmp(featurestr, enforce)) {
 check_cpuid = enforce_cpuid = 1;
 } else if (!strcmp(featurestr, hv_relaxed)) {
-hyperv_enable_relaxed_timing(true);
+cpu-hyperv_relaxed_timing = true;
 } else if (!strcmp(featurestr, hv_vapic)) {
-hyperv_enable_vapic_recommended(true);
+cpu-hyperv_vapic = true;
 } else {
 error_setg(errp, feature string `%s' not in format (+feature|
-feature|feature=xyz), featurestr);
@@ -2479,6 +2484,7 @@ static void x86_cpu_initfn(Object *obj)
 x86_cpu_get_feature_words,
 NULL, NULL, (void *)cpu-filtered_features, NULL);
 
+cpu-hyperv_spinlock_attempts = HYPERV_SPINLOCK_NEVER_RETRY;
 env-cpuid_apic_id = x86_cpu_apic_id_from_index(cs-cpu_index);
 
 /* init various static tables used in TCG mode */
diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index 2d005b3..6c3eb86 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -549,6 +549,10 @@ typedef uint32_t FeatureWordArray[FEATURE_WORDS];
 #define CPUID_MWAIT_IBE (1 << 1) /* Interrupts can exit capability */
 #define CPUID_MWAIT_EMX (1 << 0) /* enumeration supported */
 
+#ifndef HYPERV_SPINLOCK_NEVER_RETRY
+#define HYPERV_SPINLOCK_NEVER_RETRY 0xFFFFFFFF
+#endif
+
 #define EXCP00_DIVZ0
 #define EXCP01_DB  1
 #define EXCP02_NMI 2
diff --git a/target-i386/hyperv.c 

Re: [PATCH 4/8] powerpc: Prepare to support kernel handling of IOMMU map/unmap

2013-07-07 Thread Benjamin Herrenschmidt
On Sun, 2013-07-07 at 01:07 +1000, Alexey Kardashevskiy wrote:
 The current VFIO-on-POWER implementation supports only user mode
 driven mapping, i.e. QEMU is sending requests to map/unmap pages.
 However this approach is really slow, so we want to move that to KVM.
 Since H_PUT_TCE can be extremely performance sensitive (especially with
 network adapters where each packet needs to be mapped/unmapped) we chose
 to implement that as a fast hypercall directly in real
 mode (processor still in the guest context but MMU off).
 
 To be able to do that, we need to provide some facilities to
 access the struct page count within that real mode environment as things
 like the sparsemem vmemmap mappings aren't accessible.
 
 This adds an API to increment/decrement page counter as
 get_user_pages API used for user mode mapping does not work
 in the real mode.
 
 CONFIG_SPARSEMEM_VMEMMAP and CONFIG_FLATMEM are supported.

This patch will need an ack from mm people to make sure they are ok
with our approach and ack the change to the generic header.

(Added linux-mm).

Cheers,
Ben.

 Reviewed-by: Paul Mackerras pau...@samba.org
 Signed-off-by: Paul Mackerras pau...@samba.org
 Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
 
 ---
 
 Changes:
 2013/06/27:
 * realmode_get_page() fixed to use get_page_unless_zero(). If failed,
 the call will be passed from real to virtual mode and safely handled.
 * added comment to PageCompound() in include/linux/page-flags.h.
 
 2013/05/20:
 * PageTail() is replaced by PageCompound() in order to have the same checks
 for whether the page is huge in realmode_get_page() and realmode_put_page()
 
 Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
 ---
  arch/powerpc/include/asm/pgtable-ppc64.h |  4 ++
  arch/powerpc/mm/init_64.c| 78 
 +++-
  include/linux/page-flags.h   |  4 +-
  3 files changed, 84 insertions(+), 2 deletions(-)
 
 diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h 
 b/arch/powerpc/include/asm/pgtable-ppc64.h
 index e3d55f6f..7b46e5f 100644
 --- a/arch/powerpc/include/asm/pgtable-ppc64.h
 +++ b/arch/powerpc/include/asm/pgtable-ppc64.h
 @@ -376,6 +376,10 @@ static inline pte_t *find_linux_pte_or_hugepte(pgd_t 
 *pgdir, unsigned long ea,
  }
  #endif /* !CONFIG_HUGETLB_PAGE */
  
 +struct page *realmode_pfn_to_page(unsigned long pfn);
 +int realmode_get_page(struct page *page);
 +int realmode_put_page(struct page *page);
 +
  #endif /* __ASSEMBLY__ */
  
  #endif /* _ASM_POWERPC_PGTABLE_PPC64_H_ */
 diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
 index a90b9c4..7031be3 100644
 --- a/arch/powerpc/mm/init_64.c
 +++ b/arch/powerpc/mm/init_64.c
 @@ -297,5 +297,81 @@ void vmemmap_free(unsigned long start, unsigned long end)
  {
  }
  
 -#endif /* CONFIG_SPARSEMEM_VMEMMAP */
 +/*
 + * We do not have access to the sparsemem vmemmap, so we fallback to
 + * walking the list of sparsemem blocks which we already maintain for
 + * the sake of crashdump. In the long run, we might want to maintain
 + * a tree if performance of that linear walk becomes a problem.
 + *
 + * Any of realmode_ functions can fail due to:
 + * 1) As real sparsemem blocks do not lie in RAM contiguously (they
 + * are in virtual address space which is not available in the real mode),
 + * the requested page struct can be split between blocks so get_page/put_page
 + * may fail.
 + * 2) When huge pages are used, the get_page/put_page API will fail
 + * in real mode as the linked addresses in the page struct are virtual
 + * too.
 + * When 1) or 2) takes place, the API returns an error code to cause
 + * an exit to kernel virtual mode where the operation will be completed.
 + */
 +struct page *realmode_pfn_to_page(unsigned long pfn)
 +{
 + struct vmemmap_backing *vmem_back;
 + struct page *page;
 + unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
 + unsigned long pg_va = (unsigned long) pfn_to_page(pfn);
  
 + for (vmem_back = vmemmap_list; vmem_back; vmem_back = vmem_back->list) {
 + if (pg_va < vmem_back->virt_addr)
 + continue;
 +
 + /* Check that page struct is not split between real pages */
 + if ((pg_va + sizeof(struct page)) >
 + (vmem_back->virt_addr + page_size))
 + return NULL;
 +
 + page = (struct page *) (vmem_back->phys + pg_va -
 + vmem_back->virt_addr);
 + return page;
 + }
 +
 + return NULL;
 +}
 +EXPORT_SYMBOL_GPL(realmode_pfn_to_page);
 +
 +#elif defined(CONFIG_FLATMEM)
 +
 +struct page *realmode_pfn_to_page(unsigned long pfn)
 +{
 + struct page *page = pfn_to_page(pfn);
 + return page;
 +}
 +EXPORT_SYMBOL_GPL(realmode_pfn_to_page);
 +
 +#endif /* CONFIG_SPARSEMEM_VMEMMAP/CONFIG_FLATMEM */
 +
 +#if defined(CONFIG_SPARSEMEM_VMEMMAP) || defined(CONFIG_FLATMEM)
 +int realmode_get_page(struct page *page)
 +{
 + 

Re: [PATCH] virtio: include asm/barrier explicitly

2013-07-07 Thread Rusty Russell
Michael S. Tsirkin m...@redhat.com writes:
 virtio_ring.h uses mb() and friends, make
 it pull in asm/barrier.h itself, not rely
 on other headers to do it.

 Signed-off-by: Michael S. Tsirkin m...@redhat.com

Applied.

Thanks,
Rusty.
PS.  I'll squeeze these into this merge window, since they're trivial.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] tools/virtio: move module license stub to module.h

2013-07-07 Thread Rusty Russell
Michael S. Tsirkin m...@redhat.com writes:
 This fixes build for the vringh test:
 [linux]$ make -C tools/virtio/
 make: Entering directory `/home/mst/scm/linux/tools/virtio'
 cc -g -O2 -Wall -I. -I ../../usr/include/ -Wno-pointer-sign
 -fno-strict-overflow -fno-strict-aliasing -fno-common -MMD
 -U_FORTIFY_SOURCE   -c -o vringh.o ../../drivers/vhost/vringh.c
 ../../drivers/vhost/vringh.c:1010:16: error: expected declaration
 specifiers or ‘...’ before string constant

 Signed-off-by: Michael S. Tsirkin m...@redhat.com
 ---
  tools/virtio/linux/module.h | 5 +
  tools/virtio/linux/virtio.h | 3 ---
  2 files changed, 5 insertions(+), 3 deletions(-)

Applied, thanks.

Cheers,
Rusty.

 diff --git a/tools/virtio/linux/module.h b/tools/virtio/linux/module.h
 index 3039a7e..28ce95a 100644
 --- a/tools/virtio/linux/module.h
 +++ b/tools/virtio/linux/module.h
 @@ -1 +1,6 @@
  #include linux/export.h
 +
 +#define MODULE_LICENSE(__MODULE_LICENSE_value) \
 + static __attribute__((unused)) const char *__MODULE_LICENSE_name = \
 + __MODULE_LICENSE_value
 +
 diff --git a/tools/virtio/linux/virtio.h b/tools/virtio/linux/virtio.h
 index cd80183..8447830 100644
 --- a/tools/virtio/linux/virtio.h
 +++ b/tools/virtio/linux/virtio.h
 @@ -45,9 +45,6 @@ struct virtqueue {
   void *priv;
  };
  
 -#define MODULE_LICENSE(__MODULE_LICENSE_value) \
 - const char *__MODULE_LICENSE_name = __MODULE_LICENSE_value
 -
  /* Interfaces exported by virtio_ring. */
  int virtqueue_add_sgs(struct virtqueue *vq,
 struct scatterlist *sgs[],
 -- 
 MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC] virtio-pci: new config layout: using memory BAR

2013-07-07 Thread Kevin O'Connor
I only recently saw this email.

On Thu, Jun 06, 2013 at 06:10:12PM +0300, Gleb Natapov wrote:
 On Thu, Jun 06, 2013 at 05:06:32PM +0200, Gerd Hoffmann wrote:
  For seabios itself this isn't a big issue, see pci_{readl,writel} in
  src/pci.c.  When called in 16bit mode it goes into 32bit mode
  temporarily, just for accessing the mmio register.  ahci driver uses it,
  xhci driver (wip atm) will use that too, and virtio-{blk,scsi} drivers
  in seabios can do the same.
  
 Isn't this approach broken? How can SeaBIOS be sure it restores the real
 mode registers to exactly the same state they were in before entering
 32bit mode?

You are correct - SeaBIOS can't fully restore the hidden segment
registers.  So, in a way it is broken.

In practice, it seems to work on modern bootloaders (eg, ntldr, grub,
lilo).  It definitely doesn't work with EMM386 (old DOS stuff), but
does seem to work okay with FreeDOS as long as one doesn't run EMM386.

The AHCI code uses this 32bit/16bit trampoline because it would not be
possible to support AHCI otherwise.  I haven't seen any complaints of
failures with the AHCI code - probably because people using AHCI are
using modern guests.

I explored this a bit some time back and the only way I could think of
to reliably restore the 16bit registers would be via SMM.
Unfortunately, using SMM introduces a whole host of complexity and
problems.

-Kevin
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] virtio: include asm/barrier explicitly

2013-07-07 Thread Stephen Rothwell
Hi Rusty,

On Mon, 08 Jul 2013 11:31:45 +0930 Rusty Russell ru...@rustcorp.com.au wrote:

 PS.  I'll squeeze these into this merge window, since they're trivial.

s/'re trivial/fix bugs/  ;-)

-- 
Cheers,
Stephen Rothwells...@canb.auug.org.au


pgpQCBOOsFabg.pgp
Description: PGP signature


Re: [PATCH 4/8] powerpc: Prepare to support kernel handling of IOMMU map/unmap

2013-07-07 Thread Benjamin Herrenschmidt
On Sun, 2013-07-07 at 01:07 +1000, Alexey Kardashevskiy wrote:
 The current VFIO-on-POWER implementation supports only user mode
 driven mapping, i.e. QEMU is sending requests to map/unmap pages.
 However this approach is really slow, so we want to move that to KVM.
 Since H_PUT_TCE can be extremely performance sensitive (especially with
 network adapters where each packet needs to be mapped/unmapped) we chose
 to implement that as a fast hypercall directly in real
 mode (processor still in the guest context but MMU off).
 
 To be able to do that, we need to provide some facilities to
 access the struct page count within that real mode environment as things
 like the sparsemem vmemmap mappings aren't accessible.
 
 This adds an API to increment/decrement page counter as
 get_user_pages API used for user mode mapping does not work
 in the real mode.
 
 CONFIG_SPARSEMEM_VMEMMAP and CONFIG_FLATMEM are supported.

This patch will need an ack from mm people to make sure they are ok
with our approach and ack the change to the generic header.

(Added linux-mm).

Cheers,
Ben.

 Reviewed-by: Paul Mackerras pau...@samba.org
 Signed-off-by: Paul Mackerras pau...@samba.org
 Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
 
 ---
 
 Changes:
 2013/06/27:
 * realmode_get_page() fixed to use get_page_unless_zero(). If failed,
 the call will be passed from real to virtual mode and safely handled.
 * added comment to PageCompound() in include/linux/page-flags.h.
 
 2013/05/20:
 * PageTail() is replaced by PageCompound() in order to have the same checks
 for whether the page is huge in realmode_get_page() and realmode_put_page()
 
 Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
 ---
  arch/powerpc/include/asm/pgtable-ppc64.h |  4 ++
  arch/powerpc/mm/init_64.c| 78 
 +++-
  include/linux/page-flags.h   |  4 +-
  3 files changed, 84 insertions(+), 2 deletions(-)
 
 diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h 
 b/arch/powerpc/include/asm/pgtable-ppc64.h
 index e3d55f6f..7b46e5f 100644
 --- a/arch/powerpc/include/asm/pgtable-ppc64.h
 +++ b/arch/powerpc/include/asm/pgtable-ppc64.h
 @@ -376,6 +376,10 @@ static inline pte_t *find_linux_pte_or_hugepte(pgd_t 
 *pgdir, unsigned long ea,
  }
  #endif /* !CONFIG_HUGETLB_PAGE */
  
 +struct page *realmode_pfn_to_page(unsigned long pfn);
 +int realmode_get_page(struct page *page);
 +int realmode_put_page(struct page *page);
 +
  #endif /* __ASSEMBLY__ */
  
  #endif /* _ASM_POWERPC_PGTABLE_PPC64_H_ */
 diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
 index a90b9c4..7031be3 100644
 --- a/arch/powerpc/mm/init_64.c
 +++ b/arch/powerpc/mm/init_64.c
 @@ -297,5 +297,81 @@ void vmemmap_free(unsigned long start, unsigned long end)
  {
  }
  
 -#endif /* CONFIG_SPARSEMEM_VMEMMAP */
 +/*
 + * We do not have access to the sparsemem vmemmap, so we fallback to
 + * walking the list of sparsemem blocks which we already maintain for
 + * the sake of crashdump. In the long run, we might want to maintain
 + * a tree if performance of that linear walk becomes a problem.
 + *
 + * Any of realmode_ functions can fail due to:
 + * 1) As real sparsemem blocks do not lie in RAM contiguously (they
 + * are in virtual address space which is not available in the real mode),
 + * the requested page struct can be split between blocks so get_page/put_page
 + * may fail.
 + * 2) When huge pages are used, the get_page/put_page API will fail
 + * in real mode as the linked addresses in the page struct are virtual
 + * too.
 + * When 1) or 2) takes place, the API returns an error code to cause
 + * an exit to kernel virtual mode where the operation will be completed.
 + */
 +struct page *realmode_pfn_to_page(unsigned long pfn)
 +{
 + struct vmemmap_backing *vmem_back;
 + struct page *page;
 + unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
 + unsigned long pg_va = (unsigned long) pfn_to_page(pfn);
  
 + for (vmem_back = vmemmap_list; vmem_back; vmem_back = vmem_back->list) {
 + if (pg_va < vmem_back->virt_addr)
 + continue;
 +
 + /* Check that page struct is not split between real pages */
 + if ((pg_va + sizeof(struct page)) >
 + (vmem_back->virt_addr + page_size))
 + return NULL;
 +
 + page = (struct page *) (vmem_back->phys + pg_va -
 + vmem_back->virt_addr);
 + return page;
 + }
 +
 + return NULL;
 +}
 +EXPORT_SYMBOL_GPL(realmode_pfn_to_page);
 +
 +#elif defined(CONFIG_FLATMEM)
 +
 +struct page *realmode_pfn_to_page(unsigned long pfn)
 +{
 + struct page *page = pfn_to_page(pfn);
 + return page;
 +}
 +EXPORT_SYMBOL_GPL(realmode_pfn_to_page);
 +
 +#endif /* CONFIG_SPARSEMEM_VMEMMAP/CONFIG_FLATMEM */
 +
 +#if defined(CONFIG_SPARSEMEM_VMEMMAP) || defined(CONFIG_FLATMEM)
 +int realmode_get_page(struct page *page)
 +{
 +