Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel

2011-02-25 Thread Michael S. Tsirkin
On Fri, Feb 25, 2011 at 02:12:42PM +0800, Sheng Yang wrote:
 On Thursday 24 February 2011 18:17:34 Michael S. Tsirkin wrote:
  On Thu, Feb 24, 2011 at 05:44:20PM +0800, Sheng Yang wrote:
   On Wednesday 23 February 2011 16:45:37 Michael S. Tsirkin wrote:
On Wed, Feb 23, 2011 at 02:59:04PM +0800, Sheng Yang wrote:
 On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote:
  On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote:
   Then we can support mask bit operation of assigned devices now.
  
  Looks pretty good overall.  A few comments below.  It seems like we
  should be able to hook this into vfio with a small stub in kvm.  We
  just need to be able to communicate disabling and enabling of
  individual msix vectors.  For brute force, we could do this with a
  lot of eventfds, triggered by kvm and consumed by vfio, two per
  MSI-X vector.  Not sure if there's something smaller that could do
  it. Thanks,
 
 Alex, thanks for your comments. See my comments below:
  Alex
  
   Signed-off-by: Sheng Yang sh...@linux.intel.com
   ---
   
arch/x86/kvm/Makefile|2 +-
arch/x86/kvm/x86.c   |8 +-
include/linux/kvm.h  |   21 
include/linux/kvm_host.h |   25 
virt/kvm/assigned-dev.c  |   44 +++
virt/kvm/kvm_main.c  |   38 ++-
virt/kvm/msix_mmio.c |  286
++
virt/kvm/msix_mmio.h

|   25 

8 files changed, 442 insertions(+), 7 deletions(-)
create mode 100644 virt/kvm/msix_mmio.c
create mode 100644 virt/kvm/msix_mmio.h
   
   diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
   index f15501f..3a0d851 100644
   --- a/arch/x86/kvm/Makefile
   +++ b/arch/x86/kvm/Makefile
   @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.
   
kvm-y+= $(addprefix ../../../virt/kvm/, 
   kvm_main.o 
 ioapic.o
   
   \
   
 coalesced_mmio.o irq_comm.o eventfd.o \
   
   - assigned-dev.o)
   + assigned-dev.o msix_mmio.o)
   
kvm-$(CONFIG_IOMMU_API)  += $(addprefix ../../../virt/kvm/,
iommu.o) kvm-$(CONFIG_KVM_ASYNC_PF)  += $(addprefix
../../../virt/kvm/, async_pf.o)
   
   diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
   index fa708c9..89bf12c 100644
   --- a/arch/x86/kvm/x86.c
   +++ b/arch/x86/kvm/x86.c
   @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext)
   
 case KVM_CAP_X86_ROBUST_SINGLESTEP:
 case KVM_CAP_XSAVE:
   
 case KVM_CAP_ASYNC_PF:
   + case KVM_CAP_MSIX_MMIO:
 r = 1;
 break;
 
 case KVM_CAP_COALESCED_MMIO:
   @@ -3807,6 +3808,7 @@ static int
   emulator_write_emulated_onepage(unsigned long addr,
   
struct kvm_vcpu *vcpu)

{

 gpa_t gpa;
   
   + int r;
   
 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
   
   @@ -3822,14 +3824,16 @@ static int
   emulator_write_emulated_onepage(unsigned long addr,
   
mmio:
 trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
   
   + r = vcpu_mmio_write(vcpu, gpa, bytes, val);
   
 /*
 
  * Is this MMIO handled locally?
  */
   
   - if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
   + if (!r)
   
 return X86EMUL_CONTINUE;
 
 vcpu-mmio_needed = 1;
   
   - vcpu-run-exit_reason = KVM_EXIT_MMIO;
   + vcpu-run-exit_reason = (r == -ENOTSYNC) ?
   + KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO;

This use of -ENOTSYNC is IMO confusing.
How about we make vcpu_mmio_write return the positive exit reason?
Negative value will mean an error.
   
    In fact currently a negative value means something more needs to be done,
    the same as an MMIO exit.
  
  So it would be
  if (!r)
  return X86EMUL_CONTINUE;
  
  vcpu-run-exit_reason = r;
  
   Now I think we can keep it, or update them all later.
  
  The way to do this would be
  1. patch to return KVM_EXIT_MMIO on mmio
  2. your patch that returns KVM_EXIT_MSIX_ROUTING_UPDATE on top
 
 It's not that straightforward. In most conditions, the reason 
 vcpu_mmio_write() < 0 
 is because KVM itself is unable to complete the request. That's quite 
 straightforward. 
 But each handler in the chain can't decide that it would be KVM_EXIT_MMIO; they 
 can 
 only know when all of them fail to handle the access.
 

Well, this just violates the standard API for return value.

The standard semantics are:
- negative - error
- positive - not an error

So you can just return a 

Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel

2011-02-24 Thread Sheng Yang
On Wednesday 23 February 2011 16:45:37 Michael S. Tsirkin wrote:
 On Wed, Feb 23, 2011 at 02:59:04PM +0800, Sheng Yang wrote:
  On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote:
   On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote:
Then we can support mask bit operation of assigned devices now.
   
   Looks pretty good overall.  A few comments below.  It seems like we
   should be able to hook this into vfio with a small stub in kvm.  We
   just need to be able to communicate disabling and enabling of
   individual msix vectors.  For brute force, we could do this with a lot
   of eventfds, triggered by kvm and consumed by vfio, two per MSI-X
   vector.  Not sure if there's something smaller that could do it. 
   Thanks,
  
  Alex, thanks for your comments. See my comments below:
   Alex
   
Signed-off-by: Sheng Yang sh...@linux.intel.com
---

 arch/x86/kvm/Makefile|2 +-
 arch/x86/kvm/x86.c   |8 +-
 include/linux/kvm.h  |   21 
 include/linux/kvm_host.h |   25 
 virt/kvm/assigned-dev.c  |   44 +++
 virt/kvm/kvm_main.c  |   38 ++-
 virt/kvm/msix_mmio.c |  286
 ++ virt/kvm/msix_mmio.h
 
 |   25 
 
 8 files changed, 442 insertions(+), 7 deletions(-)
 create mode 100644 virt/kvm/msix_mmio.c
 create mode 100644 virt/kvm/msix_mmio.h

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index f15501f..3a0d851 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.

 kvm-y  += $(addprefix ../../../virt/kvm/, kvm_main.o 
ioapic.o 
\
 
coalesced_mmio.o irq_comm.o eventfd.o \

-   assigned-dev.o)
+   assigned-dev.o msix_mmio.o)

 kvm-$(CONFIG_IOMMU_API)+= $(addprefix ../../../virt/kvm/, 
iommu.o)
 kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/,
 async_pf.o)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fa708c9..89bf12c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext)

case KVM_CAP_X86_ROBUST_SINGLESTEP:
case KVM_CAP_XSAVE:

case KVM_CAP_ASYNC_PF:
+   case KVM_CAP_MSIX_MMIO:
r = 1;
break;

case KVM_CAP_COALESCED_MMIO:
@@ -3807,6 +3808,7 @@ static int
emulator_write_emulated_onepage(unsigned long addr,

   struct kvm_vcpu *vcpu)
 
 {
 
gpa_t gpa;

+   int r;

gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);

@@ -3822,14 +3824,16 @@ static int
emulator_write_emulated_onepage(unsigned long addr,

 mmio:
trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);

+   r = vcpu_mmio_write(vcpu, gpa, bytes, val);

/*

 * Is this MMIO handled locally?
 */

-   if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
+   if (!r)

return X86EMUL_CONTINUE;

vcpu-mmio_needed = 1;

-   vcpu-run-exit_reason = KVM_EXIT_MMIO;
+   vcpu-run-exit_reason = (r == -ENOTSYNC) ?
+   KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO;
 
 This use of -ENOTSYNC is IMO confusing.
 How about we make vcpu_mmio_write return the positive exit reason?
 Negative value will mean an error.

Make sense. I would update it.
 
vcpu-run-mmio.phys_addr = vcpu-mmio_phys_addr = gpa;
vcpu-run-mmio.len = vcpu-mmio_size = bytes;
vcpu-run-mmio.is_write = vcpu-mmio_is_write = 1;

diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index ea2dc1a..ad9df4b 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -161,6 +161,7 @@ struct kvm_pit_config {

 #define KVM_EXIT_NMI  16
 #define KVM_EXIT_INTERNAL_ERROR   17
 #define KVM_EXIT_OSI  18

+#define KVM_EXIT_MSIX_ROUTING_UPDATE 19

 /* For KVM_EXIT_INTERNAL_ERROR */
 #define KVM_INTERNAL_ERROR_EMULATION 1

@@ -541,6 +542,7 @@ struct kvm_ppc_pvinfo {

 #define KVM_CAP_PPC_GET_PVINFO 57
 #define KVM_CAP_PPC_IRQ_LEVEL 58
 #define KVM_CAP_ASYNC_PF 59

+#define KVM_CAP_MSIX_MMIO 60

 #ifdef KVM_CAP_IRQ_ROUTING

@@ -672,6 +674,9 @@ struct kvm_clock_data {

 #define KVM_XEN_HVM_CONFIG_IOW(KVMIO,  0x7a, struct
 kvm_xen_hvm_config) #define KVM_SET_CLOCK _IOW(KVMIO,
 0x7b, struct kvm_clock_data) #define KVM_GET_CLOCK
 _IOR(KVMIO,  0x7c, struct 

Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel

2011-02-24 Thread Sheng Yang
On Wednesday 23 February 2011 16:45:37 Michael S. Tsirkin wrote:
 On Wed, Feb 23, 2011 at 02:59:04PM +0800, Sheng Yang wrote:
  On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote:
   On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote:
Then we can support mask bit operation of assigned devices now.
   
   Looks pretty good overall.  A few comments below.  It seems like we
   should be able to hook this into vfio with a small stub in kvm.  We
   just need to be able to communicate disabling and enabling of
   individual msix vectors.  For brute force, we could do this with a lot
   of eventfds, triggered by kvm and consumed by vfio, two per MSI-X
   vector.  Not sure if there's something smaller that could do it. 
   Thanks,
  
  Alex, thanks for your comments. See my comments below:
   Alex
   
Signed-off-by: Sheng Yang sh...@linux.intel.com
---

 arch/x86/kvm/Makefile|2 +-
 arch/x86/kvm/x86.c   |8 +-
 include/linux/kvm.h  |   21 
 include/linux/kvm_host.h |   25 
 virt/kvm/assigned-dev.c  |   44 +++
 virt/kvm/kvm_main.c  |   38 ++-
 virt/kvm/msix_mmio.c |  286
 ++ virt/kvm/msix_mmio.h
 
 |   25 
 
 8 files changed, 442 insertions(+), 7 deletions(-)
 create mode 100644 virt/kvm/msix_mmio.c
 create mode 100644 virt/kvm/msix_mmio.h

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index f15501f..3a0d851 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.

 kvm-y  += $(addprefix ../../../virt/kvm/, kvm_main.o 
ioapic.o 
\
 
coalesced_mmio.o irq_comm.o eventfd.o \

-   assigned-dev.o)
+   assigned-dev.o msix_mmio.o)

 kvm-$(CONFIG_IOMMU_API)+= $(addprefix ../../../virt/kvm/, 
iommu.o)
 kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/,
 async_pf.o)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fa708c9..89bf12c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext)

case KVM_CAP_X86_ROBUST_SINGLESTEP:
case KVM_CAP_XSAVE:

case KVM_CAP_ASYNC_PF:
+   case KVM_CAP_MSIX_MMIO:
r = 1;
break;

case KVM_CAP_COALESCED_MMIO:
@@ -3807,6 +3808,7 @@ static int
emulator_write_emulated_onepage(unsigned long addr,

   struct kvm_vcpu *vcpu)
 
 {
 
gpa_t gpa;

+   int r;

gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);

@@ -3822,14 +3824,16 @@ static int
emulator_write_emulated_onepage(unsigned long addr,

 mmio:
trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);

+   r = vcpu_mmio_write(vcpu, gpa, bytes, val);

/*

 * Is this MMIO handled locally?
 */

-   if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
+   if (!r)

return X86EMUL_CONTINUE;

vcpu-mmio_needed = 1;

-   vcpu-run-exit_reason = KVM_EXIT_MMIO;
+   vcpu-run-exit_reason = (r == -ENOTSYNC) ?
+   KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO;
 
 This use of -ENOTSYNC is IMO confusing.
 How about we make vcpu_mmio_write return the positive exit reason?
 Negative value will mean an error.

In fact currently a negative value means something more needs to be done, the same 
as 
an MMIO exit. Now I think we can keep it, or update them all later.

--
regards
Yang, Sheng
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel

2011-02-24 Thread Michael S. Tsirkin
On Thu, Feb 24, 2011 at 04:08:22PM +0800, Sheng Yang wrote:
 On Wednesday 23 February 2011 16:45:37 Michael S. Tsirkin wrote:
  On Wed, Feb 23, 2011 at 02:59:04PM +0800, Sheng Yang wrote:
   On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote:
On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote:
 Then we can support mask bit operation of assigned devices now.

Looks pretty good overall.  A few comments below.  It seems like we
should be able to hook this into vfio with a small stub in kvm.  We
just need to be able to communicate disabling and enabling of
individual msix vectors.  For brute force, we could do this with a lot
of eventfds, triggered by kvm and consumed by vfio, two per MSI-X
vector.  Not sure if there's something smaller that could do it. 
Thanks,
   
   Alex, thanks for your comments. See my comments below:
Alex

 Signed-off-by: Sheng Yang sh...@linux.intel.com
 ---
 
  arch/x86/kvm/Makefile|2 +-
  arch/x86/kvm/x86.c   |8 +-
  include/linux/kvm.h  |   21 
  include/linux/kvm_host.h |   25 
  virt/kvm/assigned-dev.c  |   44 +++
  virt/kvm/kvm_main.c  |   38 ++-
  virt/kvm/msix_mmio.c |  286
  ++ virt/kvm/msix_mmio.h
  
  |   25 
  
  8 files changed, 442 insertions(+), 7 deletions(-)
  create mode 100644 virt/kvm/msix_mmio.c
  create mode 100644 virt/kvm/msix_mmio.h
 
 diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
 index f15501f..3a0d851 100644
 --- a/arch/x86/kvm/Makefile
 +++ b/arch/x86/kvm/Makefile
 @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.
 
  kvm-y+= $(addprefix ../../../virt/kvm/, 
 kvm_main.o ioapic.o 
 \
  
   coalesced_mmio.o irq_comm.o eventfd.o \
 
 - assigned-dev.o)
 + assigned-dev.o msix_mmio.o)
 
  kvm-$(CONFIG_IOMMU_API)  += $(addprefix ../../../virt/kvm/, 
 iommu.o)
  kvm-$(CONFIG_KVM_ASYNC_PF)   += $(addprefix ../../../virt/kvm/,
  async_pf.o)
 
 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
 index fa708c9..89bf12c 100644
 --- a/arch/x86/kvm/x86.c
 +++ b/arch/x86/kvm/x86.c
 @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 
   case KVM_CAP_X86_ROBUST_SINGLESTEP:
   case KVM_CAP_XSAVE:
 
   case KVM_CAP_ASYNC_PF:
 + case KVM_CAP_MSIX_MMIO:
   r = 1;
   break;
   
   case KVM_CAP_COALESCED_MMIO:
 @@ -3807,6 +3808,7 @@ static int
 emulator_write_emulated_onepage(unsigned long addr,
 
  struct kvm_vcpu *vcpu)
  
  {
  
   gpa_t gpa;
 
 + int r;
 
   gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
 
 @@ -3822,14 +3824,16 @@ static int
 emulator_write_emulated_onepage(unsigned long addr,
 
  mmio:
   trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
 
 + r = vcpu_mmio_write(vcpu, gpa, bytes, val);
 
   /*
   
* Is this MMIO handled locally?
*/
 
 - if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
 + if (!r)
 
   return X86EMUL_CONTINUE;
   
   vcpu-mmio_needed = 1;
 
 - vcpu-run-exit_reason = KVM_EXIT_MMIO;
 + vcpu-run-exit_reason = (r == -ENOTSYNC) ?
 + KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO;
  
  This use of -ENOTSYNC is IMO confusing.
  How about we make vcpu_mmio_write return the positive exit reason?
  Negative value will mean an error.
 
 Make sense. I would update it.
  
   vcpu-run-mmio.phys_addr = vcpu-mmio_phys_addr = gpa;
   vcpu-run-mmio.len = vcpu-mmio_size = bytes;
   vcpu-run-mmio.is_write = vcpu-mmio_is_write = 1;
 
 diff --git a/include/linux/kvm.h b/include/linux/kvm.h
 index ea2dc1a..ad9df4b 100644
 --- a/include/linux/kvm.h
 +++ b/include/linux/kvm.h
 @@ -161,6 +161,7 @@ struct kvm_pit_config {
 
  #define KVM_EXIT_NMI  16
  #define KVM_EXIT_INTERNAL_ERROR   17
  #define KVM_EXIT_OSI  18
 
 +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19
 
  /* For KVM_EXIT_INTERNAL_ERROR */
  #define KVM_INTERNAL_ERROR_EMULATION 1
 
 @@ -541,6 +542,7 @@ struct kvm_ppc_pvinfo {
 
  #define KVM_CAP_PPC_GET_PVINFO 57
  #define KVM_CAP_PPC_IRQ_LEVEL 58
  #define KVM_CAP_ASYNC_PF 59
 
 +#define KVM_CAP_MSIX_MMIO 60
 
  #ifdef KVM_CAP_IRQ_ROUTING
 
 @@ -672,6 +674,9 @@ struct kvm_clock_data {
 
  #define KVM_XEN_HVM_CONFIG_IOW(KVMIO,  0x7a, struct
  kvm_xen_hvm_config) 

Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel

2011-02-24 Thread Michael S. Tsirkin
On Thu, Feb 24, 2011 at 05:44:20PM +0800, Sheng Yang wrote:
 On Wednesday 23 February 2011 16:45:37 Michael S. Tsirkin wrote:
  On Wed, Feb 23, 2011 at 02:59:04PM +0800, Sheng Yang wrote:
   On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote:
On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote:
 Then we can support mask bit operation of assigned devices now.

Looks pretty good overall.  A few comments below.  It seems like we
should be able to hook this into vfio with a small stub in kvm.  We
just need to be able to communicate disabling and enabling of
individual msix vectors.  For brute force, we could do this with a lot
of eventfds, triggered by kvm and consumed by vfio, two per MSI-X
vector.  Not sure if there's something smaller that could do it. 
Thanks,
   
   Alex, thanks for your comments. See my comments below:
Alex

 Signed-off-by: Sheng Yang sh...@linux.intel.com
 ---
 
  arch/x86/kvm/Makefile|2 +-
  arch/x86/kvm/x86.c   |8 +-
  include/linux/kvm.h  |   21 
  include/linux/kvm_host.h |   25 
  virt/kvm/assigned-dev.c  |   44 +++
  virt/kvm/kvm_main.c  |   38 ++-
  virt/kvm/msix_mmio.c |  286
  ++ virt/kvm/msix_mmio.h
  
  |   25 
  
  8 files changed, 442 insertions(+), 7 deletions(-)
  create mode 100644 virt/kvm/msix_mmio.c
  create mode 100644 virt/kvm/msix_mmio.h
 
 diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
 index f15501f..3a0d851 100644
 --- a/arch/x86/kvm/Makefile
 +++ b/arch/x86/kvm/Makefile
 @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.
 
  kvm-y+= $(addprefix ../../../virt/kvm/, 
 kvm_main.o ioapic.o 
 \
  
   coalesced_mmio.o irq_comm.o eventfd.o \
 
 - assigned-dev.o)
 + assigned-dev.o msix_mmio.o)
 
  kvm-$(CONFIG_IOMMU_API)  += $(addprefix ../../../virt/kvm/, 
 iommu.o)
  kvm-$(CONFIG_KVM_ASYNC_PF)   += $(addprefix ../../../virt/kvm/,
  async_pf.o)
 
 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
 index fa708c9..89bf12c 100644
 --- a/arch/x86/kvm/x86.c
 +++ b/arch/x86/kvm/x86.c
 @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 
   case KVM_CAP_X86_ROBUST_SINGLESTEP:
   case KVM_CAP_XSAVE:
 
   case KVM_CAP_ASYNC_PF:
 + case KVM_CAP_MSIX_MMIO:
   r = 1;
   break;
   
   case KVM_CAP_COALESCED_MMIO:
 @@ -3807,6 +3808,7 @@ static int
 emulator_write_emulated_onepage(unsigned long addr,
 
  struct kvm_vcpu *vcpu)
  
  {
  
   gpa_t gpa;
 
 + int r;
 
   gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
 
 @@ -3822,14 +3824,16 @@ static int
 emulator_write_emulated_onepage(unsigned long addr,
 
  mmio:
   trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
 
 + r = vcpu_mmio_write(vcpu, gpa, bytes, val);
 
   /*
   
* Is this MMIO handled locally?
*/
 
 - if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
 + if (!r)
 
   return X86EMUL_CONTINUE;
   
   vcpu-mmio_needed = 1;
 
 - vcpu-run-exit_reason = KVM_EXIT_MMIO;
 + vcpu-run-exit_reason = (r == -ENOTSYNC) ?
 + KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO;
  
  This use of -ENOTSYNC is IMO confusing.
  How about we make vcpu_mmio_write return the positive exit reason?
  Negative value will mean an error.
 
 In fact currently a negative value means something more needs to be done, the 
 same as 
 an MMIO exit.


So it would be
if (!r)
return X86EMUL_CONTINUE;

vcpu-run-exit_reason = r;

 Now I think we can keep it, or update them all later.

The way to do this would be
1. patch to return KVM_EXIT_MMIO on mmio
2. your patch that returns KVM_EXIT_MSIX_ROUTING_UPDATE on top

 
 --
 regards
 Yang, Sheng
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel

2011-02-24 Thread Sheng Yang
On Thursday 24 February 2011 18:11:44 Michael S. Tsirkin wrote:
 On Thu, Feb 24, 2011 at 04:08:22PM +0800, Sheng Yang wrote:
  On Wednesday 23 February 2011 16:45:37 Michael S. Tsirkin wrote:
   On Wed, Feb 23, 2011 at 02:59:04PM +0800, Sheng Yang wrote:
On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote:
 On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote:
  Then we can support mask bit operation of assigned devices now.
 
 Looks pretty good overall.  A few comments below.  It seems like we
 should be able to hook this into vfio with a small stub in kvm.  We
 just need to be able to communicate disabling and enabling of
 individual msix vectors.  For brute force, we could do this with a
 lot of eventfds, triggered by kvm and consumed by vfio, two per
 MSI-X vector.  Not sure if there's something smaller that could do
 it. Thanks,

Alex, thanks for your comments. See my comments below:
 Alex
 
  Signed-off-by: Sheng Yang sh...@linux.intel.com
  ---
  
   arch/x86/kvm/Makefile|2 +-
   arch/x86/kvm/x86.c   |8 +-
   include/linux/kvm.h  |   21 
   include/linux/kvm_host.h |   25 
   virt/kvm/assigned-dev.c  |   44 +++
   virt/kvm/kvm_main.c  |   38 ++-
   virt/kvm/msix_mmio.c |  286
   ++
   virt/kvm/msix_mmio.h
   
   |   25 
   
   8 files changed, 442 insertions(+), 7 deletions(-)
   create mode 100644 virt/kvm/msix_mmio.c
   create mode 100644 virt/kvm/msix_mmio.h
  
  diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
  index f15501f..3a0d851 100644
  --- a/arch/x86/kvm/Makefile
  +++ b/arch/x86/kvm/Makefile
  @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.
  
   kvm-y  += $(addprefix ../../../virt/kvm/, 
  kvm_main.o 
ioapic.o
  
  \
  
  coalesced_mmio.o irq_comm.o eventfd.o \
  
  -   assigned-dev.o)
  +   assigned-dev.o msix_mmio.o)
  
   kvm-$(CONFIG_IOMMU_API)+= $(addprefix ../../../virt/kvm/,
   iommu.o) kvm-$(CONFIG_KVM_ASYNC_PF)+= $(addprefix
   ../../../virt/kvm/, async_pf.o)
  
  diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
  index fa708c9..89bf12c 100644
  --- a/arch/x86/kvm/x86.c
  +++ b/arch/x86/kvm/x86.c
  @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext)
  
  case KVM_CAP_X86_ROBUST_SINGLESTEP:
  case KVM_CAP_XSAVE:
  
  case KVM_CAP_ASYNC_PF:
  +   case KVM_CAP_MSIX_MMIO:
  r = 1;
  break;
  
  case KVM_CAP_COALESCED_MMIO:
  @@ -3807,6 +3808,7 @@ static int
  emulator_write_emulated_onepage(unsigned long addr,
  
 struct kvm_vcpu *vcpu)
   
   {
   
  gpa_t gpa;
  
  +   int r;
  
  gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
  
  @@ -3822,14 +3824,16 @@ static int
  emulator_write_emulated_onepage(unsigned long addr,
  
   mmio:
  trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
  
  +   r = vcpu_mmio_write(vcpu, gpa, bytes, val);
  
  /*
  
   * Is this MMIO handled locally?
   */
  
  -   if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
  +   if (!r)
  
  return X86EMUL_CONTINUE;
  
  vcpu-mmio_needed = 1;
  
  -   vcpu-run-exit_reason = KVM_EXIT_MMIO;
  +   vcpu-run-exit_reason = (r == -ENOTSYNC) ?
  +   KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO;
   
   This use of -ENOTSYNC is IMO confusing.
   How about we make vcpu_mmio_write return the positive exit reason?
   Negative value will mean an error.
  
  Make sense. I would update it.
  
  vcpu-run-mmio.phys_addr = vcpu-mmio_phys_addr = gpa;
  vcpu-run-mmio.len = vcpu-mmio_size = bytes;
  vcpu-run-mmio.is_write = vcpu-mmio_is_write = 1;
  
  diff --git a/include/linux/kvm.h b/include/linux/kvm.h
  index ea2dc1a..ad9df4b 100644
  --- a/include/linux/kvm.h
  +++ b/include/linux/kvm.h
  @@ -161,6 +161,7 @@ struct kvm_pit_config {
  
   #define KVM_EXIT_NMI  16
   #define KVM_EXIT_INTERNAL_ERROR   17
   #define KVM_EXIT_OSI  18
  
  +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19
  
   /* For KVM_EXIT_INTERNAL_ERROR */
   #define KVM_INTERNAL_ERROR_EMULATION 1
  
  @@ -541,6 +542,7 @@ struct kvm_ppc_pvinfo {
  
   #define KVM_CAP_PPC_GET_PVINFO 57
   #define KVM_CAP_PPC_IRQ_LEVEL 58
   #define KVM_CAP_ASYNC_PF 59
  
  +#define KVM_CAP_MSIX_MMIO 60
  
   #ifdef KVM_CAP_IRQ_ROUTING
  
  @@ 

Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel

2011-02-24 Thread Sheng Yang
On Thursday 24 February 2011 18:17:34 Michael S. Tsirkin wrote:
 On Thu, Feb 24, 2011 at 05:44:20PM +0800, Sheng Yang wrote:
  On Wednesday 23 February 2011 16:45:37 Michael S. Tsirkin wrote:
   On Wed, Feb 23, 2011 at 02:59:04PM +0800, Sheng Yang wrote:
On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote:
 On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote:
  Then we can support mask bit operation of assigned devices now.
 
 Looks pretty good overall.  A few comments below.  It seems like we
 should be able to hook this into vfio with a small stub in kvm.  We
 just need to be able to communicate disabling and enabling of
 individual msix vectors.  For brute force, we could do this with a
 lot of eventfds, triggered by kvm and consumed by vfio, two per
 MSI-X vector.  Not sure if there's something smaller that could do
 it. Thanks,

Alex, thanks for your comments. See my comments below:
 Alex
 
  Signed-off-by: Sheng Yang sh...@linux.intel.com
  ---
  
   arch/x86/kvm/Makefile|2 +-
   arch/x86/kvm/x86.c   |8 +-
   include/linux/kvm.h  |   21 
   include/linux/kvm_host.h |   25 
   virt/kvm/assigned-dev.c  |   44 +++
   virt/kvm/kvm_main.c  |   38 ++-
   virt/kvm/msix_mmio.c |  286
   ++
   virt/kvm/msix_mmio.h
   
   |   25 
   
   8 files changed, 442 insertions(+), 7 deletions(-)
   create mode 100644 virt/kvm/msix_mmio.c
   create mode 100644 virt/kvm/msix_mmio.h
  
  diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
  index f15501f..3a0d851 100644
  --- a/arch/x86/kvm/Makefile
  +++ b/arch/x86/kvm/Makefile
  @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.
  
   kvm-y  += $(addprefix ../../../virt/kvm/, 
  kvm_main.o 
ioapic.o
  
  \
  
  coalesced_mmio.o irq_comm.o eventfd.o \
  
  -   assigned-dev.o)
  +   assigned-dev.o msix_mmio.o)
  
   kvm-$(CONFIG_IOMMU_API)+= $(addprefix ../../../virt/kvm/,
   iommu.o) kvm-$(CONFIG_KVM_ASYNC_PF)+= $(addprefix
   ../../../virt/kvm/, async_pf.o)
  
  diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
  index fa708c9..89bf12c 100644
  --- a/arch/x86/kvm/x86.c
  +++ b/arch/x86/kvm/x86.c
  @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext)
  
  case KVM_CAP_X86_ROBUST_SINGLESTEP:
  case KVM_CAP_XSAVE:
  
  case KVM_CAP_ASYNC_PF:
  +   case KVM_CAP_MSIX_MMIO:
  r = 1;
  break;
  
  case KVM_CAP_COALESCED_MMIO:
  @@ -3807,6 +3808,7 @@ static int
  emulator_write_emulated_onepage(unsigned long addr,
  
 struct kvm_vcpu *vcpu)
   
   {
   
  gpa_t gpa;
  
  +   int r;
  
  gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
  
  @@ -3822,14 +3824,16 @@ static int
  emulator_write_emulated_onepage(unsigned long addr,
  
   mmio:
  trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
  
  +   r = vcpu_mmio_write(vcpu, gpa, bytes, val);
  
  /*
  
   * Is this MMIO handled locally?
   */
  
  -   if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
  +   if (!r)
  
  return X86EMUL_CONTINUE;
  
  vcpu-mmio_needed = 1;
  
  -   vcpu-run-exit_reason = KVM_EXIT_MMIO;
  +   vcpu-run-exit_reason = (r == -ENOTSYNC) ?
  +   KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO;
   
   This use of -ENOTSYNC is IMO confusing.
   How about we make vcpu_mmio_write return the positive exit reason?
   Negative value will mean an error.
  
   In fact currently a negative value means something more needs to be done,
   the same as an MMIO exit.
 
 So it would be
   if (!r)
   return X86EMUL_CONTINUE;
 
   vcpu-run-exit_reason = r;
 
  Now I think we can keep it, or update them all later.
 
 The way to do this would be
 1. patch to return KVM_EXIT_MMIO on mmio
 2. your patch that returns KVM_EXIT_MSIX_ROUTING_UPDATE on top

It's not that straightforward. In most conditions, the reason vcpu_mmio_write() 
< 0 
is because KVM itself is unable to complete the request. That's quite 
straightforward. 
But each handler in the chain can't decide that it would be KVM_EXIT_MMIO; they 
can 
only know when all of them fail to handle the access.

I am not sure we want every single handler to say "I want KVM_EXIT_MMIO" 
instead 
of returning an error. We can discuss this more, but since it's not an API/ABI 
change, 
I think we can get the patch in first.

--
regards
Yang, Sheng

 
  --
  regards
  Yang, Sheng
--
To unsubscribe from 

Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel

2011-02-23 Thread Michael S. Tsirkin
On Wed, Feb 23, 2011 at 02:59:04PM +0800, Sheng Yang wrote:
 On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote:
  On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote:
   Then we can support mask bit operation of assigned devices now.
  
  Looks pretty good overall.  A few comments below.  It seems like we
  should be able to hook this into vfio with a small stub in kvm.  We just
  need to be able to communicate disabling and enabling of individual msix
  vectors.  For brute force, we could do this with a lot of eventfds,
  triggered by kvm and consumed by vfio, two per MSI-X vector.  Not sure
  if there's something smaller that could do it.  Thanks,
 
 Alex, thanks for your comments. See my comments below:
 
  
  Alex
  
   Signed-off-by: Sheng Yang sh...@linux.intel.com
   ---
   
arch/x86/kvm/Makefile|2 +-
arch/x86/kvm/x86.c   |8 +-
include/linux/kvm.h  |   21 
include/linux/kvm_host.h |   25 
virt/kvm/assigned-dev.c  |   44 +++
virt/kvm/kvm_main.c  |   38 ++-
virt/kvm/msix_mmio.c |  286
++ virt/kvm/msix_mmio.h
|   25 
8 files changed, 442 insertions(+), 7 deletions(-)
create mode 100644 virt/kvm/msix_mmio.c
create mode 100644 virt/kvm/msix_mmio.h
   
   diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
   index f15501f..3a0d851 100644
   --- a/arch/x86/kvm/Makefile
   +++ b/arch/x86/kvm/Makefile
   @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.
   
kvm-y+= $(addprefix ../../../virt/kvm/, kvm_main.o 
   ioapic.o \

 coalesced_mmio.o irq_comm.o eventfd.o \
   
   - assigned-dev.o)
   + assigned-dev.o msix_mmio.o)
   
kvm-$(CONFIG_IOMMU_API)  += $(addprefix ../../../virt/kvm/, iommu.o)
kvm-$(CONFIG_KVM_ASYNC_PF)   += $(addprefix ../../../virt/kvm/,
async_pf.o)
   
   diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
   index fa708c9..89bf12c 100644
   --- a/arch/x86/kvm/x86.c
   +++ b/arch/x86/kvm/x86.c
   @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext)
   
 case KVM_CAP_X86_ROBUST_SINGLESTEP:
 case KVM_CAP_XSAVE:
   
 case KVM_CAP_ASYNC_PF:
   + case KVM_CAP_MSIX_MMIO:
 r = 1;
 break;
 
 case KVM_CAP_COALESCED_MMIO:
   @@ -3807,6 +3808,7 @@ static int emulator_write_emulated_onepage(unsigned
   long addr,
   
struct kvm_vcpu *vcpu)

{

 gpa_t gpa;
   
   + int r;
   
 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
   
   @@ -3822,14 +3824,16 @@ static int
   emulator_write_emulated_onepage(unsigned long addr,
   
mmio:
 trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
   
   + r = vcpu_mmio_write(vcpu, gpa, bytes, val);
   
 /*
 
  * Is this MMIO handled locally?
  */
   
   - if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
   + if (!r)
   
 return X86EMUL_CONTINUE;
 
 vcpu-mmio_needed = 1;
   
   - vcpu-run-exit_reason = KVM_EXIT_MMIO;
   + vcpu-run-exit_reason = (r == -ENOTSYNC) ?
   + KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO;

This use of -ENOTSYNC is IMO confusing.
How about we make vcpu_mmio_write return the positive exit reason?
Negative value will mean an error.

   
 vcpu-run-mmio.phys_addr = vcpu-mmio_phys_addr = gpa;
 vcpu-run-mmio.len = vcpu-mmio_size = bytes;
 vcpu-run-mmio.is_write = vcpu-mmio_is_write = 1;
   
   diff --git a/include/linux/kvm.h b/include/linux/kvm.h
   index ea2dc1a..ad9df4b 100644
   --- a/include/linux/kvm.h
   +++ b/include/linux/kvm.h
   @@ -161,6 +161,7 @@ struct kvm_pit_config {
   
#define KVM_EXIT_NMI  16
#define KVM_EXIT_INTERNAL_ERROR   17
#define KVM_EXIT_OSI  18
   
   +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19
   
/* For KVM_EXIT_INTERNAL_ERROR */
#define KVM_INTERNAL_ERROR_EMULATION 1
   
   @@ -541,6 +542,7 @@ struct kvm_ppc_pvinfo {
   
#define KVM_CAP_PPC_GET_PVINFO 57
#define KVM_CAP_PPC_IRQ_LEVEL 58
#define KVM_CAP_ASYNC_PF 59
   
   +#define KVM_CAP_MSIX_MMIO 60
   
#ifdef KVM_CAP_IRQ_ROUTING
   
   @@ -672,6 +674,9 @@ struct kvm_clock_data {
   
#define KVM_XEN_HVM_CONFIG_IOW(KVMIO,  0x7a, struct
kvm_xen_hvm_config) #define KVM_SET_CLOCK _IOW(KVMIO, 
0x7b, struct kvm_clock_data) #define KVM_GET_CLOCK
_IOR(KVMIO,  0x7c, struct kvm_clock_data)
   
   +/* Available with KVM_CAP_MSIX_MMIO */
   +#define KVM_REGISTER_MSIX_MMIO_IOW(KVMIO,  0x7d, struct
   kvm_msix_mmio_user) +#define KVM_UNREGISTER_MSIX_MMIO  _IOW(KVMIO, 
   0x7e, struct kvm_msix_mmio_user)
   
/* Available with KVM_CAP_PIT_STATE2 */
#define KVM_GET_PIT2  _IOR(KVMIO,  0x9f, struct
kvm_pit_state2) #define KVM_SET_PIT2  _IOW(KVMIO,  0xa0,
struct 

Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel

2011-02-23 Thread Alex Williamson
On Wed, 2011-02-23 at 14:59 +0800, Sheng Yang wrote:
 On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote:
  On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote:
   +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr,
   int len, +void *val)
   +{
   + struct kvm_msix_mmio_dev *mmio_dev =
   + container_of(this, struct kvm_msix_mmio_dev, table_dev);
   + struct kvm_msix_mmio *mmio;
   + int idx, ret = 0, entry, offset, r;
   +
   + mutex_lock(&mmio_dev->lock);
   + idx = get_mmio_table_index(mmio_dev, addr, len);
   + if (idx < 0) {
   + ret = -EOPNOTSUPP;
   + goto out;
   + }
   + if ((addr & 0x3) || (len != 4 && len != 8))
   + goto out;
   +
   + offset = addr & 0xf;
   + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
   + goto out;
   +
   + mmio = &mmio_dev->mmio[idx];
   + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
   + r = copy_from_user(val, (void __user *)(mmio->table_base_va +
   + entry * PCI_MSIX_ENTRY_SIZE + offset), len);
   + if (r)
   + goto out;
   +out:
   + mutex_unlock(&mmio_dev->lock);
   + return ret;
   +}
   +
   +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr,
   + int len, const void *val)
   +{
   + struct kvm_msix_mmio_dev *mmio_dev =
   + container_of(this, struct kvm_msix_mmio_dev, table_dev);
   + struct kvm_msix_mmio *mmio;
   + int idx, entry, offset, ret = 0, r = 0;
   + gpa_t entry_base;
   + u32 old_ctrl, new_ctrl;
   + u32 *ctrl_pos;
   +
   + mutex_lock(&mmio_dev->kvm->lock);
   + mutex_lock(&mmio_dev->lock);
   + idx = get_mmio_table_index(mmio_dev, addr, len);
   + if (idx < 0) {
   + ret = -EOPNOTSUPP;
   + goto out;
   + }
   + if ((addr & 0x3) || (len != 4 && len != 8))
   + goto out;
   +
   + offset = addr & 0xF;
  
   nit, this 0xf above.
  
  So you like another mask?
 
 I'm just noting that above you used 0xf and here you used 0xF.
 
   + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
   + goto out;
   +
   + mmio = &mmio_dev->mmio[idx];
   + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
   + entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
   + ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);
   +
   + if (get_user(old_ctrl, ctrl_pos))
   + goto out;
   +
   + /* No allow writing to other fields when entry is unmasked */
   + if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
   + offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
   + goto out;
   +
   + if (copy_to_user((void __user *)(entry_base + offset), val, len))
   + goto out;
   +
   + if (get_user(new_ctrl, ctrl_pos))
   + goto out;
   +
   + if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) ||
   + (offset < PCI_MSIX_ENTRY_DATA && len == 8))
   + ret = -ENOTSYNC;
  
  Couldn't we avoid the above get_user(new_ctrl,...) if we tested this
  first?  I'd also prefer to see a direct goto out here since it's not
  entirely obvious that this first 'if' always falls into the below 'if'.
  I'm not a fan of using an error code as a special non-error return
  either.
 
 In fact when offset==PCI_MSIX_ENTRY_DATA and len ==8, we need to check 
 new_ctrl to 
 see if they indeed modified ctrl bits.
 
 I would try to make the logic here more clear. 

Isn't that what you're testing for though?

(offset  PCI_MSIX_ENTRY_VECTOR_CTRL  len == 4)

If this is true, we can only be writing address, upper address, or data,
not control.

(offset  PCI_MSIX_ENTRY_DATA  len == 8)

If this is true, we're writing address and upper address, not the
control vector.

Additionally, I think the size and alignment test should be more
restrictive.  We really only want to allow naturally aligned 4 or 8 byte
accesses.  Maybe instead of:

if ((addr & 0x3) || (len != 4 && len != 8))

we should do this:

if (!(len == 4 || len == 8) || addr & (len - 1))

Then the test could become:

if (offset + len <= PCI_MSIX_ENTRY_VECTOR_CTRL)

Thanks,

Alex

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel

2011-02-23 Thread Michael S. Tsirkin
On Wed, Feb 23, 2011 at 09:34:19AM -0700, Alex Williamson wrote:
 On Wed, 2011-02-23 at 14:59 +0800, Sheng Yang wrote:
  On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote:
   On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote:
+static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr,
int len, +  void *val)
+{
+   struct kvm_msix_mmio_dev *mmio_dev =
+   container_of(this, struct kvm_msix_mmio_dev, table_dev);
+   struct kvm_msix_mmio *mmio;
+   int idx, ret = 0, entry, offset, r;
+
+   mutex_lock(mmio_dev-lock);
+   idx = get_mmio_table_index(mmio_dev, addr, len);
+   if (idx  0) {
+   ret = -EOPNOTSUPP;
+   goto out;
+   }
+   if ((addr  0x3) || (len != 4  len != 8))
+   goto out;
+
+   offset = addr  0xf;
+   if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL  len == 8)
+   goto out;
+
+   mmio = mmio_dev-mmio[idx];
+   entry = (addr - mmio-table_base_addr) / PCI_MSIX_ENTRY_SIZE;
+   r = copy_from_user(val, (void __user *)(mmio-table_base_va +
+   entry * PCI_MSIX_ENTRY_SIZE + offset), len);
+   if (r)
+   goto out;
+out:
+   mutex_unlock(mmio_dev-lock);
+   return ret;
+}
+
+static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t 
addr,
+   int len, const void *val)
+{
+   struct kvm_msix_mmio_dev *mmio_dev =
+   container_of(this, struct kvm_msix_mmio_dev, table_dev);
+   struct kvm_msix_mmio *mmio;
+   int idx, entry, offset, ret = 0, r = 0;
+   gpa_t entry_base;
+   u32 old_ctrl, new_ctrl;
+   u32 *ctrl_pos;
+
+   mutex_lock(mmio_dev-kvm-lock);
+   mutex_lock(mmio_dev-lock);
+   idx = get_mmio_table_index(mmio_dev, addr, len);
+   if (idx  0) {
+   ret = -EOPNOTSUPP;
+   goto out;
+   }
+   if ((addr  0x3) || (len != 4  len != 8))
+   goto out;
+
+   offset = addr  0xF;
   
   nit, this 0xf above.
  
  So you like another mask?
 
 I'm just noting that above you used 0xf and here you used 0xF.
 
+   if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL  len == 8)
+   goto out;
+
+   mmio = mmio_dev-mmio[idx];
+   entry = (addr - mmio-table_base_addr) / PCI_MSIX_ENTRY_SIZE;
+   entry_base = mmio-table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
+   ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);
+
+   if (get_user(old_ctrl, ctrl_pos))
+   goto out;
+
+   /* No allow writing to other fields when entry is unmasked */
+   if (!(old_ctrl  PCI_MSIX_ENTRY_CTRL_MASKBIT) 
+   offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
+   goto out;
+
+   if (copy_to_user((void __user *)(entry_base + offset), val, 
len))
+   goto out;
+
+   if (get_user(new_ctrl, ctrl_pos))
+   goto out;
+
+   if ((offset  PCI_MSIX_ENTRY_VECTOR_CTRL  len == 4) ||
+   (offset  PCI_MSIX_ENTRY_DATA  len == 8))
+   ret = -ENOTSYNC;
   
   Couldn't we avoid the above get_user(new_ctrl,...) if we tested this
   first?  I'd also prefer to see a direct goto out here since it's not
   entirely obvious that this first 'if' always falls into the below 'if'.
   I'm not a fan of using an error code as a special non-error return
   either.
  
  In fact when offset==PCI_MSIX_ENTRY_DATA and len ==8, we need to check 
  new_ctrl to 
  see if they indeed modified ctrl bits.
  
  I would try to make the logic here more clear. 
 
 Isn't that what you're testing for though?
 
 (offset  PCI_MSIX_ENTRY_VECTOR_CTRL  len == 4)
 
 If this is true, we can only be writing address, upper address, or data,
 not control.
 
 (offset  PCI_MSIX_ENTRY_DATA  len == 8)
 
 If this is true, we're writing address and upper address, not the
 control vector.
 
 Additionally, I think the size and alignment test should be more
 restrictive.  We really only want to allow naturally aligned 4 or 8 byte
 accesses.  Maybe instead of:
 
 if ((addr & 0x3) || (len != 4 && len != 8))
 
 we should do this:
 
 if (!(len == 4 || len == 8) || addr & (len - 1))
 
 Then the test could become:
 
 if (offset + len <= PCI_MSIX_ENTRY_VECTOR_CTRL)
 
 Thanks,
 
 Alex

Or rather

 if (offset + len != PCI_MSIX_ENTRY_VECTOR_CTRL + 4)

we are matching offset + length
to control offset + control length,
and avoid depending on the fact control is the last register.

-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  

Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel

2011-02-23 Thread Alex Williamson
On Wed, 2011-02-23 at 20:39 +0200, Michael S. Tsirkin wrote:
 On Wed, Feb 23, 2011 at 09:34:19AM -0700, Alex Williamson wrote:
  On Wed, 2011-02-23 at 14:59 +0800, Sheng Yang wrote:
   On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote:
On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote:
 +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t 
 addr,
 int len, +void *val)
 +{
 + struct kvm_msix_mmio_dev *mmio_dev =
 + container_of(this, struct kvm_msix_mmio_dev, table_dev);
 + struct kvm_msix_mmio *mmio;
 + int idx, ret = 0, entry, offset, r;
 +
 + mutex_lock(mmio_dev-lock);
 + idx = get_mmio_table_index(mmio_dev, addr, len);
 + if (idx  0) {
 + ret = -EOPNOTSUPP;
 + goto out;
 + }
 + if ((addr  0x3) || (len != 4  len != 8))
 + goto out;
 +
 + offset = addr  0xf;
 + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL  len == 8)
 + goto out;
 +
 + mmio = mmio_dev-mmio[idx];
 + entry = (addr - mmio-table_base_addr) / PCI_MSIX_ENTRY_SIZE;
 + r = copy_from_user(val, (void __user *)(mmio-table_base_va +
 + entry * PCI_MSIX_ENTRY_SIZE + offset), len);
 + if (r)
 + goto out;
 +out:
 + mutex_unlock(mmio_dev-lock);
 + return ret;
 +}
 +
 +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t 
 addr,
 + int len, const void *val)
 +{
 + struct kvm_msix_mmio_dev *mmio_dev =
 + container_of(this, struct kvm_msix_mmio_dev, table_dev);
 + struct kvm_msix_mmio *mmio;
 + int idx, entry, offset, ret = 0, r = 0;
 + gpa_t entry_base;
 + u32 old_ctrl, new_ctrl;
 + u32 *ctrl_pos;
 +
 + mutex_lock(mmio_dev-kvm-lock);
 + mutex_lock(mmio_dev-lock);
 + idx = get_mmio_table_index(mmio_dev, addr, len);
 + if (idx  0) {
 + ret = -EOPNOTSUPP;
 + goto out;
 + }
 + if ((addr  0x3) || (len != 4  len != 8))
 + goto out;
 +
 + offset = addr  0xF;

nit, this 0xf above.
   
   So you like another mask?
  
  I'm just noting that above you used 0xf and here you used 0xF.
  
 + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL  len == 8)
 + goto out;
 +
 + mmio = mmio_dev-mmio[idx];
 + entry = (addr - mmio-table_base_addr) / PCI_MSIX_ENTRY_SIZE;
 + entry_base = mmio-table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
 + ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);
 +
 + if (get_user(old_ctrl, ctrl_pos))
 + goto out;
 +
 + /* No allow writing to other fields when entry is unmasked */
 + if (!(old_ctrl  PCI_MSIX_ENTRY_CTRL_MASKBIT) 
 + offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
 + goto out;
 +
 + if (copy_to_user((void __user *)(entry_base + offset), val, 
 len))
 + goto out;
 +
 + if (get_user(new_ctrl, ctrl_pos))
 + goto out;
 +
 + if ((offset  PCI_MSIX_ENTRY_VECTOR_CTRL  len == 4) ||
 + (offset  PCI_MSIX_ENTRY_DATA  len == 8))
 + ret = -ENOTSYNC;

Couldn't we avoid the above get_user(new_ctrl,...) if we tested this
first?  I'd also prefer to see a direct goto out here since it's not
entirely obvious that this first 'if' always falls into the below 'if'.
I'm not a fan of using an error code as a special non-error return
either.
   
   In fact when offset==PCI_MSIX_ENTRY_DATA and len ==8, we need to check 
   new_ctrl to 
   see if they indeed modified ctrl bits.
   
   I would try to make the logic here more clear. 
  
  Isn't that what you're testing for though?
  
  (offset  PCI_MSIX_ENTRY_VECTOR_CTRL  len == 4)
  
  If this is true, we can only be writing address, upper address, or data,
  not control.
  
  (offset  PCI_MSIX_ENTRY_DATA  len == 8)
  
  If this is true, we're writing address and upper address, not the
  control vector.
  
  Additionally, I think the size and alignment test should be more
  restrictive.  We really only want to allow naturally aligned 4 or 8 byte
  accesses.  Maybe instead of:
  
  if ((addr & 0x3) || (len != 4 && len != 8))
  
  we should do this:
  
  if (!(len == 4 || len == 8) || addr & (len - 1))
  
  Then the test could become:
  
  if (offset + len <= PCI_MSIX_ENTRY_VECTOR_CTRL)
  
  Thanks,
  
  Alex
 
 Or rather
 
  if (offset + len != PCI_MSIX_ENTRY_VECTOR_CTRL + 4)
 
 we are matching offset + length
 to control offset + control length,
 and avoid depending on the fact control is the last register.

That's just silly.  We're writing code to match a hardware
specification.  The registers always 

Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel

2011-02-22 Thread Marcelo Tosatti
On Fri, Feb 18, 2011 at 04:15:38PM +0800, Sheng Yang wrote:
  Why allow this ioctl to succeed if there's an entry already present?
  This case is broken as mmio_dev-mmio_nr is increased below.
 
 Oh, It's a bug to let mmio_nr increase even with MMIO found. I've fixed it.
 
 The reason we allow multiple calls is that userspace can register different 
 types of 
 address here (table address and PBA address).

Ah OK. 

  PCI bits must be reviewed...
 
 Pardon? PCI related things are already in 2.6.38-rc.

I meant someone more familiar with PCI should review this patch,
Michael/Alex, CC'ed.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel

2011-02-22 Thread Alex Williamson
On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote:
 Then we can support mask bit operation of assigned devices now.

Looks pretty good overall.  A few comments below.  It seems like we
should be able to hook this into vfio with a small stub in kvm.  We just
need to be able to communicate disabling and enabling of individual msix
vectors.  For brute force, we could do this with a lot of eventfds,
triggered by kvm and consumed by vfio, two per MSI-X vector.  Not sure
if there's something smaller that could do it.  Thanks,

Alex

 Signed-off-by: Sheng Yang sh...@linux.intel.com
 ---
  arch/x86/kvm/Makefile|2 +-
  arch/x86/kvm/x86.c   |8 +-
  include/linux/kvm.h  |   21 
  include/linux/kvm_host.h |   25 
  virt/kvm/assigned-dev.c  |   44 +++
  virt/kvm/kvm_main.c  |   38 ++-
  virt/kvm/msix_mmio.c |  286 
 ++
  virt/kvm/msix_mmio.h |   25 
  8 files changed, 442 insertions(+), 7 deletions(-)
  create mode 100644 virt/kvm/msix_mmio.c
  create mode 100644 virt/kvm/msix_mmio.h
 
 diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
 index f15501f..3a0d851 100644
 --- a/arch/x86/kvm/Makefile
 +++ b/arch/x86/kvm/Makefile
 @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.
  
  kvm-y+= $(addprefix ../../../virt/kvm/, kvm_main.o 
 ioapic.o \
   coalesced_mmio.o irq_comm.o eventfd.o \
 - assigned-dev.o)
 + assigned-dev.o msix_mmio.o)
  kvm-$(CONFIG_IOMMU_API)  += $(addprefix ../../../virt/kvm/, iommu.o)
  kvm-$(CONFIG_KVM_ASYNC_PF)   += $(addprefix ../../../virt/kvm/, async_pf.o)
  
 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
 index fa708c9..89bf12c 100644
 --- a/arch/x86/kvm/x86.c
 +++ b/arch/x86/kvm/x86.c
 @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext)
   case KVM_CAP_X86_ROBUST_SINGLESTEP:
   case KVM_CAP_XSAVE:
   case KVM_CAP_ASYNC_PF:
 + case KVM_CAP_MSIX_MMIO:
   r = 1;
   break;
   case KVM_CAP_COALESCED_MMIO:
 @@ -3807,6 +3808,7 @@ static int emulator_write_emulated_onepage(unsigned 
 long addr,
  struct kvm_vcpu *vcpu)
  {
   gpa_t gpa;
 + int r;
  
   gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
  
 @@ -3822,14 +3824,16 @@ static int emulator_write_emulated_onepage(unsigned 
 long addr,
  
  mmio:
   trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
 + r = vcpu_mmio_write(vcpu, gpa, bytes, val);
   /*
* Is this MMIO handled locally?
*/
 - if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
 + if (!r)
   return X86EMUL_CONTINUE;
  
   vcpu->mmio_needed = 1;
 - vcpu->run->exit_reason = KVM_EXIT_MMIO;
 + vcpu->run->exit_reason = (r == -ENOTSYNC) ?
 + KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO;
   vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
   vcpu->run->mmio.len = vcpu->mmio_size = bytes;
   vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
 diff --git a/include/linux/kvm.h b/include/linux/kvm.h
 index ea2dc1a..ad9df4b 100644
 --- a/include/linux/kvm.h
 +++ b/include/linux/kvm.h
 @@ -161,6 +161,7 @@ struct kvm_pit_config {
  #define KVM_EXIT_NMI  16
  #define KVM_EXIT_INTERNAL_ERROR   17
  #define KVM_EXIT_OSI  18
 +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19
  
  /* For KVM_EXIT_INTERNAL_ERROR */
  #define KVM_INTERNAL_ERROR_EMULATION 1
 @@ -541,6 +542,7 @@ struct kvm_ppc_pvinfo {
  #define KVM_CAP_PPC_GET_PVINFO 57
  #define KVM_CAP_PPC_IRQ_LEVEL 58
  #define KVM_CAP_ASYNC_PF 59
 +#define KVM_CAP_MSIX_MMIO 60
  
  #ifdef KVM_CAP_IRQ_ROUTING
  
 @@ -672,6 +674,9 @@ struct kvm_clock_data {
  #define KVM_XEN_HVM_CONFIG_IOW(KVMIO,  0x7a, struct 
 kvm_xen_hvm_config)
  #define KVM_SET_CLOCK _IOW(KVMIO,  0x7b, struct kvm_clock_data)
  #define KVM_GET_CLOCK _IOR(KVMIO,  0x7c, struct kvm_clock_data)
 +/* Available with KVM_CAP_MSIX_MMIO */
 +#define KVM_REGISTER_MSIX_MMIO_IOW(KVMIO,  0x7d, struct 
 kvm_msix_mmio_user)
 +#define KVM_UNREGISTER_MSIX_MMIO  _IOW(KVMIO,  0x7e, struct 
 kvm_msix_mmio_user)
  /* Available with KVM_CAP_PIT_STATE2 */
  #define KVM_GET_PIT2  _IOR(KVMIO,  0x9f, struct kvm_pit_state2)
  #define KVM_SET_PIT2  _IOW(KVMIO,  0xa0, struct kvm_pit_state2)
 @@ -795,4 +800,20 @@ struct kvm_assigned_msix_entry {
   __u16 padding[3];
  };
  
 +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV  (1  0)
 +
 +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE(1  8)
 +
 +#define KVM_MSIX_MMIO_TYPE_DEV_MASK  0x00ff
 +#define KVM_MSIX_MMIO_TYPE_BASE_MASK 0xff00
 +struct kvm_msix_mmio_user {
 + __u32 dev_id;
 + __u16 type;
 + __u16 max_entries_nr;
 + __u64 base_addr;
 + __u64 base_va;
 + __u64 flags;
 + __u64 reserved[4];
 +};
 +
  #endif /* __LINUX_KVM_H */
 

Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel

2011-02-22 Thread Sheng Yang
On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote:
 On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote:
  Then we can support mask bit operation of assigned devices now.
 
 Looks pretty good overall.  A few comments below.  It seems like we
 should be able to hook this into vfio with a small stub in kvm.  We just
 need to be able to communicate disabling and enabling of individual msix
 vectors.  For brute force, we could do this with a lot of eventfds,
 triggered by kvm and consumed by vfio, two per MSI-X vector.  Not sure
 if there's something smaller that could do it.  Thanks,

Alex, thanks for your comments. See my comments below:

 
 Alex
 
  Signed-off-by: Sheng Yang sh...@linux.intel.com
  ---
  
   arch/x86/kvm/Makefile|2 +-
   arch/x86/kvm/x86.c   |8 +-
   include/linux/kvm.h  |   21 
   include/linux/kvm_host.h |   25 
   virt/kvm/assigned-dev.c  |   44 +++
   virt/kvm/kvm_main.c  |   38 ++-
   virt/kvm/msix_mmio.c |  286
   ++ virt/kvm/msix_mmio.h
   |   25 
   8 files changed, 442 insertions(+), 7 deletions(-)
   create mode 100644 virt/kvm/msix_mmio.c
   create mode 100644 virt/kvm/msix_mmio.h
  
  diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
  index f15501f..3a0d851 100644
  --- a/arch/x86/kvm/Makefile
  +++ b/arch/x86/kvm/Makefile
  @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.
  
   kvm-y  += $(addprefix ../../../virt/kvm/, kvm_main.o 
  ioapic.o \
   
  coalesced_mmio.o irq_comm.o eventfd.o \
  
  -   assigned-dev.o)
  +   assigned-dev.o msix_mmio.o)
  
   kvm-$(CONFIG_IOMMU_API)+= $(addprefix ../../../virt/kvm/, iommu.o)
   kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/,
   async_pf.o)
  
  diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
  index fa708c9..89bf12c 100644
  --- a/arch/x86/kvm/x86.c
  +++ b/arch/x86/kvm/x86.c
  @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext)
  
  case KVM_CAP_X86_ROBUST_SINGLESTEP:
  case KVM_CAP_XSAVE:
  
  case KVM_CAP_ASYNC_PF:
  +   case KVM_CAP_MSIX_MMIO:
  r = 1;
  break;
  
  case KVM_CAP_COALESCED_MMIO:
  @@ -3807,6 +3808,7 @@ static int emulator_write_emulated_onepage(unsigned
  long addr,
  
 struct kvm_vcpu *vcpu)
   
   {
   
  gpa_t gpa;
  
  +   int r;
  
  gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
  
  @@ -3822,14 +3824,16 @@ static int
  emulator_write_emulated_onepage(unsigned long addr,
  
   mmio:
  trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
  
  +   r = vcpu_mmio_write(vcpu, gpa, bytes, val);
  
  /*
  
   * Is this MMIO handled locally?
   */
  
  -   if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
  +   if (!r)
  
  return X86EMUL_CONTINUE;
  
  vcpu-mmio_needed = 1;
  
  -   vcpu-run-exit_reason = KVM_EXIT_MMIO;
  +   vcpu-run-exit_reason = (r == -ENOTSYNC) ?
  +   KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO;
  
  vcpu-run-mmio.phys_addr = vcpu-mmio_phys_addr = gpa;
  vcpu-run-mmio.len = vcpu-mmio_size = bytes;
  vcpu-run-mmio.is_write = vcpu-mmio_is_write = 1;
  
  diff --git a/include/linux/kvm.h b/include/linux/kvm.h
  index ea2dc1a..ad9df4b 100644
  --- a/include/linux/kvm.h
  +++ b/include/linux/kvm.h
  @@ -161,6 +161,7 @@ struct kvm_pit_config {
  
   #define KVM_EXIT_NMI  16
   #define KVM_EXIT_INTERNAL_ERROR   17
   #define KVM_EXIT_OSI  18
  
  +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19
  
   /* For KVM_EXIT_INTERNAL_ERROR */
   #define KVM_INTERNAL_ERROR_EMULATION 1
  
  @@ -541,6 +542,7 @@ struct kvm_ppc_pvinfo {
  
   #define KVM_CAP_PPC_GET_PVINFO 57
   #define KVM_CAP_PPC_IRQ_LEVEL 58
   #define KVM_CAP_ASYNC_PF 59
  
  +#define KVM_CAP_MSIX_MMIO 60
  
   #ifdef KVM_CAP_IRQ_ROUTING
  
  @@ -672,6 +674,9 @@ struct kvm_clock_data {
  
   #define KVM_XEN_HVM_CONFIG_IOW(KVMIO,  0x7a, struct
   kvm_xen_hvm_config) #define KVM_SET_CLOCK _IOW(KVMIO, 
   0x7b, struct kvm_clock_data) #define KVM_GET_CLOCK
   _IOR(KVMIO,  0x7c, struct kvm_clock_data)
  
  +/* Available with KVM_CAP_MSIX_MMIO */
  +#define KVM_REGISTER_MSIX_MMIO_IOW(KVMIO,  0x7d, struct
  kvm_msix_mmio_user) +#define KVM_UNREGISTER_MSIX_MMIO  _IOW(KVMIO, 
  0x7e, struct kvm_msix_mmio_user)
  
   /* Available with KVM_CAP_PIT_STATE2 */
   #define KVM_GET_PIT2  _IOR(KVMIO,  0x9f, struct
   kvm_pit_state2) #define KVM_SET_PIT2  _IOW(KVMIO,  0xa0,
   struct kvm_pit_state2)
  
  @@ -795,4 +800,20 @@ struct kvm_assigned_msix_entry {
  
  __u16 padding[3];
   
   };
  
  +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV(1  0)
  +
  +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE  (1  8)
  +
  +#define KVM_MSIX_MMIO_TYPE_DEV_MASK0x00ff
  

Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel

2011-02-18 Thread Sheng Yang
On Thursday 03 February 2011 09:05:55 Marcelo Tosatti wrote:
 On Sun, Jan 30, 2011 at 01:11:15PM +0800, Sheng Yang wrote:
  Then we can support mask bit operation of assigned devices now.
  
  Signed-off-by: Sheng Yang sh...@linux.intel.com
  
  +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
  +   struct kvm_msix_mmio_user *mmio_user)
  +{
  +   struct kvm_msix_mmio_dev *mmio_dev = kvm-msix_mmio_dev;
  +   struct kvm_msix_mmio *mmio = NULL;
  +   int r = 0, i;
  +
  +   mutex_lock(mmio_dev-lock);
  +   for (i = 0; i  mmio_dev-mmio_nr; i++) {
  +   if (mmio_dev-mmio[i].dev_id == mmio_user-dev_id 
  +   (mmio_dev-mmio[i].type  KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
  +   (mmio_user-type  KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
  +   mmio = mmio_dev-mmio[i];
  +   if (mmio-max_entries_nr != mmio_user-max_entries_nr) {
  +   r = -EINVAL;
  +   goto out;
  +   }
  +   break;
  +   }
 
 Why allow this ioctl to succeed if there's an entry already present?
 This case is broken as mmio_dev-mmio_nr is increased below.

Oh, It's a bug to let mmio_nr increase even with MMIO found. I've fixed it.

The reason we allow multiple calls is that userspace can register different types 
of 
address here (table address and PBA address).

 PCI bits must be reviewed...

Pardon? PCI related things are already in 2.6.38-rc.

--
regards
Yang, Sheng


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel

2011-02-02 Thread Marcelo Tosatti
On Sun, Jan 30, 2011 at 01:11:15PM +0800, Sheng Yang wrote:
 Then we can support mask bit operation of assigned devices now.
 
 Signed-off-by: Sheng Yang sh...@linux.intel.com

 +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
 + struct kvm_msix_mmio_user *mmio_user)
 +{
 + struct kvm_msix_mmio_dev *mmio_dev = kvm-msix_mmio_dev;
 + struct kvm_msix_mmio *mmio = NULL;
 + int r = 0, i;
 +
 + mutex_lock(mmio_dev-lock);
 + for (i = 0; i  mmio_dev-mmio_nr; i++) {
 + if (mmio_dev-mmio[i].dev_id == mmio_user-dev_id 
 + (mmio_dev-mmio[i].type  KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
 + (mmio_user-type  KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
 + mmio = mmio_dev-mmio[i];
 + if (mmio-max_entries_nr != mmio_user-max_entries_nr) {
 + r = -EINVAL;
 + goto out;
 + }
 + break;
 + }

Why allow this ioctl to succeed if there's an entry already present?
This case is broken as mmio_dev-mmio_nr is increased below.

PCI bits must be reviewed...

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel

2011-01-31 Thread Avi Kivity

On 01/30/2011 06:38 AM, Sheng Yang wrote:

(Sorry, missed this mail...)

On Mon, Jan 17, 2011 at 02:29:44PM +0200, Avi Kivity wrote:
  On 01/06/2011 12:19 PM, Sheng Yang wrote:
  Then we can support mask bit operation of assigned devices now.
  
  
  
  +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
  + int assigned_dev_id, int entry, bool mask)
  +{
  + int r = -EFAULT;
  + struct kvm_assigned_dev_kernel *adev;
  + int i;
  +
  + if (!irqchip_in_kernel(kvm))
  + return r;
  +
  + mutex_lock(kvm-lock);
  + adev = kvm_find_assigned_dev(kvm-arch.assigned_dev_head,
  +   assigned_dev_id);
  + if (!adev)
  + goto out;
  +
  + for (i = 0; i   adev-entries_nr; i++)
  + if (adev-host_msix_entries[i].entry == entry) {
  + if (mask)
  + disable_irq_nosync(
  + adev-host_msix_entries[i].vector);

  Is it okay to call disable_irq_nosync() here?  IIRC we don't check
  the mask bit on irq delivery, so we may forward an interrupt to the
  guest after the mask bit was set.

  What does pci say about the mask bit?  when does it take effect?

  Another question is whether disable_irq_nosync() actually programs
  the device mask bit, or not.  If it does, then it's slow, and it may
  be better to leave interrupts enabled but have an internal pending
  bit.  If it doesn't program the mask bit, it's fine.

I think Michael and Jan had explained this.

  + else
  + enable_irq(adev-host_msix_entries[i].vector);
  + r = 0;
  + break;
  + }
  +out:
  + mutex_unlock(kvm-lock);
  + return r;
  +}
  
  +
  +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, 
int len,
  + void *val)
  +{
  + struct kvm_msix_mmio_dev *mmio_dev =
  + container_of(this, struct kvm_msix_mmio_dev, table_dev);
  + struct kvm_msix_mmio *mmio;
  + int idx, ret = 0, entry, offset, r;
  +
  + mutex_lock(mmio_dev-lock);
  + idx = get_mmio_table_index(mmio_dev, addr, len);
  + if (idx   0) {
  + ret = -EOPNOTSUPP;
  + goto out;
  + }
  + if ((addr   0x3) || (len != 4   len != 8))
  + goto out;
  +
  + offset = addr   0xf;
  + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL   len == 8)
  + goto out;
  +
  + mmio =mmio_dev-mmio[idx];
  + entry = (addr - mmio-table_base_addr) / PCI_MSIX_ENTRY_SIZE;
  + r = copy_from_user(val, (void __user *)(mmio-table_base_va +
  + entry * PCI_MSIX_ENTRY_SIZE + offset), len);
  + if (r)
  + goto out;

  and return ret == 0?

Yes. This operation should be handled by in-kernel MSI-X MMIO. So we return 0
in order to omit this action. We can add warning to it later.


But it failed.  We need to return -EFAULT.


The same as above.

  +
  + if ((offset   PCI_MSIX_ENTRY_VECTOR_CTRL   len == 4) ||
  + (offset   PCI_MSIX_ENTRY_DATA   len == 8))
  + ret = -ENOTSYNC;

  goto out?

No. This judgement only checks whether the MSI data/address was touched. The line
below then checks whether we need to operate the mask bit, because in theory the
guest can use len=8 to modify the MSI-X data and ctrl at the same time.



Ok, makes sense.

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel

2011-01-31 Thread Sheng Yang
On Mon, Jan 31, 2011 at 03:09:09PM +0200, Avi Kivity wrote:
 On 01/30/2011 06:38 AM, Sheng Yang wrote:
 (Sorry, missed this mail...)
 
 On Mon, Jan 17, 2011 at 02:29:44PM +0200, Avi Kivity wrote:
   On 01/06/2011 12:19 PM, Sheng Yang wrote:
   Then we can support mask bit operation of assigned devices now.
   
   
   
   +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
   +int assigned_dev_id, int entry, bool 
  mask)
   +{
   +int r = -EFAULT;
   +struct kvm_assigned_dev_kernel *adev;
   +int i;
   +
   +if (!irqchip_in_kernel(kvm))
   +return r;
   +
   +mutex_lock(kvm-lock);
   +adev = kvm_find_assigned_dev(kvm-arch.assigned_dev_head,
   +  assigned_dev_id);
   +if (!adev)
   +goto out;
   +
   +for (i = 0; i   adev-entries_nr; i++)
   +if (adev-host_msix_entries[i].entry == entry) {
   +if (mask)
   +disable_irq_nosync(
   +
  adev-host_msix_entries[i].vector);
 
   Is it okay to call disable_irq_nosync() here?  IIRC we don't check
   the mask bit on irq delivery, so we may forward an interrupt to the
   guest after the mask bit was set.
 
   What does pci say about the mask bit?  when does it take effect?
 
   Another question is whether disable_irq_nosync() actually programs
   the device mask bit, or not.  If it does, then it's slow, and it may
   be better to leave interrupts enabled but have an internal pending
   bit.  If it doesn't program the mask bit, it's fine.
 
 I think Michael and Jan had explained this.
 
   +else
   +
  enable_irq(adev-host_msix_entries[i].vector);
   +r = 0;
   +break;
   +}
   +out:
   +mutex_unlock(kvm-lock);
   +return r;
   +}
   
   +
   +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, 
  int len,
   +void *val)
   +{
   +struct kvm_msix_mmio_dev *mmio_dev =
   +container_of(this, struct kvm_msix_mmio_dev, table_dev);
   +struct kvm_msix_mmio *mmio;
   +int idx, ret = 0, entry, offset, r;
   +
   +mutex_lock(mmio_dev-lock);
   +idx = get_mmio_table_index(mmio_dev, addr, len);
   +if (idx   0) {
   +ret = -EOPNOTSUPP;
   +goto out;
   +}
   +if ((addr   0x3) || (len != 4   len != 8))
   +goto out;
   +
   +offset = addr   0xf;
   +if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL   len == 8)
   +goto out;
   +
   +mmio =mmio_dev-mmio[idx];
   +entry = (addr - mmio-table_base_addr) / PCI_MSIX_ENTRY_SIZE;
   +r = copy_from_user(val, (void __user *)(mmio-table_base_va +
   +entry * PCI_MSIX_ENTRY_SIZE + offset), len);
   +if (r)
   +goto out;
 
   and return ret == 0?
 
 Yes. This operation should be handled by in-kernel MSI-X MMIO. So we return 0
 in order to omit this action. We can add warning to it later.
 
 But it failed.  We need to return -EFAULT.

So it would return to QEmu. OK, let QEmu print a warning about it.

-- 
regards
Yang, Sheng
 
 The same as above.
 
   +
   +if ((offset   PCI_MSIX_ENTRY_VECTOR_CTRL   len == 4) ||
   +(offset   PCI_MSIX_ENTRY_DATA   len == 8))
   +ret = -ENOTSYNC;
 
   goto out?
 
 No. This judgement only check if MSI data/address was touched. And the line
 below would check if we need to operate mask bit. Because in theory guest can
 use len=8 to modify MSI-X data and ctrl at the same time.
 
 
 Ok, makes sense.
 
 -- 
 error compiling committee.c: too many arguments to function
 

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel

2011-01-29 Thread Sheng Yang
(Sorry, missed this mail...)

On Mon, Jan 17, 2011 at 02:29:44PM +0200, Avi Kivity wrote:
 On 01/06/2011 12:19 PM, Sheng Yang wrote:
 Then we can support mask bit operation of assigned devices now.
 
 
 
 +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
 +int assigned_dev_id, int entry, bool mask)
 +{
 +int r = -EFAULT;
 +struct kvm_assigned_dev_kernel *adev;
 +int i;
 +
 +if (!irqchip_in_kernel(kvm))
 +return r;
 +
 +mutex_lock(kvm-lock);
 +adev = kvm_find_assigned_dev(kvm-arch.assigned_dev_head,
 +  assigned_dev_id);
 +if (!adev)
 +goto out;
 +
 +for (i = 0; i  adev-entries_nr; i++)
 +if (adev-host_msix_entries[i].entry == entry) {
 +if (mask)
 +disable_irq_nosync(
 +adev-host_msix_entries[i].vector);
 
 Is it okay to call disable_irq_nosync() here?  IIRC we don't check
 the mask bit on irq delivery, so we may forward an interrupt to the
 guest after the mask bit was set.
 
 What does pci say about the mask bit?  when does it take effect?
 
 Another question is whether disable_irq_nosync() actually programs
 the device mask bit, or not.  If it does, then it's slow, and it may
 be better to leave interrupts enabled but have an internal pending
 bit.  If it doesn't program the mask bit, it's fine.

I think Michael and Jan had explained this.
 
 +else
 +enable_irq(adev-host_msix_entries[i].vector);
 +r = 0;
 +break;
 +}
 +out:
 +mutex_unlock(kvm-lock);
 +return r;
 +}
 
 +
 +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int 
 len,
 +void *val)
 +{
 +struct kvm_msix_mmio_dev *mmio_dev =
 +container_of(this, struct kvm_msix_mmio_dev, table_dev);
 +struct kvm_msix_mmio *mmio;
 +int idx, ret = 0, entry, offset, r;
 +
 +mutex_lock(mmio_dev-lock);
 +idx = get_mmio_table_index(mmio_dev, addr, len);
 +if (idx  0) {
 +ret = -EOPNOTSUPP;
 +goto out;
 +}
 +if ((addr  0x3) || (len != 4  len != 8))
 +goto out;
 +
 +offset = addr  0xf;
 +if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL  len == 8)
 +goto out;
 +
 +mmio =mmio_dev-mmio[idx];
 +entry = (addr - mmio-table_base_addr) / PCI_MSIX_ENTRY_SIZE;
 +r = copy_from_user(val, (void __user *)(mmio-table_base_va +
 +entry * PCI_MSIX_ENTRY_SIZE + offset), len);
 +if (r)
 +goto out;
 
 and return ret == 0?

Yes. This operation should be handled by in-kernel MSI-X MMIO. So we return 0
in order to omit this action. We can add warning to it later.
 
 +out:
 +mutex_unlock(mmio_dev-lock);
 +return ret;
 +}
 +
 +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr,
 +int len, const void *val)
 +{
 +struct kvm_msix_mmio_dev *mmio_dev =
 +container_of(this, struct kvm_msix_mmio_dev, table_dev);
 +struct kvm_msix_mmio *mmio;
 +int idx, entry, offset, ret = 0, r = 0;
 +gpa_t entry_base;
 +u32 old_ctrl, new_ctrl;
 +u32 *ctrl_pos;
 +
 +mutex_lock(mmio_dev-lock);
 +idx = get_mmio_table_index(mmio_dev, addr, len);
 +if (idx  0) {
 +ret = -EOPNOTSUPP;
 +goto out;
 +}
 +if ((addr  0x3) || (len != 4  len != 8))
 +goto out;
 +
 +offset = addr  0xF;
 +if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL  len == 8)
 +goto out;
 +
 +mmio =mmio_dev-mmio[idx];
 +entry = (addr - mmio-table_base_addr) / PCI_MSIX_ENTRY_SIZE;
 +entry_base = mmio-table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
 +ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);
 +
 +if (get_user(old_ctrl, ctrl_pos))
 +goto out;
 +
 +/* No allow writing to other fields when entry is unmasked */
 +if (!(old_ctrl  PCI_MSIX_ENTRY_CTRL_MASKBIT)
 +offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
 +goto out;
 +
 +if (copy_to_user((void __user *)(entry_base + offset), val, len))
 +goto out;
 +
 +if (get_user(new_ctrl, ctrl_pos))
 +goto out;
 
 here, too.

The same as above.
 
 +
 +if ((offset  PCI_MSIX_ENTRY_VECTOR_CTRL  len == 4) ||
 +(offset  PCI_MSIX_ENTRY_DATA  len == 8))
 +ret = -ENOTSYNC;
 
 goto out?

No. This judgement only check if MSI data/address was touched. And the line
below would check if we need to operate mask bit. Because in theory guest can
use len=8 to modify MSI-X data and ctrl at the same time.

-- 
regards
Yang, Sheng
 
 +if (old_ctrl == new_ctrl)
 +goto out;
 +if (!(old_ctrl  PCI_MSIX_ENTRY_CTRL_MASKBIT)
 +(new_ctrl  PCI_MSIX_ENTRY_CTRL_MASKBIT))
 +r = update_msix_mask_bit(mmio_dev-kvm, 

[PATCH 2/3] KVM: Emulate MSI-X table in kernel

2011-01-29 Thread Sheng Yang
Then we can support mask bit operation of assigned devices now.

Signed-off-by: Sheng Yang sh...@linux.intel.com
---
 arch/x86/kvm/Makefile|2 +-
 arch/x86/kvm/x86.c   |8 +-
 include/linux/kvm.h  |   21 
 include/linux/kvm_host.h |   25 
 virt/kvm/assigned-dev.c  |   44 +++
 virt/kvm/kvm_main.c  |   38 ++-
 virt/kvm/msix_mmio.c |  286 ++
 virt/kvm/msix_mmio.h |   25 
 8 files changed, 442 insertions(+), 7 deletions(-)
 create mode 100644 virt/kvm/msix_mmio.c
 create mode 100644 virt/kvm/msix_mmio.h

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index f15501f..3a0d851 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.
 
 kvm-y  += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
coalesced_mmio.o irq_comm.o eventfd.o \
-   assigned-dev.o)
+   assigned-dev.o msix_mmio.o)
 kvm-$(CONFIG_IOMMU_API)+= $(addprefix ../../../virt/kvm/, iommu.o)
 kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o)
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fa708c9..89bf12c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_X86_ROBUST_SINGLESTEP:
case KVM_CAP_XSAVE:
case KVM_CAP_ASYNC_PF:
+   case KVM_CAP_MSIX_MMIO:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -3807,6 +3808,7 @@ static int emulator_write_emulated_onepage(unsigned long 
addr,
   struct kvm_vcpu *vcpu)
 {
gpa_t gpa;
+   int r;
 
gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
 
@@ -3822,14 +3824,16 @@ static int emulator_write_emulated_onepage(unsigned 
long addr,
 
 mmio:
trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
+   r = vcpu_mmio_write(vcpu, gpa, bytes, val);
/*
 * Is this MMIO handled locally?
 */
-   if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
+   if (!r)
return X86EMUL_CONTINUE;
 
vcpu-mmio_needed = 1;
-   vcpu-run-exit_reason = KVM_EXIT_MMIO;
+   vcpu-run-exit_reason = (r == -ENOTSYNC) ?
+   KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO;
vcpu-run-mmio.phys_addr = vcpu-mmio_phys_addr = gpa;
vcpu-run-mmio.len = vcpu-mmio_size = bytes;
vcpu-run-mmio.is_write = vcpu-mmio_is_write = 1;
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index ea2dc1a..ad9df4b 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -161,6 +161,7 @@ struct kvm_pit_config {
 #define KVM_EXIT_NMI  16
 #define KVM_EXIT_INTERNAL_ERROR   17
 #define KVM_EXIT_OSI  18
+#define KVM_EXIT_MSIX_ROUTING_UPDATE 19
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 #define KVM_INTERNAL_ERROR_EMULATION 1
@@ -541,6 +542,7 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_PPC_GET_PVINFO 57
 #define KVM_CAP_PPC_IRQ_LEVEL 58
 #define KVM_CAP_ASYNC_PF 59
+#define KVM_CAP_MSIX_MMIO 60
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -672,6 +674,9 @@ struct kvm_clock_data {
 #define KVM_XEN_HVM_CONFIG_IOW(KVMIO,  0x7a, struct kvm_xen_hvm_config)
 #define KVM_SET_CLOCK _IOW(KVMIO,  0x7b, struct kvm_clock_data)
 #define KVM_GET_CLOCK _IOR(KVMIO,  0x7c, struct kvm_clock_data)
+/* Available with KVM_CAP_MSIX_MMIO */
+#define KVM_REGISTER_MSIX_MMIO_IOW(KVMIO,  0x7d, struct kvm_msix_mmio_user)
+#define KVM_UNREGISTER_MSIX_MMIO  _IOW(KVMIO,  0x7e, struct kvm_msix_mmio_user)
 /* Available with KVM_CAP_PIT_STATE2 */
 #define KVM_GET_PIT2  _IOR(KVMIO,  0x9f, struct kvm_pit_state2)
 #define KVM_SET_PIT2  _IOW(KVMIO,  0xa0, struct kvm_pit_state2)
@@ -795,4 +800,20 @@ struct kvm_assigned_msix_entry {
__u16 padding[3];
 };
 
+#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV(1  0)
+
+#define KVM_MSIX_MMIO_TYPE_BASE_TABLE  (1  8)
+
+#define KVM_MSIX_MMIO_TYPE_DEV_MASK0x00ff
+#define KVM_MSIX_MMIO_TYPE_BASE_MASK   0xff00
+struct kvm_msix_mmio_user {
+   __u32 dev_id;
+   __u16 type;
+   __u16 max_entries_nr;
+   __u64 base_addr;
+   __u64 base_va;
+   __u64 flags;
+   __u64 reserved[4];
+};
+
 #endif /* __LINUX_KVM_H */
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 7d313e0..c10670c 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -233,6 +233,27 @@ struct kvm_memslots {
KVM_PRIVATE_MEM_SLOTS];
 };
 
+#define KVM_MSIX_MMIO_MAX32
+
+struct kvm_msix_mmio {
+   u32 dev_id;
+   u16 type;
+   u16 max_entries_nr;
+   u64 flags;
+   gpa_t table_base_addr;
+   hva_t table_base_va;
+   gpa_t pba_base_addr;
+   hva_t 

Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel

2011-01-19 Thread Sheng Yang
On Monday 17 January 2011 20:39:30 Marcelo Tosatti wrote:
 On Mon, Jan 17, 2011 at 08:18:22PM +0800, Sheng Yang wrote:
+   goto out;
+
+   mmio = mmio_dev-mmio[idx];
+   entry = (addr - mmio-table_base_addr) / PCI_MSIX_ENTRY_SIZE;
+   entry_base = mmio-table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
+   ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);
+
+   if (get_user(old_ctrl, ctrl_pos))
+   goto out;
+
+   /* No allow writing to other fields when entry is unmasked */
+   if (!(old_ctrl  PCI_MSIX_ENTRY_CTRL_MASKBIT) 
+   offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
+   goto out;
+
+   if (copy_to_user((void __user *)(entry_base + offset), val, 
len))
+   goto out;
   
   Instead of copying to/from userspace (which is subject to swapin,
   unexpected values), you could include the guest written value in a
   kvm_run structure, along with address. Qemu-kvm would use that to
   synchronize its copy of the table, on KVM_EXIT_MSIX_ROUTING_UPDATE
   exit.
  
  We want to acelerate MSI-X mask bit accessing, which won't exit to
  userspace in the most condition. That's the cost we want to optimize.
  Also it's possible to userspace to read the correct value of MMIO(but
  mostly userspace can't write to it in order to prevent synchronize
  issue).
 
 Yes, i meant exit to userspace only when necessary, but instead of
 copying directly everytime record the value the guest has written in
 kvm_run and exit with KVM_EXIT_MSIX_ROUTING_UPDATE.
 
+   if (get_user(new_ctrl, ctrl_pos))
+   goto out;
+
+   if ((offset  PCI_MSIX_ENTRY_VECTOR_CTRL  len == 4) ||
+   (offset  PCI_MSIX_ENTRY_DATA  len == 8))
+   ret = -ENOTSYNC;
+   if (old_ctrl == new_ctrl)
+   goto out;
+   if (!(old_ctrl  PCI_MSIX_ENTRY_CTRL_MASKBIT) 
+   (new_ctrl  PCI_MSIX_ENTRY_CTRL_MASKBIT))
+   r = update_msix_mask_bit(mmio_dev-kvm, mmio, entry, 1);
+   else if ((old_ctrl  PCI_MSIX_ENTRY_CTRL_MASKBIT) 
+   !(new_ctrl  PCI_MSIX_ENTRY_CTRL_MASKBIT))
+   r = update_msix_mask_bit(mmio_dev-kvm, mmio, entry, 0);
   
   Then you rely on the kernel copy of the values to enable/disable irq.
  
  Yes, they are guest owned assigned IRQs. Any potential issue?
 
 Nothing prevents the kernel from enabling or disabling irq multiple
 times with this code (what prevents it is a qemu-kvm that behaves as
 expected). This is not very good.
 
 Perhaps the guest can only harm itself with that, but i'm not sure.

MSI-X interrupts are not shared, so I think the guest can only harm itself if it
was doing it wrong.
 
 And also if an msix table page is swapped out guest will hang.
 
+   return r;
+}
   
   This is not consistent with registration, where there are no checks
   regarding validity of assigned device id. So why is it necessary?
  
  I am not quite understand. We need to free mmio anyway, otherwise it
  would result in wrong mmio interception...
 
 If you return -EOPNOTSUPP in case kvm_find_assigned_dev fails in the
 read/write paths, there is no need to free anything.

It may work with assigned devices, but the potential users of this include vfio
drivers and emulated devices. And I don't think it's a good idea to have a
registration process but no free process...

--
regards
Yang, Sheng

 
   There is a lock ordering problem BTW:
   
   - read/write handlers: dev-lock - kvm-lock
   - vm_ioctl_deassign_device - kvm_free_msix_mmio: kvm-lock -
   dev-lock
  
  Good catch! Would fix it(and other comments of course).
  
  --
  regards
  Yang, Sheng
  
+
+int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
+ struct kvm_msix_mmio_user 
*mmio_user)
+{
+   struct kvm_msix_mmio mmio;
+
+   mmio.dev_id = mmio_user-dev_id;
+   mmio.type = mmio_user-type;
+
+   return kvm_free_msix_mmio(kvm, mmio);
+}
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel

2011-01-17 Thread Marcelo Tosatti
On Thu, Jan 06, 2011 at 06:19:44PM +0800, Sheng Yang wrote:
 Then we can support mask bit operation of assigned devices now.
 
 Signed-off-by: Sheng Yang sh...@linux.intel.com
 ---
  arch/x86/kvm/Makefile|2 +-
  arch/x86/kvm/x86.c   |8 +-
  include/linux/kvm.h  |   21 
  include/linux/kvm_host.h |   25 
  virt/kvm/assigned-dev.c  |   44 +++
  virt/kvm/kvm_main.c  |   38 ++-
  virt/kvm/msix_mmio.c |  284 
 ++
  virt/kvm/msix_mmio.h |   25 
  8 files changed, 440 insertions(+), 7 deletions(-)
  create mode 100644 virt/kvm/msix_mmio.c
  create mode 100644 virt/kvm/msix_mmio.h
 
 diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
 index ae72ae6..7444dcd 100644
 --- a/virt/kvm/assigned-dev.c
 +++ b/virt/kvm/assigned-dev.c
 @@ -18,6 +18,7 @@
  #include linux/interrupt.h
  #include linux/slab.h
  #include irq.h
 +#include msix_mmio.h
  
  static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct 
 list_head *head,
 int assigned_dev_id)
 @@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm *kvm,
   kvm_deassign_irq(kvm, assigned_dev, assigned_dev-irq_requested_type);
  }
  
 +static void assigned_device_free_msix_mmio(struct kvm *kvm,
 + struct kvm_assigned_dev_kernel *adev)
 +{
 + struct kvm_msix_mmio mmio;
 +
 + mmio.dev_id = adev-assigned_dev_id;
 + mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV |
 + KVM_MSIX_MMIO_TYPE_BASE_TABLE;
 + kvm_free_msix_mmio(kvm, mmio);
 +}
 +
  static void kvm_free_assigned_device(struct kvm *kvm,
struct kvm_assigned_dev_kernel
*assigned_dev)
  {
   kvm_free_assigned_irq(kvm, assigned_dev);
  
 + assigned_device_free_msix_mmio(kvm, assigned_dev);
 +
   __pci_reset_function(assigned_dev-dev);
   pci_restore_state(assigned_dev-dev);
  
 @@ -785,3 +799,33 @@ out:
   return r;
  }
  
 +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
 + int assigned_dev_id, int entry, bool mask)
 +{
 + int r = -EFAULT;
 + struct kvm_assigned_dev_kernel *adev;
 + int i;
 +
 + if (!irqchip_in_kernel(kvm))
 + return r;
 +
 + mutex_lock(kvm-lock);
 + adev = kvm_find_assigned_dev(kvm-arch.assigned_dev_head,
 +   assigned_dev_id);
 + if (!adev)
 + goto out;
 +
 + for (i = 0; i  adev-entries_nr; i++)
 + if (adev-host_msix_entries[i].entry == entry) {
 + if (mask)
 + disable_irq_nosync(
 + adev-host_msix_entries[i].vector);
 + else
 + enable_irq(adev-host_msix_entries[i].vector);
 + r = 0;
 + break;
 + }

Should check if the host irq is registered as MSIX type.

 +out:
 + mutex_unlock(kvm-lock);
 + return r;
 +}
 diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
 index b1b6cbb..b7807c8 100644
 --- a/virt/kvm/kvm_main.c
 +++ b/virt/kvm/kvm_main.c
 @@ -56,6 +56,7 @@
  
  #include coalesced_mmio.h
  #include async_pf.h
 +#include msix_mmio.h
  
  #define CREATE_TRACE_POINTS
  #include trace/events/kvm.h
 @@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
   struct mm_struct *mm = kvm-mm;
  
   kvm_arch_sync_events(kvm);
 + kvm_unregister_msix_mmio_dev(kvm);
   spin_lock(kvm_lock);
   list_del(kvm-vm_list);
   spin_unlock(kvm_lock);
 @@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp,
   mutex_unlock(kvm-lock);
   break;
  #endif
 + case KVM_REGISTER_MSIX_MMIO: {
 + struct kvm_msix_mmio_user mmio_user;
 +
 + r = -EFAULT;
 + if (copy_from_user(mmio_user, argp, sizeof mmio_user))
 + goto out;
 + r = kvm_vm_ioctl_register_msix_mmio(kvm, mmio_user);
 + break;
 + }
 + case KVM_UNREGISTER_MSIX_MMIO: {
 + struct kvm_msix_mmio_user mmio_user;
 +
 + r = -EFAULT;
 + if (copy_from_user(mmio_user, argp, sizeof mmio_user))
 + goto out;
 + r = kvm_vm_ioctl_unregister_msix_mmio(kvm, mmio_user);
 + break;
 + }
   default:
   r = kvm_arch_vm_ioctl(filp, ioctl, arg);
   if (r == -ENOTTY)
 @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void)
   return r;
   }
  #endif
 + r = kvm_register_msix_mmio_dev(kvm);
 + if (r  0) {
 + kvm_put_kvm(kvm);
 + return r;
 + }
 +
   r = anon_inode_getfd(kvm-vm, kvm_vm_fops, kvm, O_RDWR);
   if (r  0)
   kvm_put_kvm(kvm);
 @@ -2223,14 +2249,18 @@ static void 

Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel

2011-01-17 Thread Sheng Yang
On Monday 17 January 2011 19:54:47 Marcelo Tosatti wrote:
 On Thu, Jan 06, 2011 at 06:19:44PM +0800, Sheng Yang wrote:
  Then we can support mask bit operation of assigned devices now.
  
  Signed-off-by: Sheng Yang sh...@linux.intel.com
  ---
  
   arch/x86/kvm/Makefile|2 +-
   arch/x86/kvm/x86.c   |8 +-
   include/linux/kvm.h  |   21 
   include/linux/kvm_host.h |   25 
   virt/kvm/assigned-dev.c  |   44 +++
   virt/kvm/kvm_main.c  |   38 ++-
   virt/kvm/msix_mmio.c |  284
   ++ virt/kvm/msix_mmio.h
   |   25 
   8 files changed, 440 insertions(+), 7 deletions(-)
   create mode 100644 virt/kvm/msix_mmio.c
   create mode 100644 virt/kvm/msix_mmio.h
  
  diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
  index ae72ae6..7444dcd 100644
  --- a/virt/kvm/assigned-dev.c
  +++ b/virt/kvm/assigned-dev.c
  @@ -18,6 +18,7 @@
  
   #include linux/interrupt.h
   #include linux/slab.h
   #include irq.h
  
  +#include msix_mmio.h
  
   static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct
   list_head *head,
   
int assigned_dev_id)
  
  @@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm *kvm,
  
  kvm_deassign_irq(kvm, assigned_dev, assigned_dev-irq_requested_type);
   
   }
  
  +static void assigned_device_free_msix_mmio(struct kvm *kvm,
  +   struct kvm_assigned_dev_kernel *adev)
  +{
  +   struct kvm_msix_mmio mmio;
  +
  +   mmio.dev_id = adev-assigned_dev_id;
  +   mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV |
  +   KVM_MSIX_MMIO_TYPE_BASE_TABLE;
  +   kvm_free_msix_mmio(kvm, mmio);
  +}
  +
  
   static void kvm_free_assigned_device(struct kvm *kvm,
   
   struct kvm_assigned_dev_kernel
   *assigned_dev)
   
   {
   
  kvm_free_assigned_irq(kvm, assigned_dev);
  
  +   assigned_device_free_msix_mmio(kvm, assigned_dev);
  +
  
  __pci_reset_function(assigned_dev-dev);
  pci_restore_state(assigned_dev-dev);
  
  @@ -785,3 +799,33 @@ out:
  return r;
   
   }
  
  +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
  +   int assigned_dev_id, int entry, bool mask)
  +{
  +   int r = -EFAULT;
  +   struct kvm_assigned_dev_kernel *adev;
  +   int i;
  +
  +   if (!irqchip_in_kernel(kvm))
  +   return r;
  +
  +   mutex_lock(kvm-lock);
  +   adev = kvm_find_assigned_dev(kvm-arch.assigned_dev_head,
  + assigned_dev_id);
  +   if (!adev)
  +   goto out;
  +
  +   for (i = 0; i  adev-entries_nr; i++)
  +   if (adev-host_msix_entries[i].entry == entry) {
  +   if (mask)
  +   disable_irq_nosync(
  +   adev-host_msix_entries[i].vector);
  +   else
  +   enable_irq(adev-host_msix_entries[i].vector);
  +   r = 0;
  +   break;
  +   }
 
 Should check if the host irq is registered as MSIX type.
 
  +out:
  +   mutex_unlock(kvm-lock);
  +   return r;
  +}
  diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
  index b1b6cbb..b7807c8 100644
  --- a/virt/kvm/kvm_main.c
  +++ b/virt/kvm/kvm_main.c
  @@ -56,6 +56,7 @@
  
   #include coalesced_mmio.h
   #include async_pf.h
  
  +#include msix_mmio.h
  
   #define CREATE_TRACE_POINTS
   #include trace/events/kvm.h
  
  @@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
  
  struct mm_struct *mm = kvm-mm;
  
  kvm_arch_sync_events(kvm);
  
  +   kvm_unregister_msix_mmio_dev(kvm);
  
  spin_lock(kvm_lock);
  list_del(kvm-vm_list);
  spin_unlock(kvm_lock);
  
  @@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp,
  
  mutex_unlock(kvm-lock);
  break;
   
   #endif
  
  +   case KVM_REGISTER_MSIX_MMIO: {
  +   struct kvm_msix_mmio_user mmio_user;
  +
  +   r = -EFAULT;
  +   if (copy_from_user(mmio_user, argp, sizeof mmio_user))
  +   goto out;
  +   r = kvm_vm_ioctl_register_msix_mmio(kvm, mmio_user);
  +   break;
  +   }
  +   case KVM_UNREGISTER_MSIX_MMIO: {
  +   struct kvm_msix_mmio_user mmio_user;
  +
  +   r = -EFAULT;
  +   if (copy_from_user(mmio_user, argp, sizeof mmio_user))
  +   goto out;
  +   r = kvm_vm_ioctl_unregister_msix_mmio(kvm, mmio_user);
  +   break;
  +   }
  
  default:
  r = kvm_arch_vm_ioctl(filp, ioctl, arg);
  if (r == -ENOTTY)
  
  @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void)
  
  return r;
  
  }
   
   #endif
  
  +   r = kvm_register_msix_mmio_dev(kvm);
  +   if (r  0) {
  +   kvm_put_kvm(kvm);
  +   return r;
  +   }
  +
  
  r = 

Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel

2011-01-17 Thread Avi Kivity

On 01/17/2011 02:18 PM, Sheng Yang wrote:

+
+   if (copy_to_user((void __user *)(entry_base + offset), val, len))
+   goto out;

  Instead of copying to/from userspace (which is subject to swapin,
  unexpected values), you could include the guest written value in a
  kvm_run structure, along with address. Qemu-kvm would use that to
  synchronize its copy of the table, on KVM_EXIT_MSIX_ROUTING_UPDATE exit.

We want to accelerate MSI-X mask bit accessing, which won't exit to userspace in
most conditions. That's the cost we want to optimize. Also it's possible for
userspace to read the correct value of the MMIO (but mostly userspace can't write
to it, in order to prevent synchronization issues).


It's also good to have the values in just one place; using userspace 
makes it easy for both the kernel and userspace to see the values (and 
set them after migration, if/when we extend this to virtio).


--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel

2011-01-17 Thread Avi Kivity

On 01/06/2011 12:19 PM, Sheng Yang wrote:

Then we can support mask bit operation of assigned devices now.



+int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
+   int assigned_dev_id, int entry, bool mask)
+{
+   int r = -EFAULT;
+   struct kvm_assigned_dev_kernel *adev;
+   int i;
+
+   if (!irqchip_in_kernel(kvm))
+   return r;
+
+   mutex_lock(kvm-lock);
+   adev = kvm_find_assigned_dev(kvm-arch.assigned_dev_head,
+ assigned_dev_id);
+   if (!adev)
+   goto out;
+
+   for (i = 0; i  adev-entries_nr; i++)
+   if (adev-host_msix_entries[i].entry == entry) {
+   if (mask)
+   disable_irq_nosync(
+   adev-host_msix_entries[i].vector);


Is it okay to call disable_irq_nosync() here?  IIRC we don't check the 
mask bit on irq delivery, so we may forward an interrupt to the guest 
after the mask bit was set.


What does pci say about the mask bit?  when does it take effect?

Another question is whether disable_irq_nosync() actually programs the 
device mask bit, or not.  If it does, then it's slow, and it may be 
better to leave interrupts enabled but have an internal pending bit.  If 
it doesn't program the mask bit, it's fine.



+   else
+   enable_irq(adev-host_msix_entries[i].vector);
+   r = 0;
+   break;
+   }
+out:
+   mutex_unlock(kvm-lock);
+   return r;
+}

+
+static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int 
len,
+   void *val)
+{
+   struct kvm_msix_mmio_dev *mmio_dev =
+   container_of(this, struct kvm_msix_mmio_dev, table_dev);
+   struct kvm_msix_mmio *mmio;
+   int idx, ret = 0, entry, offset, r;
+
+   mutex_lock(mmio_dev-lock);
+   idx = get_mmio_table_index(mmio_dev, addr, len);
+   if (idx  0) {
+   ret = -EOPNOTSUPP;
+   goto out;
+   }
+   if ((addr  0x3) || (len != 4  len != 8))
+   goto out;
+
+   offset = addr  0xf;
+   if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL  len == 8)
+   goto out;
+
+   mmio =mmio_dev-mmio[idx];
+   entry = (addr - mmio-table_base_addr) / PCI_MSIX_ENTRY_SIZE;
+   r = copy_from_user(val, (void __user *)(mmio-table_base_va +
+   entry * PCI_MSIX_ENTRY_SIZE + offset), len);
+   if (r)
+   goto out;


and return ret == 0?


+out:
+   mutex_unlock(mmio_dev-lock);
+   return ret;
+}
+
+static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr,
+   int len, const void *val)
+{
+   struct kvm_msix_mmio_dev *mmio_dev =
+   container_of(this, struct kvm_msix_mmio_dev, table_dev);
+   struct kvm_msix_mmio *mmio;
+   int idx, entry, offset, ret = 0, r = 0;
+   gpa_t entry_base;
+   u32 old_ctrl, new_ctrl;
+   u32 *ctrl_pos;
+
+   mutex_lock(mmio_dev-lock);
+   idx = get_mmio_table_index(mmio_dev, addr, len);
+   if (idx  0) {
+   ret = -EOPNOTSUPP;
+   goto out;
+   }
+   if ((addr  0x3) || (len != 4  len != 8))
+   goto out;
+
+   offset = addr  0xF;
+   if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL  len == 8)
+   goto out;
+
+   mmio =mmio_dev-mmio[idx];
+   entry = (addr - mmio-table_base_addr) / PCI_MSIX_ENTRY_SIZE;
+   entry_base = mmio-table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
+   ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);
+
+   if (get_user(old_ctrl, ctrl_pos))
+   goto out;
+
+   /* No allow writing to other fields when entry is unmasked */
+   if (!(old_ctrl  PCI_MSIX_ENTRY_CTRL_MASKBIT)
+   offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
+   goto out;
+
+   if (copy_to_user((void __user *)(entry_base + offset), val, len))
+   goto out;
+
+   if (get_user(new_ctrl, ctrl_pos))
+   goto out;


here, too.


+
+   if ((offset  PCI_MSIX_ENTRY_VECTOR_CTRL  len == 4) ||
+   (offset  PCI_MSIX_ENTRY_DATA  len == 8))
+   ret = -ENOTSYNC;


goto out?


+   if (old_ctrl == new_ctrl)
+   goto out;
+   if (!(old_ctrl  PCI_MSIX_ENTRY_CTRL_MASKBIT)
+   (new_ctrl  PCI_MSIX_ENTRY_CTRL_MASKBIT))
+   r = update_msix_mask_bit(mmio_dev-kvm, mmio, entry, 1);
+   else if ((old_ctrl  PCI_MSIX_ENTRY_CTRL_MASKBIT)
+   !(new_ctrl  PCI_MSIX_ENTRY_CTRL_MASKBIT))
+   r = update_msix_mask_bit(mmio_dev-kvm, mmio, entry, 0);
+   if (r || ret)
+   ret = -ENOTSYNC;
+out:
+   mutex_unlock(mmio_dev-lock);
+   return ret;
+}
+


--
error compiling committee.c: too many arguments to 

Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel

2011-01-17 Thread Marcelo Tosatti
On Mon, Jan 17, 2011 at 08:18:22PM +0800, Sheng Yang wrote:
   + goto out;
   +
   + mmio = mmio_dev-mmio[idx];
   + entry = (addr - mmio-table_base_addr) / PCI_MSIX_ENTRY_SIZE;
   + entry_base = mmio-table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
   + ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);
   +
   + if (get_user(old_ctrl, ctrl_pos))
   + goto out;
   +
   + /* No allow writing to other fields when entry is unmasked */
   + if (!(old_ctrl  PCI_MSIX_ENTRY_CTRL_MASKBIT) 
   + offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
   + goto out;
   +
   + if (copy_to_user((void __user *)(entry_base + offset), val, len))
   + goto out;
  
  Instead of copying to/from userspace (which is subject to swapin,
  unexpected values), you could include the guest written value in a
  kvm_run structure, along with address. Qemu-kvm would use that to
  synchronize its copy of the table, on KVM_EXIT_MSIX_ROUTING_UPDATE exit.
 
 We want to acelerate MSI-X mask bit accessing, which won't exit to userspace 
 in 
 the most condition. That's the cost we want to optimize. Also it's possible 
 to 
 userspace to read the correct value of MMIO(but mostly userspace can't write 
 to it 
 in order to prevent synchronize issue).

Yes, i meant exit to userspace only when necessary, but instead of
copying directly everytime record the value the guest has written in
kvm_run and exit with KVM_EXIT_MSIX_ROUTING_UPDATE.

   + if (get_user(new_ctrl, ctrl_pos))
   + goto out;
   +
   + if ((offset  PCI_MSIX_ENTRY_VECTOR_CTRL  len == 4) ||
   + (offset  PCI_MSIX_ENTRY_DATA  len == 8))
   + ret = -ENOTSYNC;
   + if (old_ctrl == new_ctrl)
   + goto out;
   + if (!(old_ctrl  PCI_MSIX_ENTRY_CTRL_MASKBIT) 
   + (new_ctrl  PCI_MSIX_ENTRY_CTRL_MASKBIT))
   + r = update_msix_mask_bit(mmio_dev-kvm, mmio, entry, 1);
   + else if ((old_ctrl  PCI_MSIX_ENTRY_CTRL_MASKBIT) 
   + !(new_ctrl  PCI_MSIX_ENTRY_CTRL_MASKBIT))
   + r = update_msix_mask_bit(mmio_dev-kvm, mmio, entry, 0);
  
  Then you rely on the kernel copy of the values to enable/disable irq.
 
 Yes, they are guest owned assigned IRQs. Any potential issue?

Nothing prevents the kernel from enabling or disabling irq multiple
times with this code (what prevents it is a qemu-kvm that behaves as
expected). This is not very good.

Perhaps the guest can only harm itself with that, but i'm not sure.

And also if an msix table page is swapped out guest will hang.

   + return r;
   +}
  
  This is not consistent with registration, where there are no checks
  regarding validity of assigned device id. So why is it necessary?
 
 I don't quite understand. We need to free mmio anyway, otherwise it would 
 result 
 in wrong mmio interception...

If you return -EOPNOTSUPP in case kvm_find_assigned_dev fails in the
read/write paths, there is no need to free anything.

 
  
  There is a lock ordering problem BTW:
  
  - read/write handlers: dev-lock - kvm-lock
  - vm_ioctl_deassign_device - kvm_free_msix_mmio: kvm-lock - dev-lock
 
 Good catch! Would fix it(and other comments of course).
 
 --
 regards
 Yang, Sheng
 
  
   +
   +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
   +   struct kvm_msix_mmio_user *mmio_user)
   +{
   + struct kvm_msix_mmio mmio;
   +
   + mmio.dev_id = mmio_user-dev_id;
   + mmio.type = mmio_user-type;
   +
   + return kvm_free_msix_mmio(kvm, mmio);
   +}
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel

2011-01-17 Thread Marcelo Tosatti
On Mon, Jan 17, 2011 at 02:18:43PM +0200, Avi Kivity wrote:
 On 01/17/2011 02:18 PM, Sheng Yang wrote:
 +
 +  if (copy_to_user((void __user *)(entry_base + offset), val, 
  len))
 +  goto out;
 
   Instead of copying to/from userspace (which is subject to swapin,
   unexpected values), you could include the guest written value in a
   kvm_run structure, along with address. Qemu-kvm would use that to
   synchronize its copy of the table, on KVM_EXIT_MSIX_ROUTING_UPDATE exit.
 
 We want to accelerate MSI-X mask bit accessing, which won't exit to userspace 
 in
 the most condition. That's the cost we want to optimize. Also it's possible 
 to
 userspace to read the correct value of MMIO(but mostly userspace can't write 
 to it
 in order to prevent synchronize issue).
 
 It's also good to have the values in just one place; using userspace
 makes it easy for both the kernel and userspace to see the values
 (and set them after migration, if/when we extend this to virtio).

Right, thats an advantage, but:

- How can userspace ever synchronize with updates by the kernel 
  to the MSI-X entry?
- Reading/writing to the userspace area must be done carefully,
  values must be validated before used.
- Swapping issue (minor?).


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel

2011-01-17 Thread Avi Kivity

On 01/17/2011 02:48 PM, Marcelo Tosatti wrote:

On Mon, Jan 17, 2011 at 02:18:43PM +0200, Avi Kivity wrote:
  On 01/17/2011 02:18 PM, Sheng Yang wrote:
+
+   if (copy_to_user((void __user *)(entry_base + offset), val, 
len))
+   goto out;
  
 Instead of copying to/from userspace (which is subject to swapin,
 unexpected values), you could include the guest written value in a
 kvm_run structure, along with address. Qemu-kvm would use that to
 synchronize its copy of the table, on KVM_EXIT_MSIX_ROUTING_UPDATE exit.
  
  We want to accelerate MSI-X mask bit accessing, which won't exit to 
userspace in
  the most condition. That's the cost we want to optimize. Also it's possible 
to
  userspace to read the correct value of MMIO(but mostly userspace can't 
write to it
  in order to prevent synchronize issue).

  It's also good to have the values in just one place; using userspace
  makes it easy for both the kernel and userspace to see the values
  (and set them after migration, if/when we extend this to virtio).

Right, thats an advantage, but:

- How can userspace ever synchronize with updates by the kernel
   to the MSI-X entry?


When a value is written by the guest, which kvm cannot handle itself 
(i.e. a change to anything other than the mask bit), we exit with the 
table and entry ids, so userspace can reread them.



- Reading/writing to the userspace area must be done carefully,
   values must be validated before used.


True every time...


- Swapping issue (minor?).


I don't see the issue... just like any part of qemu that may be swapped 
out, blocking the vcpu thread.


--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel

2011-01-17 Thread Michael S. Tsirkin
On Mon, Jan 17, 2011 at 02:29:44PM +0200, Avi Kivity wrote:
 On 01/06/2011 12:19 PM, Sheng Yang wrote:
 Then we can support mask bit operation of assigned devices now.
 
 
 
 +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
 +int assigned_dev_id, int entry, bool mask)
 +{
 +int r = -EFAULT;
 +struct kvm_assigned_dev_kernel *adev;
 +int i;
 +
 +if (!irqchip_in_kernel(kvm))
 +return r;
 +
 +mutex_lock(kvm-lock);
 +adev = kvm_find_assigned_dev(kvm-arch.assigned_dev_head,
 +  assigned_dev_id);
 +if (!adev)
 +goto out;
 +
 +for (i = 0; i  adev-entries_nr; i++)
 +if (adev-host_msix_entries[i].entry == entry) {
 +if (mask)
 +disable_irq_nosync(
 +adev-host_msix_entries[i].vector);
 
 Is it okay to call disable_irq_nosync() here?  IIRC we don't check
 the mask bit on irq delivery, so we may forward an interrupt to the
 guest after the mask bit was set.
 
 What does pci say about the mask bit?  when does it take effect?

It is only guaranteed to take effect on the next read.
In practice, it takes effect earlier and guests
rely on this as an optimization.

 Another question is whether disable_irq_nosync() actually programs
 the device mask bit, or not.  If it does, then it's slow, and it may
 be better to leave interrupts enabled but have an internal pending
 bit.  If it doesn't program the mask bit, it's fine.
 
 +else
 +enable_irq(adev-host_msix_entries[i].vector);
 +r = 0;
 +break;
 +}
 +out:
 +mutex_unlock(kvm-lock);
 +return r;
 +}
 
 +
 +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int 
 len,
 +void *val)
 +{
 +struct kvm_msix_mmio_dev *mmio_dev =
 +container_of(this, struct kvm_msix_mmio_dev, table_dev);
 +struct kvm_msix_mmio *mmio;
 +int idx, ret = 0, entry, offset, r;
 +
 +mutex_lock(mmio_dev-lock);
 +idx = get_mmio_table_index(mmio_dev, addr, len);
 +if (idx  0) {
 +ret = -EOPNOTSUPP;
 +goto out;
 +}
 +if ((addr  0x3) || (len != 4  len != 8))
 +goto out;
 +
 +offset = addr  0xf;
 +if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL  len == 8)
 +goto out;
 +
 +mmio =mmio_dev-mmio[idx];
 +entry = (addr - mmio-table_base_addr) / PCI_MSIX_ENTRY_SIZE;
 +r = copy_from_user(val, (void __user *)(mmio-table_base_va +
 +entry * PCI_MSIX_ENTRY_SIZE + offset), len);
 +if (r)
 +goto out;
 
 and return ret == 0?
 
 +out:
 +mutex_unlock(mmio_dev-lock);
 +return ret;
 +}
 +
 +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr,
 +int len, const void *val)
 +{
 +struct kvm_msix_mmio_dev *mmio_dev =
 +container_of(this, struct kvm_msix_mmio_dev, table_dev);
 +struct kvm_msix_mmio *mmio;
 +int idx, entry, offset, ret = 0, r = 0;
 +gpa_t entry_base;
 +u32 old_ctrl, new_ctrl;
 +u32 *ctrl_pos;
 +
 +mutex_lock(mmio_dev-lock);
 +idx = get_mmio_table_index(mmio_dev, addr, len);
 +if (idx  0) {
 +ret = -EOPNOTSUPP;
 +goto out;
 +}
 +if ((addr  0x3) || (len != 4  len != 8))
 +goto out;
 +
 +offset = addr  0xF;
 +if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL  len == 8)
 +goto out;
 +
 +mmio =mmio_dev-mmio[idx];
 +entry = (addr - mmio-table_base_addr) / PCI_MSIX_ENTRY_SIZE;
 +entry_base = mmio-table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
 +ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL);
 +
 +if (get_user(old_ctrl, ctrl_pos))
 +goto out;
 +
 +/* No allow writing to other fields when entry is unmasked */
 +if (!(old_ctrl  PCI_MSIX_ENTRY_CTRL_MASKBIT)
 +offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
 +goto out;
 +
 +if (copy_to_user((void __user *)(entry_base + offset), val, len))
 +goto out;
 +
 +if (get_user(new_ctrl, ctrl_pos))
 +goto out;
 
 here, too.
 
 +
 +if ((offset  PCI_MSIX_ENTRY_VECTOR_CTRL  len == 4) ||
 +(offset  PCI_MSIX_ENTRY_DATA  len == 8))
 +ret = -ENOTSYNC;
 
 goto out?
 
 +if (old_ctrl == new_ctrl)
 +goto out;
 +if (!(old_ctrl  PCI_MSIX_ENTRY_CTRL_MASKBIT)
 +(new_ctrl  PCI_MSIX_ENTRY_CTRL_MASKBIT))
 +r = update_msix_mask_bit(mmio_dev-kvm, mmio, entry, 1);
 +else if ((old_ctrl  PCI_MSIX_ENTRY_CTRL_MASKBIT)
 +!(new_ctrl  PCI_MSIX_ENTRY_CTRL_MASKBIT))
 +r = update_msix_mask_bit(mmio_dev-kvm, mmio, entry, 0);
 +if (r || ret)
 +ret = -ENOTSYNC;
 +out:
 +mutex_unlock(mmio_dev-lock);
 +return ret;
 +}
 +
 
 -- 
 error 

Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel

2011-01-17 Thread Jan Kiszka
On 2011-01-17 13:29, Avi Kivity wrote:
 On 01/06/2011 12:19 PM, Sheng Yang wrote:
 Then we can support mask bit operation of assigned devices now.



 +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
 +int assigned_dev_id, int entry, bool mask)
 +{
 +int r = -EFAULT;
 +struct kvm_assigned_dev_kernel *adev;
 +int i;
 +
 +if (!irqchip_in_kernel(kvm))
 +return r;
 +
 +mutex_lock(kvm-lock);
 +adev = kvm_find_assigned_dev(kvm-arch.assigned_dev_head,
 +  assigned_dev_id);
 +if (!adev)
 +goto out;
 +
 +for (i = 0; i  adev-entries_nr; i++)
 +if (adev-host_msix_entries[i].entry == entry) {
 +if (mask)
 +disable_irq_nosync(
 +adev-host_msix_entries[i].vector);
 
 Is it okay to call disable_irq_nosync() here?  IIRC we don't check the
 mask bit on irq delivery, so we may forward an interrupt to the guest
 after the mask bit was set.
 
 What does pci say about the mask bit?  when does it take effect?
 
 Another question is whether disable_irq_nosync() actually programs the
 device mask bit, or not.  If it does, then it's slow, and it may be
 better to leave interrupts enabled but have an internal pending bit.  If
 it doesn't program the mask bit, it's fine.

disable_irq* works lazily, only disabling the line in software. If an
IRQ actually occurs, it is marked pending and the line is left masked on
handler return (that's one reason why masking via PCI config space is
slower at macro level).

Jan

-- 
Siemens AG, Corporate Technology, CT T DE IT 1
Corporate Competence Center Embedded Linux
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel

2011-01-17 Thread Marcelo Tosatti
On Mon, Jan 17, 2011 at 02:51:37PM +0200, Avi Kivity wrote:
 On 01/17/2011 02:48 PM, Marcelo Tosatti wrote:
 On Mon, Jan 17, 2011 at 02:18:43PM +0200, Avi Kivity wrote:
   On 01/17/2011 02:18 PM, Sheng Yang wrote:
 +
 +if (copy_to_user((void __user *)(entry_base + offset), 
  val, len))
 +goto out;
   
  Instead of copying to/from userspace (which is subject to swapin,
  unexpected values), you could include the guest written value in a
  kvm_run structure, along with address. Qemu-kvm would use that to
  synchronize its copy of the table, on KVM_EXIT_MSIX_ROUTING_UPDATE 
  exit.
   
   We want to acelerate MSI-X mask bit accessing, which won't exit to 
  userspace in
   the most condition. That's the cost we want to optimize. Also it's 
  possible to
   userspace to read the correct value of MMIO(but mostly userspace can't 
  write to it
   in order to prevent synchronize issue).
 
   It's also good to have the values in just one place; using userspace
   makes it easy for both the kernel and userspace to see the values
   (and set them after migration, if/when we extend this to virtio).
 
 Right, thats an advantage, but:
 
 - How can userspace ever synchronize with updates by the kernel
to the MSI-X entry?
 
  When a value is written by the guest, which kvm cannot handle itself
 (i.e. a change to anything other than the mask bit), we exit with
 the table and entry ids, so userspace can reread them.

OK. But regarding access to the MSI-X entry in userspace, it can 
only be accessed safely wrt parallel updates by the kernel in the
exit handler.

Is the exit handler the only location where the MSI-X entry will be
read or written to, in userspace?

 - Reading/writing to the userspace area must be done carefully,
values must be validated before used.
 
 True every time...
 
 - Swapping issue (minor?).
 
 I don't see the issue... just like any part of qemu that may be
 swapped out, blocking the vcpu thread.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel

2011-01-17 Thread Avi Kivity

On 01/17/2011 05:52 PM, Marcelo Tosatti wrote:


  What a value is written by the guest, which kvm cannot handle itself
  (i.e. a change to anything other than the mask bit), we exit with
  the table and entry ids, so userspace can reread them.

OK. But regarding access to the MSI-X entry in userspace, it can
only be accessed safely wrt parallel updates by the kernel in the
exit handler.

Is the exit handler the only location where the MSI-X entry will be
read or written to, in userspace?



Yes.  This should be documented, that the kernel owns the msix area as 
far as writes are concerned.



--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/3] KVM: Emulate MSI-X table in kernel

2011-01-06 Thread Sheng Yang
Then we can support mask bit operation of assigned devices now.

Signed-off-by: Sheng Yang sh...@linux.intel.com
---
 arch/x86/kvm/Makefile|2 +-
 arch/x86/kvm/x86.c   |8 +-
 include/linux/kvm.h  |   21 
 include/linux/kvm_host.h |   25 
 virt/kvm/assigned-dev.c  |   44 +++
 virt/kvm/kvm_main.c  |   38 ++-
 virt/kvm/msix_mmio.c |  284 ++
 virt/kvm/msix_mmio.h |   25 
 8 files changed, 440 insertions(+), 7 deletions(-)
 create mode 100644 virt/kvm/msix_mmio.c
 create mode 100644 virt/kvm/msix_mmio.h

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index f15501f..3a0d851 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.
 
 kvm-y  += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
coalesced_mmio.o irq_comm.o eventfd.o \
-   assigned-dev.o)
+   assigned-dev.o msix_mmio.o)
 kvm-$(CONFIG_IOMMU_API)+= $(addprefix ../../../virt/kvm/, iommu.o)
 kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o)
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fa708c9..89bf12c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_X86_ROBUST_SINGLESTEP:
case KVM_CAP_XSAVE:
case KVM_CAP_ASYNC_PF:
+   case KVM_CAP_MSIX_MMIO:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -3807,6 +3808,7 @@ static int emulator_write_emulated_onepage(unsigned long 
addr,
   struct kvm_vcpu *vcpu)
 {
gpa_t gpa;
+   int r;
 
gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
 
@@ -3822,14 +3824,16 @@ static int emulator_write_emulated_onepage(unsigned 
long addr,
 
 mmio:
trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
+   r = vcpu_mmio_write(vcpu, gpa, bytes, val);
/*
 * Is this MMIO handled locally?
 */
-   if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
+   if (!r)
return X86EMUL_CONTINUE;
 
vcpu-mmio_needed = 1;
-   vcpu-run-exit_reason = KVM_EXIT_MMIO;
+   vcpu-run-exit_reason = (r == -ENOTSYNC) ?
+   KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO;
vcpu-run-mmio.phys_addr = vcpu-mmio_phys_addr = gpa;
vcpu-run-mmio.len = vcpu-mmio_size = bytes;
vcpu-run-mmio.is_write = vcpu-mmio_is_write = 1;
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index ea2dc1a..ad9df4b 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -161,6 +161,7 @@ struct kvm_pit_config {
 #define KVM_EXIT_NMI  16
 #define KVM_EXIT_INTERNAL_ERROR   17
 #define KVM_EXIT_OSI  18
+#define KVM_EXIT_MSIX_ROUTING_UPDATE 19
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 #define KVM_INTERNAL_ERROR_EMULATION 1
@@ -541,6 +542,7 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_PPC_GET_PVINFO 57
 #define KVM_CAP_PPC_IRQ_LEVEL 58
 #define KVM_CAP_ASYNC_PF 59
+#define KVM_CAP_MSIX_MMIO 60
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -672,6 +674,9 @@ struct kvm_clock_data {
 #define KVM_XEN_HVM_CONFIG_IOW(KVMIO,  0x7a, struct kvm_xen_hvm_config)
 #define KVM_SET_CLOCK _IOW(KVMIO,  0x7b, struct kvm_clock_data)
 #define KVM_GET_CLOCK _IOR(KVMIO,  0x7c, struct kvm_clock_data)
+/* Available with KVM_CAP_MSIX_MMIO */
+#define KVM_REGISTER_MSIX_MMIO_IOW(KVMIO,  0x7d, struct kvm_msix_mmio_user)
+#define KVM_UNREGISTER_MSIX_MMIO  _IOW(KVMIO,  0x7e, struct kvm_msix_mmio_user)
 /* Available with KVM_CAP_PIT_STATE2 */
 #define KVM_GET_PIT2  _IOR(KVMIO,  0x9f, struct kvm_pit_state2)
 #define KVM_SET_PIT2  _IOW(KVMIO,  0xa0, struct kvm_pit_state2)
@@ -795,4 +800,20 @@ struct kvm_assigned_msix_entry {
__u16 padding[3];
 };
 
+#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV(1  0)
+
+#define KVM_MSIX_MMIO_TYPE_BASE_TABLE  (1  8)
+
+#define KVM_MSIX_MMIO_TYPE_DEV_MASK0x00ff
+#define KVM_MSIX_MMIO_TYPE_BASE_MASK   0xff00
+struct kvm_msix_mmio_user {
+   __u32 dev_id;
+   __u16 type;
+   __u16 max_entries_nr;
+   __u64 base_addr;
+   __u64 base_va;
+   __u64 flags;
+   __u64 reserved[4];
+};
+
 #endif /* __LINUX_KVM_H */
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 7d313e0..c10670c 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -233,6 +233,27 @@ struct kvm_memslots {
KVM_PRIVATE_MEM_SLOTS];
 };
 
+#define KVM_MSIX_MMIO_MAX32
+
+struct kvm_msix_mmio {
+   u32 dev_id;
+   u16 type;
+   u16 max_entries_nr;
+   u64 flags;
+   gpa_t table_base_addr;
+   hva_t table_base_va;
+   gpa_t pba_base_addr;
+   hva_t