Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
On Fri, Feb 25, 2011 at 02:12:42PM +0800, Sheng Yang wrote: On Thursday 24 February 2011 18:17:34 Michael S. Tsirkin wrote: On Thu, Feb 24, 2011 at 05:44:20PM +0800, Sheng Yang wrote: On Wednesday 23 February 2011 16:45:37 Michael S. Tsirkin wrote: On Wed, Feb 23, 2011 at 02:59:04PM +0800, Sheng Yang wrote: On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote: On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote: Then we can support mask bit operation of assigned devices now. Looks pretty good overall. A few comments below. It seems like we should be able to hook this into vfio with a small stub in kvm. We just need to be able to communicate disabling and enabling of individual msix vectors. For brute force, we could do this with a lot of eventfds, triggered by kvm and consumed by vfio, two per MSI-X vector. Not sure if there's something smaller that could do it. Thanks, Alex, thanks for your comments. See my comments below: Alex Signed-off-by: Sheng Yang sh...@linux.intel.com --- arch/x86/kvm/Makefile|2 +- arch/x86/kvm/x86.c |8 +- include/linux/kvm.h | 21 include/linux/kvm_host.h | 25 virt/kvm/assigned-dev.c | 44 +++ virt/kvm/kvm_main.c | 38 ++- virt/kvm/msix_mmio.c | 286 ++ virt/kvm/msix_mmio.h | 25 8 files changed, 442 insertions(+), 7 deletions(-) create mode 100644 virt/kvm/msix_mmio.c create mode 100644 virt/kvm/msix_mmio.h diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index f15501f..3a0d851 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I. 
kvm-y+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ coalesced_mmio.o irq_comm.o eventfd.o \ - assigned-dev.o) + assigned-dev.o msix_mmio.o) kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fa708c9..89bf12c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_X86_ROBUST_SINGLESTEP: case KVM_CAP_XSAVE: case KVM_CAP_ASYNC_PF: + case KVM_CAP_MSIX_MMIO: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -3807,6 +3808,7 @@ static int emulator_write_emulated_onepage(unsigned long addr, struct kvm_vcpu *vcpu) { gpa_t gpa; + int r; gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); @@ -3822,14 +3824,16 @@ static int emulator_write_emulated_onepage(unsigned long addr, mmio: trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); + r = vcpu_mmio_write(vcpu, gpa, bytes, val); /* * Is this MMIO handled locally? */ - if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) + if (!r) return X86EMUL_CONTINUE; vcpu-mmio_needed = 1; - vcpu-run-exit_reason = KVM_EXIT_MMIO; + vcpu-run-exit_reason = (r == -ENOTSYNC) ? + KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO; This use of -ENOTSYNC is IMO confusing. How about we make vcpu_mmio_write return the positive exit reason? Negative value will mean an error. In fact currently nagative value means something more need to be done, the same as MMIO exit. So it would be if (!r) return X86EMUL_CONTINUE; vcpu-run-exit_reason = r; Now I think we can keep it, or update them all later. The way to do this would be 1. patch to return KVM_EXIT_MMIO on mmio 2. your patch that returns KVM_EXIT_MSIX_ROUTING_UPDATE on top It's not that straightforward. In most condition, the reason vcpu_mmio_write() 0 because KVM itself unable to complete the request. That's quite straightforward. 
But each handler in the chain can't decide that it would be KVM_EXIT_MMIO; they can only know when all of them fail to handle the access. Well, this just violates the standard API for return value. The standard semantics are: - negative - error - positive - not an error So you can just return a
Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
On Wednesday 23 February 2011 16:45:37 Michael S. Tsirkin wrote: On Wed, Feb 23, 2011 at 02:59:04PM +0800, Sheng Yang wrote: On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote: On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote: Then we can support mask bit operation of assigned devices now. Looks pretty good overall. A few comments below. It seems like we should be able to hook this into vfio with a small stub in kvm. We just need to be able to communicate disabling and enabling of individual msix vectors. For brute force, we could do this with a lot of eventfds, triggered by kvm and consumed by vfio, two per MSI-X vector. Not sure if there's something smaller that could do it. Thanks, Alex, thanks for your comments. See my comments below: Alex Signed-off-by: Sheng Yang sh...@linux.intel.com --- arch/x86/kvm/Makefile|2 +- arch/x86/kvm/x86.c |8 +- include/linux/kvm.h | 21 include/linux/kvm_host.h | 25 virt/kvm/assigned-dev.c | 44 +++ virt/kvm/kvm_main.c | 38 ++- virt/kvm/msix_mmio.c | 286 ++ virt/kvm/msix_mmio.h | 25 8 files changed, 442 insertions(+), 7 deletions(-) create mode 100644 virt/kvm/msix_mmio.c create mode 100644 virt/kvm/msix_mmio.h diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index f15501f..3a0d851 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I. 
kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ coalesced_mmio.o irq_comm.o eventfd.o \ - assigned-dev.o) + assigned-dev.o msix_mmio.o) kvm-$(CONFIG_IOMMU_API)+= $(addprefix ../../../virt/kvm/, iommu.o) kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fa708c9..89bf12c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_X86_ROBUST_SINGLESTEP: case KVM_CAP_XSAVE: case KVM_CAP_ASYNC_PF: + case KVM_CAP_MSIX_MMIO: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -3807,6 +3808,7 @@ static int emulator_write_emulated_onepage(unsigned long addr, struct kvm_vcpu *vcpu) { gpa_t gpa; + int r; gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); @@ -3822,14 +3824,16 @@ static int emulator_write_emulated_onepage(unsigned long addr, mmio: trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); + r = vcpu_mmio_write(vcpu, gpa, bytes, val); /* * Is this MMIO handled locally? */ - if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) + if (!r) return X86EMUL_CONTINUE; vcpu-mmio_needed = 1; - vcpu-run-exit_reason = KVM_EXIT_MMIO; + vcpu-run-exit_reason = (r == -ENOTSYNC) ? + KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO; This use of -ENOTSYNC is IMO confusing. How about we make vcpu_mmio_write return the positive exit reason? Negative value will mean an error. Make sense. I would update it. 
vcpu-run-mmio.phys_addr = vcpu-mmio_phys_addr = gpa; vcpu-run-mmio.len = vcpu-mmio_size = bytes; vcpu-run-mmio.is_write = vcpu-mmio_is_write = 1; diff --git a/include/linux/kvm.h b/include/linux/kvm.h index ea2dc1a..ad9df4b 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -161,6 +161,7 @@ struct kvm_pit_config { #define KVM_EXIT_NMI 16 #define KVM_EXIT_INTERNAL_ERROR 17 #define KVM_EXIT_OSI 18 +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19 /* For KVM_EXIT_INTERNAL_ERROR */ #define KVM_INTERNAL_ERROR_EMULATION 1 @@ -541,6 +542,7 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_PPC_GET_PVINFO 57 #define KVM_CAP_PPC_IRQ_LEVEL 58 #define KVM_CAP_ASYNC_PF 59 +#define KVM_CAP_MSIX_MMIO 60 #ifdef KVM_CAP_IRQ_ROUTING @@ -672,6 +674,9 @@ struct kvm_clock_data { #define KVM_XEN_HVM_CONFIG_IOW(KVMIO, 0x7a, struct kvm_xen_hvm_config) #define KVM_SET_CLOCK _IOW(KVMIO, 0x7b, struct kvm_clock_data) #define KVM_GET_CLOCK _IOR(KVMIO, 0x7c, struct
Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
On Wednesday 23 February 2011 16:45:37 Michael S. Tsirkin wrote: On Wed, Feb 23, 2011 at 02:59:04PM +0800, Sheng Yang wrote: On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote: On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote: Then we can support mask bit operation of assigned devices now. Looks pretty good overall. A few comments below. It seems like we should be able to hook this into vfio with a small stub in kvm. We just need to be able to communicate disabling and enabling of individual msix vectors. For brute force, we could do this with a lot of eventfds, triggered by kvm and consumed by vfio, two per MSI-X vector. Not sure if there's something smaller that could do it. Thanks, Alex, thanks for your comments. See my comments below: Alex Signed-off-by: Sheng Yang sh...@linux.intel.com --- arch/x86/kvm/Makefile|2 +- arch/x86/kvm/x86.c |8 +- include/linux/kvm.h | 21 include/linux/kvm_host.h | 25 virt/kvm/assigned-dev.c | 44 +++ virt/kvm/kvm_main.c | 38 ++- virt/kvm/msix_mmio.c | 286 ++ virt/kvm/msix_mmio.h | 25 8 files changed, 442 insertions(+), 7 deletions(-) create mode 100644 virt/kvm/msix_mmio.c create mode 100644 virt/kvm/msix_mmio.h diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index f15501f..3a0d851 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I. 
kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ coalesced_mmio.o irq_comm.o eventfd.o \ - assigned-dev.o) + assigned-dev.o msix_mmio.o) kvm-$(CONFIG_IOMMU_API)+= $(addprefix ../../../virt/kvm/, iommu.o) kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fa708c9..89bf12c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_X86_ROBUST_SINGLESTEP: case KVM_CAP_XSAVE: case KVM_CAP_ASYNC_PF: + case KVM_CAP_MSIX_MMIO: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -3807,6 +3808,7 @@ static int emulator_write_emulated_onepage(unsigned long addr, struct kvm_vcpu *vcpu) { gpa_t gpa; + int r; gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); @@ -3822,14 +3824,16 @@ static int emulator_write_emulated_onepage(unsigned long addr, mmio: trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); + r = vcpu_mmio_write(vcpu, gpa, bytes, val); /* * Is this MMIO handled locally? */ - if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) + if (!r) return X86EMUL_CONTINUE; vcpu-mmio_needed = 1; - vcpu-run-exit_reason = KVM_EXIT_MMIO; + vcpu-run-exit_reason = (r == -ENOTSYNC) ? + KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO; This use of -ENOTSYNC is IMO confusing. How about we make vcpu_mmio_write return the positive exit reason? Negative value will mean an error. In fact currently nagative value means something more need to be done, the same as MMIO exit. Now I think we can keep it, or update them all later. -- regards Yang, Sheng -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
On Thu, Feb 24, 2011 at 04:08:22PM +0800, Sheng Yang wrote: On Wednesday 23 February 2011 16:45:37 Michael S. Tsirkin wrote: On Wed, Feb 23, 2011 at 02:59:04PM +0800, Sheng Yang wrote: On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote: On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote: Then we can support mask bit operation of assigned devices now. Looks pretty good overall. A few comments below. It seems like we should be able to hook this into vfio with a small stub in kvm. We just need to be able to communicate disabling and enabling of individual msix vectors. For brute force, we could do this with a lot of eventfds, triggered by kvm and consumed by vfio, two per MSI-X vector. Not sure if there's something smaller that could do it. Thanks, Alex, thanks for your comments. See my comments below: Alex Signed-off-by: Sheng Yang sh...@linux.intel.com --- arch/x86/kvm/Makefile|2 +- arch/x86/kvm/x86.c |8 +- include/linux/kvm.h | 21 include/linux/kvm_host.h | 25 virt/kvm/assigned-dev.c | 44 +++ virt/kvm/kvm_main.c | 38 ++- virt/kvm/msix_mmio.c | 286 ++ virt/kvm/msix_mmio.h | 25 8 files changed, 442 insertions(+), 7 deletions(-) create mode 100644 virt/kvm/msix_mmio.c create mode 100644 virt/kvm/msix_mmio.h diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index f15501f..3a0d851 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I. 
kvm-y+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ coalesced_mmio.o irq_comm.o eventfd.o \ - assigned-dev.o) + assigned-dev.o msix_mmio.o) kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fa708c9..89bf12c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_X86_ROBUST_SINGLESTEP: case KVM_CAP_XSAVE: case KVM_CAP_ASYNC_PF: + case KVM_CAP_MSIX_MMIO: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -3807,6 +3808,7 @@ static int emulator_write_emulated_onepage(unsigned long addr, struct kvm_vcpu *vcpu) { gpa_t gpa; + int r; gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); @@ -3822,14 +3824,16 @@ static int emulator_write_emulated_onepage(unsigned long addr, mmio: trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); + r = vcpu_mmio_write(vcpu, gpa, bytes, val); /* * Is this MMIO handled locally? */ - if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) + if (!r) return X86EMUL_CONTINUE; vcpu-mmio_needed = 1; - vcpu-run-exit_reason = KVM_EXIT_MMIO; + vcpu-run-exit_reason = (r == -ENOTSYNC) ? + KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO; This use of -ENOTSYNC is IMO confusing. How about we make vcpu_mmio_write return the positive exit reason? Negative value will mean an error. Make sense. I would update it. 
vcpu-run-mmio.phys_addr = vcpu-mmio_phys_addr = gpa; vcpu-run-mmio.len = vcpu-mmio_size = bytes; vcpu-run-mmio.is_write = vcpu-mmio_is_write = 1; diff --git a/include/linux/kvm.h b/include/linux/kvm.h index ea2dc1a..ad9df4b 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -161,6 +161,7 @@ struct kvm_pit_config { #define KVM_EXIT_NMI 16 #define KVM_EXIT_INTERNAL_ERROR 17 #define KVM_EXIT_OSI 18 +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19 /* For KVM_EXIT_INTERNAL_ERROR */ #define KVM_INTERNAL_ERROR_EMULATION 1 @@ -541,6 +542,7 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_PPC_GET_PVINFO 57 #define KVM_CAP_PPC_IRQ_LEVEL 58 #define KVM_CAP_ASYNC_PF 59 +#define KVM_CAP_MSIX_MMIO 60 #ifdef KVM_CAP_IRQ_ROUTING @@ -672,6 +674,9 @@ struct kvm_clock_data { #define KVM_XEN_HVM_CONFIG_IOW(KVMIO, 0x7a, struct kvm_xen_hvm_config)
Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
On Thu, Feb 24, 2011 at 05:44:20PM +0800, Sheng Yang wrote: On Wednesday 23 February 2011 16:45:37 Michael S. Tsirkin wrote: On Wed, Feb 23, 2011 at 02:59:04PM +0800, Sheng Yang wrote: On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote: On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote: Then we can support mask bit operation of assigned devices now. Looks pretty good overall. A few comments below. It seems like we should be able to hook this into vfio with a small stub in kvm. We just need to be able to communicate disabling and enabling of individual msix vectors. For brute force, we could do this with a lot of eventfds, triggered by kvm and consumed by vfio, two per MSI-X vector. Not sure if there's something smaller that could do it. Thanks, Alex, thanks for your comments. See my comments below: Alex Signed-off-by: Sheng Yang sh...@linux.intel.com --- arch/x86/kvm/Makefile|2 +- arch/x86/kvm/x86.c |8 +- include/linux/kvm.h | 21 include/linux/kvm_host.h | 25 virt/kvm/assigned-dev.c | 44 +++ virt/kvm/kvm_main.c | 38 ++- virt/kvm/msix_mmio.c | 286 ++ virt/kvm/msix_mmio.h | 25 8 files changed, 442 insertions(+), 7 deletions(-) create mode 100644 virt/kvm/msix_mmio.c create mode 100644 virt/kvm/msix_mmio.h diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index f15501f..3a0d851 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I. 
kvm-y+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ coalesced_mmio.o irq_comm.o eventfd.o \ - assigned-dev.o) + assigned-dev.o msix_mmio.o) kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fa708c9..89bf12c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_X86_ROBUST_SINGLESTEP: case KVM_CAP_XSAVE: case KVM_CAP_ASYNC_PF: + case KVM_CAP_MSIX_MMIO: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -3807,6 +3808,7 @@ static int emulator_write_emulated_onepage(unsigned long addr, struct kvm_vcpu *vcpu) { gpa_t gpa; + int r; gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); @@ -3822,14 +3824,16 @@ static int emulator_write_emulated_onepage(unsigned long addr, mmio: trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); + r = vcpu_mmio_write(vcpu, gpa, bytes, val); /* * Is this MMIO handled locally? */ - if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) + if (!r) return X86EMUL_CONTINUE; vcpu-mmio_needed = 1; - vcpu-run-exit_reason = KVM_EXIT_MMIO; + vcpu-run-exit_reason = (r == -ENOTSYNC) ? + KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO; This use of -ENOTSYNC is IMO confusing. How about we make vcpu_mmio_write return the positive exit reason? Negative value will mean an error. In fact currently nagative value means something more need to be done, the same as MMIO exit. So it would be if (!r) return X86EMUL_CONTINUE; vcpu-run-exit_reason = r; Now I think we can keep it, or update them all later. The way to do this would be 1. patch to return KVM_EXIT_MMIO on mmio 2. 
your patch that returns KVM_EXIT_MSIX_ROUTING_UPDATE on top -- regards Yang, Sheng -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
On Thursday 24 February 2011 18:11:44 Michael S. Tsirkin wrote: On Thu, Feb 24, 2011 at 04:08:22PM +0800, Sheng Yang wrote: On Wednesday 23 February 2011 16:45:37 Michael S. Tsirkin wrote: On Wed, Feb 23, 2011 at 02:59:04PM +0800, Sheng Yang wrote: On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote: On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote: Then we can support mask bit operation of assigned devices now. Looks pretty good overall. A few comments below. It seems like we should be able to hook this into vfio with a small stub in kvm. We just need to be able to communicate disabling and enabling of individual msix vectors. For brute force, we could do this with a lot of eventfds, triggered by kvm and consumed by vfio, two per MSI-X vector. Not sure if there's something smaller that could do it. Thanks, Alex, thanks for your comments. See my comments below: Alex Signed-off-by: Sheng Yang sh...@linux.intel.com --- arch/x86/kvm/Makefile|2 +- arch/x86/kvm/x86.c |8 +- include/linux/kvm.h | 21 include/linux/kvm_host.h | 25 virt/kvm/assigned-dev.c | 44 +++ virt/kvm/kvm_main.c | 38 ++- virt/kvm/msix_mmio.c | 286 ++ virt/kvm/msix_mmio.h | 25 8 files changed, 442 insertions(+), 7 deletions(-) create mode 100644 virt/kvm/msix_mmio.c create mode 100644 virt/kvm/msix_mmio.h diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index f15501f..3a0d851 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I. 
kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ coalesced_mmio.o irq_comm.o eventfd.o \ - assigned-dev.o) + assigned-dev.o msix_mmio.o) kvm-$(CONFIG_IOMMU_API)+= $(addprefix ../../../virt/kvm/, iommu.o) kvm-$(CONFIG_KVM_ASYNC_PF)+= $(addprefix ../../../virt/kvm/, async_pf.o) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fa708c9..89bf12c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_X86_ROBUST_SINGLESTEP: case KVM_CAP_XSAVE: case KVM_CAP_ASYNC_PF: + case KVM_CAP_MSIX_MMIO: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -3807,6 +3808,7 @@ static int emulator_write_emulated_onepage(unsigned long addr, struct kvm_vcpu *vcpu) { gpa_t gpa; + int r; gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); @@ -3822,14 +3824,16 @@ static int emulator_write_emulated_onepage(unsigned long addr, mmio: trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); + r = vcpu_mmio_write(vcpu, gpa, bytes, val); /* * Is this MMIO handled locally? */ - if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) + if (!r) return X86EMUL_CONTINUE; vcpu-mmio_needed = 1; - vcpu-run-exit_reason = KVM_EXIT_MMIO; + vcpu-run-exit_reason = (r == -ENOTSYNC) ? + KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO; This use of -ENOTSYNC is IMO confusing. How about we make vcpu_mmio_write return the positive exit reason? Negative value will mean an error. Make sense. I would update it. 
vcpu-run-mmio.phys_addr = vcpu-mmio_phys_addr = gpa; vcpu-run-mmio.len = vcpu-mmio_size = bytes; vcpu-run-mmio.is_write = vcpu-mmio_is_write = 1; diff --git a/include/linux/kvm.h b/include/linux/kvm.h index ea2dc1a..ad9df4b 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -161,6 +161,7 @@ struct kvm_pit_config { #define KVM_EXIT_NMI 16 #define KVM_EXIT_INTERNAL_ERROR 17 #define KVM_EXIT_OSI 18 +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19 /* For KVM_EXIT_INTERNAL_ERROR */ #define KVM_INTERNAL_ERROR_EMULATION 1 @@ -541,6 +542,7 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_PPC_GET_PVINFO 57 #define KVM_CAP_PPC_IRQ_LEVEL 58 #define KVM_CAP_ASYNC_PF 59 +#define KVM_CAP_MSIX_MMIO 60 #ifdef KVM_CAP_IRQ_ROUTING @@
Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
On Thursday 24 February 2011 18:17:34 Michael S. Tsirkin wrote: On Thu, Feb 24, 2011 at 05:44:20PM +0800, Sheng Yang wrote: On Wednesday 23 February 2011 16:45:37 Michael S. Tsirkin wrote: On Wed, Feb 23, 2011 at 02:59:04PM +0800, Sheng Yang wrote: On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote: On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote: Then we can support mask bit operation of assigned devices now. Looks pretty good overall. A few comments below. It seems like we should be able to hook this into vfio with a small stub in kvm. We just need to be able to communicate disabling and enabling of individual msix vectors. For brute force, we could do this with a lot of eventfds, triggered by kvm and consumed by vfio, two per MSI-X vector. Not sure if there's something smaller that could do it. Thanks, Alex, thanks for your comments. See my comments below: Alex Signed-off-by: Sheng Yang sh...@linux.intel.com --- arch/x86/kvm/Makefile|2 +- arch/x86/kvm/x86.c |8 +- include/linux/kvm.h | 21 include/linux/kvm_host.h | 25 virt/kvm/assigned-dev.c | 44 +++ virt/kvm/kvm_main.c | 38 ++- virt/kvm/msix_mmio.c | 286 ++ virt/kvm/msix_mmio.h | 25 8 files changed, 442 insertions(+), 7 deletions(-) create mode 100644 virt/kvm/msix_mmio.c create mode 100644 virt/kvm/msix_mmio.h diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index f15501f..3a0d851 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I. 
kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ coalesced_mmio.o irq_comm.o eventfd.o \ - assigned-dev.o) + assigned-dev.o msix_mmio.o) kvm-$(CONFIG_IOMMU_API)+= $(addprefix ../../../virt/kvm/, iommu.o) kvm-$(CONFIG_KVM_ASYNC_PF)+= $(addprefix ../../../virt/kvm/, async_pf.o) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fa708c9..89bf12c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_X86_ROBUST_SINGLESTEP: case KVM_CAP_XSAVE: case KVM_CAP_ASYNC_PF: + case KVM_CAP_MSIX_MMIO: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -3807,6 +3808,7 @@ static int emulator_write_emulated_onepage(unsigned long addr, struct kvm_vcpu *vcpu) { gpa_t gpa; + int r; gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); @@ -3822,14 +3824,16 @@ static int emulator_write_emulated_onepage(unsigned long addr, mmio: trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); + r = vcpu_mmio_write(vcpu, gpa, bytes, val); /* * Is this MMIO handled locally? */ - if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) + if (!r) return X86EMUL_CONTINUE; vcpu-mmio_needed = 1; - vcpu-run-exit_reason = KVM_EXIT_MMIO; + vcpu-run-exit_reason = (r == -ENOTSYNC) ? + KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO; This use of -ENOTSYNC is IMO confusing. How about we make vcpu_mmio_write return the positive exit reason? Negative value will mean an error. In fact currently nagative value means something more need to be done, the same as MMIO exit. So it would be if (!r) return X86EMUL_CONTINUE; vcpu-run-exit_reason = r; Now I think we can keep it, or update them all later. The way to do this would be 1. patch to return KVM_EXIT_MMIO on mmio 2. your patch that returns KVM_EXIT_MSIX_ROUTING_UPDATE on top It's not that straightforward. In most condition, the reason vcpu_mmio_write() 0 because KVM itself unable to complete the request. That's quite straightforward. 
But each handler in the chain can't decide that it would be KVM_EXIT_MMIO; they can only know when all of them fail to handle the access. I am not sure if we want every single handler saying I want KVM_EXIT_MMIO instead of an error return. We can discuss more on this, but since it's not an API/ABI change, I think we can get the patch in first. -- regards Yang, Sheng -- regards Yang, Sheng -- To unsubscribe from
Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
On Wed, Feb 23, 2011 at 02:59:04PM +0800, Sheng Yang wrote: On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote: On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote: Then we can support mask bit operation of assigned devices now. Looks pretty good overall. A few comments below. It seems like we should be able to hook this into vfio with a small stub in kvm. We just need to be able to communicate disabling and enabling of individual msix vectors. For brute force, we could do this with a lot of eventfds, triggered by kvm and consumed by vfio, two per MSI-X vector. Not sure if there's something smaller that could do it. Thanks, Alex, thanks for your comments. See my comments below: Alex Signed-off-by: Sheng Yang sh...@linux.intel.com --- arch/x86/kvm/Makefile|2 +- arch/x86/kvm/x86.c |8 +- include/linux/kvm.h | 21 include/linux/kvm_host.h | 25 virt/kvm/assigned-dev.c | 44 +++ virt/kvm/kvm_main.c | 38 ++- virt/kvm/msix_mmio.c | 286 ++ virt/kvm/msix_mmio.h | 25 8 files changed, 442 insertions(+), 7 deletions(-) create mode 100644 virt/kvm/msix_mmio.c create mode 100644 virt/kvm/msix_mmio.h diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index f15501f..3a0d851 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I. 
kvm-y+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ coalesced_mmio.o irq_comm.o eventfd.o \ - assigned-dev.o) + assigned-dev.o msix_mmio.o) kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fa708c9..89bf12c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_X86_ROBUST_SINGLESTEP: case KVM_CAP_XSAVE: case KVM_CAP_ASYNC_PF: + case KVM_CAP_MSIX_MMIO: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -3807,6 +3808,7 @@ static int emulator_write_emulated_onepage(unsigned long addr, struct kvm_vcpu *vcpu) { gpa_t gpa; + int r; gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); @@ -3822,14 +3824,16 @@ static int emulator_write_emulated_onepage(unsigned long addr, mmio: trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); + r = vcpu_mmio_write(vcpu, gpa, bytes, val); /* * Is this MMIO handled locally? */ - if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) + if (!r) return X86EMUL_CONTINUE; vcpu-mmio_needed = 1; - vcpu-run-exit_reason = KVM_EXIT_MMIO; + vcpu-run-exit_reason = (r == -ENOTSYNC) ? + KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO; This use of -ENOTSYNC is IMO confusing. How about we make vcpu_mmio_write return the positive exit reason? Negative value will mean an error. 
vcpu-run-mmio.phys_addr = vcpu-mmio_phys_addr = gpa; vcpu-run-mmio.len = vcpu-mmio_size = bytes; vcpu-run-mmio.is_write = vcpu-mmio_is_write = 1; diff --git a/include/linux/kvm.h b/include/linux/kvm.h index ea2dc1a..ad9df4b 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -161,6 +161,7 @@ struct kvm_pit_config { #define KVM_EXIT_NMI 16 #define KVM_EXIT_INTERNAL_ERROR 17 #define KVM_EXIT_OSI 18 +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19 /* For KVM_EXIT_INTERNAL_ERROR */ #define KVM_INTERNAL_ERROR_EMULATION 1 @@ -541,6 +542,7 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_PPC_GET_PVINFO 57 #define KVM_CAP_PPC_IRQ_LEVEL 58 #define KVM_CAP_ASYNC_PF 59 +#define KVM_CAP_MSIX_MMIO 60 #ifdef KVM_CAP_IRQ_ROUTING @@ -672,6 +674,9 @@ struct kvm_clock_data { #define KVM_XEN_HVM_CONFIG_IOW(KVMIO, 0x7a, struct kvm_xen_hvm_config) #define KVM_SET_CLOCK _IOW(KVMIO, 0x7b, struct kvm_clock_data) #define KVM_GET_CLOCK _IOR(KVMIO, 0x7c, struct kvm_clock_data) +/* Available with KVM_CAP_MSIX_MMIO */ +#define KVM_REGISTER_MSIX_MMIO_IOW(KVMIO, 0x7d, struct kvm_msix_mmio_user) +#define KVM_UNREGISTER_MSIX_MMIO _IOW(KVMIO, 0x7e, struct kvm_msix_mmio_user) /* Available with KVM_CAP_PIT_STATE2 */ #define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct kvm_pit_state2) #define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct
Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
On Wed, 2011-02-23 at 14:59 +0800, Sheng Yang wrote: On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote: On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote: +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, +void *val) +{ + struct kvm_msix_mmio_dev *mmio_dev = + container_of(this, struct kvm_msix_mmio_dev, table_dev); + struct kvm_msix_mmio *mmio; + int idx, ret = 0, entry, offset, r; + + mutex_lock(mmio_dev-lock); + idx = get_mmio_table_index(mmio_dev, addr, len); + if (idx 0) { + ret = -EOPNOTSUPP; + goto out; + } + if ((addr 0x3) || (len != 4 len != 8)) + goto out; + + offset = addr 0xf; + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL len == 8) + goto out; + + mmio = mmio_dev-mmio[idx]; + entry = (addr - mmio-table_base_addr) / PCI_MSIX_ENTRY_SIZE; + r = copy_from_user(val, (void __user *)(mmio-table_base_va + + entry * PCI_MSIX_ENTRY_SIZE + offset), len); + if (r) + goto out; +out: + mutex_unlock(mmio_dev-lock); + return ret; +} + +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr, + int len, const void *val) +{ + struct kvm_msix_mmio_dev *mmio_dev = + container_of(this, struct kvm_msix_mmio_dev, table_dev); + struct kvm_msix_mmio *mmio; + int idx, entry, offset, ret = 0, r = 0; + gpa_t entry_base; + u32 old_ctrl, new_ctrl; + u32 *ctrl_pos; + + mutex_lock(mmio_dev-kvm-lock); + mutex_lock(mmio_dev-lock); + idx = get_mmio_table_index(mmio_dev, addr, len); + if (idx 0) { + ret = -EOPNOTSUPP; + goto out; + } + if ((addr 0x3) || (len != 4 len != 8)) + goto out; + + offset = addr 0xF; nit, this 0xf above. So you like another mask? I'm just noting that above you used 0xf and here you used 0xF. 
+ if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL len == 8) + goto out; + + mmio = mmio_dev-mmio[idx]; + entry = (addr - mmio-table_base_addr) / PCI_MSIX_ENTRY_SIZE; + entry_base = mmio-table_base_va + entry * PCI_MSIX_ENTRY_SIZE; + ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL); + + if (get_user(old_ctrl, ctrl_pos)) + goto out; + + /* No allow writing to other fields when entry is unmasked */ + if (!(old_ctrl PCI_MSIX_ENTRY_CTRL_MASKBIT) + offset != PCI_MSIX_ENTRY_VECTOR_CTRL) + goto out; + + if (copy_to_user((void __user *)(entry_base + offset), val, len)) + goto out; + + if (get_user(new_ctrl, ctrl_pos)) + goto out; + + if ((offset PCI_MSIX_ENTRY_VECTOR_CTRL len == 4) || + (offset PCI_MSIX_ENTRY_DATA len == 8)) + ret = -ENOTSYNC; Couldn't we avoid the above get_user(new_ctrl,...) if we tested this first? I'd also prefer to see a direct goto out here since it's not entirely obvious that this first 'if' always falls into the below 'if'. I'm not a fan of using an error code as a special non-error return either. In fact when offset==PCI_MSIX_ENTRY_DATA and len ==8, we need to check new_ctrl to see if they indeed modified ctrl bits. I would try to make the logic here more clear. Isn't that what you're testing for though? (offset PCI_MSIX_ENTRY_VECTOR_CTRL len == 4) If this is true, we can only be writing address, upper address, or data, not control. (offset PCI_MSIX_ENTRY_DATA len == 8) If this is true, we're writing address and upper address, not the control vector. Additionally, I think the size an alignment test should be more restrictive. We really only want to allow naturally aligned 4 or 8 byte accesses. 
Maybe instead of: if ((addr & 0x3) || (len != 4 && len != 8)) we should do this: if (!(len == 4 || len == 8) || addr & (len - 1)) Then the test could become: if (offset + len <= PCI_MSIX_ENTRY_VECTOR_CTRL) Thanks, Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
On Wed, Feb 23, 2011 at 09:34:19AM -0700, Alex Williamson wrote: On Wed, 2011-02-23 at 14:59 +0800, Sheng Yang wrote: On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote: On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote: +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, + void *val) +{ + struct kvm_msix_mmio_dev *mmio_dev = + container_of(this, struct kvm_msix_mmio_dev, table_dev); + struct kvm_msix_mmio *mmio; + int idx, ret = 0, entry, offset, r; + + mutex_lock(mmio_dev-lock); + idx = get_mmio_table_index(mmio_dev, addr, len); + if (idx 0) { + ret = -EOPNOTSUPP; + goto out; + } + if ((addr 0x3) || (len != 4 len != 8)) + goto out; + + offset = addr 0xf; + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL len == 8) + goto out; + + mmio = mmio_dev-mmio[idx]; + entry = (addr - mmio-table_base_addr) / PCI_MSIX_ENTRY_SIZE; + r = copy_from_user(val, (void __user *)(mmio-table_base_va + + entry * PCI_MSIX_ENTRY_SIZE + offset), len); + if (r) + goto out; +out: + mutex_unlock(mmio_dev-lock); + return ret; +} + +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr, + int len, const void *val) +{ + struct kvm_msix_mmio_dev *mmio_dev = + container_of(this, struct kvm_msix_mmio_dev, table_dev); + struct kvm_msix_mmio *mmio; + int idx, entry, offset, ret = 0, r = 0; + gpa_t entry_base; + u32 old_ctrl, new_ctrl; + u32 *ctrl_pos; + + mutex_lock(mmio_dev-kvm-lock); + mutex_lock(mmio_dev-lock); + idx = get_mmio_table_index(mmio_dev, addr, len); + if (idx 0) { + ret = -EOPNOTSUPP; + goto out; + } + if ((addr 0x3) || (len != 4 len != 8)) + goto out; + + offset = addr 0xF; nit, this 0xf above. So you like another mask? I'm just noting that above you used 0xf and here you used 0xF. 
+ if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL len == 8) + goto out; + + mmio = mmio_dev-mmio[idx]; + entry = (addr - mmio-table_base_addr) / PCI_MSIX_ENTRY_SIZE; + entry_base = mmio-table_base_va + entry * PCI_MSIX_ENTRY_SIZE; + ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL); + + if (get_user(old_ctrl, ctrl_pos)) + goto out; + + /* No allow writing to other fields when entry is unmasked */ + if (!(old_ctrl PCI_MSIX_ENTRY_CTRL_MASKBIT) + offset != PCI_MSIX_ENTRY_VECTOR_CTRL) + goto out; + + if (copy_to_user((void __user *)(entry_base + offset), val, len)) + goto out; + + if (get_user(new_ctrl, ctrl_pos)) + goto out; + + if ((offset PCI_MSIX_ENTRY_VECTOR_CTRL len == 4) || + (offset PCI_MSIX_ENTRY_DATA len == 8)) + ret = -ENOTSYNC; Couldn't we avoid the above get_user(new_ctrl,...) if we tested this first? I'd also prefer to see a direct goto out here since it's not entirely obvious that this first 'if' always falls into the below 'if'. I'm not a fan of using an error code as a special non-error return either. In fact when offset==PCI_MSIX_ENTRY_DATA and len ==8, we need to check new_ctrl to see if they indeed modified ctrl bits. I would try to make the logic here more clear. Isn't that what you're testing for though? (offset PCI_MSIX_ENTRY_VECTOR_CTRL len == 4) If this is true, we can only be writing address, upper address, or data, not control. (offset PCI_MSIX_ENTRY_DATA len == 8) If this is true, we're writing address and upper address, not the control vector. Additionally, I think the size an alignment test should be more restrictive. We really only want to allow naturally aligned 4 or 8 byte accesses. 
Maybe instead of: if ((addr & 0x3) || (len != 4 && len != 8)) we should do this: if (!(len == 4 || len == 8) || addr & (len - 1)) Then the test could become: if (offset + len <= PCI_MSIX_ENTRY_VECTOR_CTRL) Thanks, Alex Or rather if (offset + len != PCI_MSIX_ENTRY_VECTOR_CTRL + 4) we are matching offset + length to control offset + control length, and avoid depending on the fact control is the last register. -- MST -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
On Wed, 2011-02-23 at 20:39 +0200, Michael S. Tsirkin wrote: On Wed, Feb 23, 2011 at 09:34:19AM -0700, Alex Williamson wrote: On Wed, 2011-02-23 at 14:59 +0800, Sheng Yang wrote: On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote: On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote: +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, +void *val) +{ + struct kvm_msix_mmio_dev *mmio_dev = + container_of(this, struct kvm_msix_mmio_dev, table_dev); + struct kvm_msix_mmio *mmio; + int idx, ret = 0, entry, offset, r; + + mutex_lock(mmio_dev-lock); + idx = get_mmio_table_index(mmio_dev, addr, len); + if (idx 0) { + ret = -EOPNOTSUPP; + goto out; + } + if ((addr 0x3) || (len != 4 len != 8)) + goto out; + + offset = addr 0xf; + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL len == 8) + goto out; + + mmio = mmio_dev-mmio[idx]; + entry = (addr - mmio-table_base_addr) / PCI_MSIX_ENTRY_SIZE; + r = copy_from_user(val, (void __user *)(mmio-table_base_va + + entry * PCI_MSIX_ENTRY_SIZE + offset), len); + if (r) + goto out; +out: + mutex_unlock(mmio_dev-lock); + return ret; +} + +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr, + int len, const void *val) +{ + struct kvm_msix_mmio_dev *mmio_dev = + container_of(this, struct kvm_msix_mmio_dev, table_dev); + struct kvm_msix_mmio *mmio; + int idx, entry, offset, ret = 0, r = 0; + gpa_t entry_base; + u32 old_ctrl, new_ctrl; + u32 *ctrl_pos; + + mutex_lock(mmio_dev-kvm-lock); + mutex_lock(mmio_dev-lock); + idx = get_mmio_table_index(mmio_dev, addr, len); + if (idx 0) { + ret = -EOPNOTSUPP; + goto out; + } + if ((addr 0x3) || (len != 4 len != 8)) + goto out; + + offset = addr 0xF; nit, this 0xf above. So you like another mask? I'm just noting that above you used 0xf and here you used 0xF. 
+ if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL len == 8) + goto out; + + mmio = mmio_dev-mmio[idx]; + entry = (addr - mmio-table_base_addr) / PCI_MSIX_ENTRY_SIZE; + entry_base = mmio-table_base_va + entry * PCI_MSIX_ENTRY_SIZE; + ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL); + + if (get_user(old_ctrl, ctrl_pos)) + goto out; + + /* No allow writing to other fields when entry is unmasked */ + if (!(old_ctrl PCI_MSIX_ENTRY_CTRL_MASKBIT) + offset != PCI_MSIX_ENTRY_VECTOR_CTRL) + goto out; + + if (copy_to_user((void __user *)(entry_base + offset), val, len)) + goto out; + + if (get_user(new_ctrl, ctrl_pos)) + goto out; + + if ((offset PCI_MSIX_ENTRY_VECTOR_CTRL len == 4) || + (offset PCI_MSIX_ENTRY_DATA len == 8)) + ret = -ENOTSYNC; Couldn't we avoid the above get_user(new_ctrl,...) if we tested this first? I'd also prefer to see a direct goto out here since it's not entirely obvious that this first 'if' always falls into the below 'if'. I'm not a fan of using an error code as a special non-error return either. In fact when offset==PCI_MSIX_ENTRY_DATA and len ==8, we need to check new_ctrl to see if they indeed modified ctrl bits. I would try to make the logic here more clear. Isn't that what you're testing for though? (offset PCI_MSIX_ENTRY_VECTOR_CTRL len == 4) If this is true, we can only be writing address, upper address, or data, not control. (offset PCI_MSIX_ENTRY_DATA len == 8) If this is true, we're writing address and upper address, not the control vector. Additionally, I think the size an alignment test should be more restrictive. We really only want to allow naturally aligned 4 or 8 byte accesses. 
Maybe instead of: if ((addr & 0x3) || (len != 4 && len != 8)) we should do this: if (!(len == 4 || len == 8) || addr & (len - 1)) Then the test could become: if (offset + len <= PCI_MSIX_ENTRY_VECTOR_CTRL) Thanks, Alex Or rather if (offset + len != PCI_MSIX_ENTRY_VECTOR_CTRL + 4) we are matching offset + length to control offset + control length, and avoid depending on the fact control is the last register. That's just silly. We're writing code to match a hardware specification. The registers always
Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
On Fri, Feb 18, 2011 at 04:15:38PM +0800, Sheng Yang wrote: Why allow this ioctl to succeed if there's an entry already present? This case is broken as mmio_dev->mmio_nr is increased below. Oh, it's a bug to let mmio_nr increase even with MMIO found. I've fixed it. The reason we allow multiple calls is that userspace can register different types of address here (table address and PBA address). Ah OK. PCI bits must be reviewed... Pardon? PCI related things are already in 2.6.38-rc. I meant someone more familiar with PCI should review this patch, Michael/Alex, CC'ed. -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote: Then we can support mask bit operation of assigned devices now. Looks pretty good overall. A few comments below. It seems like we should be able to hook this into vfio with a small stub in kvm. We just need to be able to communicate disabling and enabling of individual msix vectors. For brute force, we could do this with a lot of eventfds, triggered by kvm and consumed by vfio, two per MSI-X vector. Not sure if there's something smaller that could do it. Thanks, Alex Signed-off-by: Sheng Yang sh...@linux.intel.com --- arch/x86/kvm/Makefile|2 +- arch/x86/kvm/x86.c |8 +- include/linux/kvm.h | 21 include/linux/kvm_host.h | 25 virt/kvm/assigned-dev.c | 44 +++ virt/kvm/kvm_main.c | 38 ++- virt/kvm/msix_mmio.c | 286 ++ virt/kvm/msix_mmio.h | 25 8 files changed, 442 insertions(+), 7 deletions(-) create mode 100644 virt/kvm/msix_mmio.c create mode 100644 virt/kvm/msix_mmio.h diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index f15501f..3a0d851 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I. 
kvm-y+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ coalesced_mmio.o irq_comm.o eventfd.o \ - assigned-dev.o) + assigned-dev.o msix_mmio.o) kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fa708c9..89bf12c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_X86_ROBUST_SINGLESTEP: case KVM_CAP_XSAVE: case KVM_CAP_ASYNC_PF: + case KVM_CAP_MSIX_MMIO: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -3807,6 +3808,7 @@ static int emulator_write_emulated_onepage(unsigned long addr, struct kvm_vcpu *vcpu) { gpa_t gpa; + int r; gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); @@ -3822,14 +3824,16 @@ static int emulator_write_emulated_onepage(unsigned long addr, mmio: trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); + r = vcpu_mmio_write(vcpu, gpa, bytes, val); /* * Is this MMIO handled locally? */ - if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) + if (!r) return X86EMUL_CONTINUE; vcpu-mmio_needed = 1; - vcpu-run-exit_reason = KVM_EXIT_MMIO; + vcpu-run-exit_reason = (r == -ENOTSYNC) ? 
+ KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO; vcpu-run-mmio.phys_addr = vcpu-mmio_phys_addr = gpa; vcpu-run-mmio.len = vcpu-mmio_size = bytes; vcpu-run-mmio.is_write = vcpu-mmio_is_write = 1; diff --git a/include/linux/kvm.h b/include/linux/kvm.h index ea2dc1a..ad9df4b 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -161,6 +161,7 @@ struct kvm_pit_config { #define KVM_EXIT_NMI 16 #define KVM_EXIT_INTERNAL_ERROR 17 #define KVM_EXIT_OSI 18 +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19 /* For KVM_EXIT_INTERNAL_ERROR */ #define KVM_INTERNAL_ERROR_EMULATION 1 @@ -541,6 +542,7 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_PPC_GET_PVINFO 57 #define KVM_CAP_PPC_IRQ_LEVEL 58 #define KVM_CAP_ASYNC_PF 59 +#define KVM_CAP_MSIX_MMIO 60 #ifdef KVM_CAP_IRQ_ROUTING @@ -672,6 +674,9 @@ struct kvm_clock_data { #define KVM_XEN_HVM_CONFIG_IOW(KVMIO, 0x7a, struct kvm_xen_hvm_config) #define KVM_SET_CLOCK _IOW(KVMIO, 0x7b, struct kvm_clock_data) #define KVM_GET_CLOCK _IOR(KVMIO, 0x7c, struct kvm_clock_data) +/* Available with KVM_CAP_MSIX_MMIO */ +#define KVM_REGISTER_MSIX_MMIO_IOW(KVMIO, 0x7d, struct kvm_msix_mmio_user) +#define KVM_UNREGISTER_MSIX_MMIO _IOW(KVMIO, 0x7e, struct kvm_msix_mmio_user) /* Available with KVM_CAP_PIT_STATE2 */ #define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct kvm_pit_state2) #define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2) @@ -795,4 +800,20 @@ struct kvm_assigned_msix_entry { __u16 padding[3]; }; +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV (1 0) + +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE(1 8) + +#define KVM_MSIX_MMIO_TYPE_DEV_MASK 0x00ff +#define KVM_MSIX_MMIO_TYPE_BASE_MASK 0xff00 +struct kvm_msix_mmio_user { + __u32 dev_id; + __u16 type; + __u16 max_entries_nr; + __u64 base_addr; + __u64 base_va; + __u64 flags; + __u64 reserved[4]; +}; + #endif /* __LINUX_KVM_H */
Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote: On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote: Then we can support mask bit operation of assigned devices now. Looks pretty good overall. A few comments below. It seems like we should be able to hook this into vfio with a small stub in kvm. We just need to be able to communicate disabling and enabling of individual msix vectors. For brute force, we could do this with a lot of eventfds, triggered by kvm and consumed by vfio, two per MSI-X vector. Not sure if there's something smaller that could do it. Thanks, Alex, thanks for your comments. See my comments below: Alex Signed-off-by: Sheng Yang sh...@linux.intel.com --- arch/x86/kvm/Makefile|2 +- arch/x86/kvm/x86.c |8 +- include/linux/kvm.h | 21 include/linux/kvm_host.h | 25 virt/kvm/assigned-dev.c | 44 +++ virt/kvm/kvm_main.c | 38 ++- virt/kvm/msix_mmio.c | 286 ++ virt/kvm/msix_mmio.h | 25 8 files changed, 442 insertions(+), 7 deletions(-) create mode 100644 virt/kvm/msix_mmio.c create mode 100644 virt/kvm/msix_mmio.h diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index f15501f..3a0d851 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I. 
kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ coalesced_mmio.o irq_comm.o eventfd.o \ - assigned-dev.o) + assigned-dev.o msix_mmio.o) kvm-$(CONFIG_IOMMU_API)+= $(addprefix ../../../virt/kvm/, iommu.o) kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fa708c9..89bf12c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_X86_ROBUST_SINGLESTEP: case KVM_CAP_XSAVE: case KVM_CAP_ASYNC_PF: + case KVM_CAP_MSIX_MMIO: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -3807,6 +3808,7 @@ static int emulator_write_emulated_onepage(unsigned long addr, struct kvm_vcpu *vcpu) { gpa_t gpa; + int r; gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); @@ -3822,14 +3824,16 @@ static int emulator_write_emulated_onepage(unsigned long addr, mmio: trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); + r = vcpu_mmio_write(vcpu, gpa, bytes, val); /* * Is this MMIO handled locally? */ - if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) + if (!r) return X86EMUL_CONTINUE; vcpu-mmio_needed = 1; - vcpu-run-exit_reason = KVM_EXIT_MMIO; + vcpu-run-exit_reason = (r == -ENOTSYNC) ? 
+ KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO; vcpu-run-mmio.phys_addr = vcpu-mmio_phys_addr = gpa; vcpu-run-mmio.len = vcpu-mmio_size = bytes; vcpu-run-mmio.is_write = vcpu-mmio_is_write = 1; diff --git a/include/linux/kvm.h b/include/linux/kvm.h index ea2dc1a..ad9df4b 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -161,6 +161,7 @@ struct kvm_pit_config { #define KVM_EXIT_NMI 16 #define KVM_EXIT_INTERNAL_ERROR 17 #define KVM_EXIT_OSI 18 +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19 /* For KVM_EXIT_INTERNAL_ERROR */ #define KVM_INTERNAL_ERROR_EMULATION 1 @@ -541,6 +542,7 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_PPC_GET_PVINFO 57 #define KVM_CAP_PPC_IRQ_LEVEL 58 #define KVM_CAP_ASYNC_PF 59 +#define KVM_CAP_MSIX_MMIO 60 #ifdef KVM_CAP_IRQ_ROUTING @@ -672,6 +674,9 @@ struct kvm_clock_data { #define KVM_XEN_HVM_CONFIG_IOW(KVMIO, 0x7a, struct kvm_xen_hvm_config) #define KVM_SET_CLOCK _IOW(KVMIO, 0x7b, struct kvm_clock_data) #define KVM_GET_CLOCK _IOR(KVMIO, 0x7c, struct kvm_clock_data) +/* Available with KVM_CAP_MSIX_MMIO */ +#define KVM_REGISTER_MSIX_MMIO_IOW(KVMIO, 0x7d, struct kvm_msix_mmio_user) +#define KVM_UNREGISTER_MSIX_MMIO _IOW(KVMIO, 0x7e, struct kvm_msix_mmio_user) /* Available with KVM_CAP_PIT_STATE2 */ #define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct kvm_pit_state2) #define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2) @@ -795,4 +800,20 @@ struct kvm_assigned_msix_entry { __u16 padding[3]; }; +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV(1 0) + +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE (1 8) + +#define KVM_MSIX_MMIO_TYPE_DEV_MASK0x00ff
Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
On Thursday 03 February 2011 09:05:55 Marcelo Tosatti wrote: On Sun, Jan 30, 2011 at 01:11:15PM +0800, Sheng Yang wrote: Then we can support mask bit operation of assigned devices now. Signed-off-by: Sheng Yang sh...@linux.intel.com +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm, + struct kvm_msix_mmio_user *mmio_user) +{ + struct kvm_msix_mmio_dev *mmio_dev = kvm-msix_mmio_dev; + struct kvm_msix_mmio *mmio = NULL; + int r = 0, i; + + mutex_lock(mmio_dev-lock); + for (i = 0; i mmio_dev-mmio_nr; i++) { + if (mmio_dev-mmio[i].dev_id == mmio_user-dev_id + (mmio_dev-mmio[i].type KVM_MSIX_MMIO_TYPE_DEV_MASK) == + (mmio_user-type KVM_MSIX_MMIO_TYPE_DEV_MASK)) { + mmio = mmio_dev-mmio[i]; + if (mmio-max_entries_nr != mmio_user-max_entries_nr) { + r = -EINVAL; + goto out; + } + break; + } Why allow this ioctl to succeed if there's an entry already present? This case is broken as mmio_dev-mmio_nr is increased below. Oh, It's a bug to let mmio_nr increase even with MMIO found. I've fixed it. The reason we allow multiply callings is userspace can register different types of address here(table address and PBA address). PCI bits must be reviewed... Pardon? PCI related things are already in 2.6.38-rc. -- regards Yang, Sheng -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
On Sun, Jan 30, 2011 at 01:11:15PM +0800, Sheng Yang wrote: Then we can support mask bit operation of assigned devices now. Signed-off-by: Sheng Yang sh...@linux.intel.com +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm, + struct kvm_msix_mmio_user *mmio_user) +{ + struct kvm_msix_mmio_dev *mmio_dev = kvm-msix_mmio_dev; + struct kvm_msix_mmio *mmio = NULL; + int r = 0, i; + + mutex_lock(mmio_dev-lock); + for (i = 0; i mmio_dev-mmio_nr; i++) { + if (mmio_dev-mmio[i].dev_id == mmio_user-dev_id + (mmio_dev-mmio[i].type KVM_MSIX_MMIO_TYPE_DEV_MASK) == + (mmio_user-type KVM_MSIX_MMIO_TYPE_DEV_MASK)) { + mmio = mmio_dev-mmio[i]; + if (mmio-max_entries_nr != mmio_user-max_entries_nr) { + r = -EINVAL; + goto out; + } + break; + } Why allow this ioctl to succeed if there's an entry already present? This case is broken as mmio_dev-mmio_nr is increased below. PCI bits must be reviewed... -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
On 01/30/2011 06:38 AM, Sheng Yang wrote: (Sorry, missed this mail...) On Mon, Jan 17, 2011 at 02:29:44PM +0200, Avi Kivity wrote: On 01/06/2011 12:19 PM, Sheng Yang wrote: Then we can support mask bit operation of assigned devices now. +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm, + int assigned_dev_id, int entry, bool mask) +{ + int r = -EFAULT; + struct kvm_assigned_dev_kernel *adev; + int i; + + if (!irqchip_in_kernel(kvm)) + return r; + + mutex_lock(kvm-lock); + adev = kvm_find_assigned_dev(kvm-arch.assigned_dev_head, + assigned_dev_id); + if (!adev) + goto out; + + for (i = 0; i adev-entries_nr; i++) + if (adev-host_msix_entries[i].entry == entry) { + if (mask) + disable_irq_nosync( + adev-host_msix_entries[i].vector); Is it okay to call disable_irq_nosync() here? IIRC we don't check the mask bit on irq delivery, so we may forward an interrupt to the guest after the mask bit was set. What does pci say about the mask bit? when does it take effect? Another question is whether disable_irq_nosync() actually programs the device mask bit, or not. If it does, then it's slow, and it may be better to leave interrupts enabled but have an internal pending bit. If it doesn't program the mask bit, it's fine. I think Michael and Jan had explained this. 
+ else + enable_irq(adev-host_msix_entries[i].vector); + r = 0; + break; + } +out: + mutex_unlock(kvm-lock); + return r; +} + +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, + void *val) +{ + struct kvm_msix_mmio_dev *mmio_dev = + container_of(this, struct kvm_msix_mmio_dev, table_dev); + struct kvm_msix_mmio *mmio; + int idx, ret = 0, entry, offset, r; + + mutex_lock(mmio_dev-lock); + idx = get_mmio_table_index(mmio_dev, addr, len); + if (idx 0) { + ret = -EOPNOTSUPP; + goto out; + } + if ((addr 0x3) || (len != 4 len != 8)) + goto out; + + offset = addr 0xf; + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL len == 8) + goto out; + + mmio =mmio_dev-mmio[idx]; + entry = (addr - mmio-table_base_addr) / PCI_MSIX_ENTRY_SIZE; + r = copy_from_user(val, (void __user *)(mmio-table_base_va + + entry * PCI_MSIX_ENTRY_SIZE + offset), len); + if (r) + goto out; and return ret == 0? Yes. This operation should be handled by in-kernel MSI-X MMIO. So we return 0 in order to omit this action. We can add warning to it later. But it failed. We need to return -EFAULT. The same as above. + + if ((offset PCI_MSIX_ENTRY_VECTOR_CTRL len == 4) || + (offset PCI_MSIX_ENTRY_DATA len == 8)) + ret = -ENOTSYNC; goto out? No. This judgement only check if MSI data/address was touched. And the line below would check if we need to operate mask bit. Because in theory guest can use len=8 to modify MSI-X data and ctrl at the same time. Ok, makes sense. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
On Mon, Jan 31, 2011 at 03:09:09PM +0200, Avi Kivity wrote: On 01/30/2011 06:38 AM, Sheng Yang wrote: (Sorry, missed this mail...) On Mon, Jan 17, 2011 at 02:29:44PM +0200, Avi Kivity wrote: On 01/06/2011 12:19 PM, Sheng Yang wrote: Then we can support mask bit operation of assigned devices now. +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm, +int assigned_dev_id, int entry, bool mask) +{ +int r = -EFAULT; +struct kvm_assigned_dev_kernel *adev; +int i; + +if (!irqchip_in_kernel(kvm)) +return r; + +mutex_lock(kvm-lock); +adev = kvm_find_assigned_dev(kvm-arch.assigned_dev_head, + assigned_dev_id); +if (!adev) +goto out; + +for (i = 0; i adev-entries_nr; i++) +if (adev-host_msix_entries[i].entry == entry) { +if (mask) +disable_irq_nosync( + adev-host_msix_entries[i].vector); Is it okay to call disable_irq_nosync() here? IIRC we don't check the mask bit on irq delivery, so we may forward an interrupt to the guest after the mask bit was set. What does pci say about the mask bit? when does it take effect? Another question is whether disable_irq_nosync() actually programs the device mask bit, or not. If it does, then it's slow, and it may be better to leave interrupts enabled but have an internal pending bit. If it doesn't program the mask bit, it's fine. I think Michael and Jan had explained this. 
+else + enable_irq(adev-host_msix_entries[i].vector); +r = 0; +break; +} +out: +mutex_unlock(kvm-lock); +return r; +} + +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, +void *val) +{ +struct kvm_msix_mmio_dev *mmio_dev = +container_of(this, struct kvm_msix_mmio_dev, table_dev); +struct kvm_msix_mmio *mmio; +int idx, ret = 0, entry, offset, r; + +mutex_lock(mmio_dev-lock); +idx = get_mmio_table_index(mmio_dev, addr, len); +if (idx 0) { +ret = -EOPNOTSUPP; +goto out; +} +if ((addr 0x3) || (len != 4 len != 8)) +goto out; + +offset = addr 0xf; +if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL len == 8) +goto out; + +mmio =mmio_dev-mmio[idx]; +entry = (addr - mmio-table_base_addr) / PCI_MSIX_ENTRY_SIZE; +r = copy_from_user(val, (void __user *)(mmio-table_base_va + +entry * PCI_MSIX_ENTRY_SIZE + offset), len); +if (r) +goto out; and return ret == 0? Yes. This operation should be handled by in-kernel MSI-X MMIO. So we return 0 in order to omit this action. We can add warning to it later. But it failed. We need to return -EFAULT. So it would return to QEmu. OK, let QEmu prints warning about it. -- regards Yang, Sheng The same as above. + +if ((offset PCI_MSIX_ENTRY_VECTOR_CTRL len == 4) || +(offset PCI_MSIX_ENTRY_DATA len == 8)) +ret = -ENOTSYNC; goto out? No. This judgement only check if MSI data/address was touched. And the line below would check if we need to operate mask bit. Because in theory guest can use len=8 to modify MSI-X data and ctrl at the same time. Ok, makes sense. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
(Sorry, missed this mail...) On Mon, Jan 17, 2011 at 02:29:44PM +0200, Avi Kivity wrote: On 01/06/2011 12:19 PM, Sheng Yang wrote: Then we can support mask bit operation of assigned devices now. +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm, +int assigned_dev_id, int entry, bool mask) +{ +int r = -EFAULT; +struct kvm_assigned_dev_kernel *adev; +int i; + +if (!irqchip_in_kernel(kvm)) +return r; + +mutex_lock(kvm-lock); +adev = kvm_find_assigned_dev(kvm-arch.assigned_dev_head, + assigned_dev_id); +if (!adev) +goto out; + +for (i = 0; i adev-entries_nr; i++) +if (adev-host_msix_entries[i].entry == entry) { +if (mask) +disable_irq_nosync( +adev-host_msix_entries[i].vector); Is it okay to call disable_irq_nosync() here? IIRC we don't check the mask bit on irq delivery, so we may forward an interrupt to the guest after the mask bit was set. What does pci say about the mask bit? when does it take effect? Another question is whether disable_irq_nosync() actually programs the device mask bit, or not. If it does, then it's slow, and it may be better to leave interrupts enabled but have an internal pending bit. If it doesn't program the mask bit, it's fine. I think Michael and Jan had explained this. 
+else +enable_irq(adev-host_msix_entries[i].vector); +r = 0; +break; +} +out: +mutex_unlock(kvm-lock); +return r; +} + +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, +void *val) +{ +struct kvm_msix_mmio_dev *mmio_dev = +container_of(this, struct kvm_msix_mmio_dev, table_dev); +struct kvm_msix_mmio *mmio; +int idx, ret = 0, entry, offset, r; + +mutex_lock(mmio_dev-lock); +idx = get_mmio_table_index(mmio_dev, addr, len); +if (idx 0) { +ret = -EOPNOTSUPP; +goto out; +} +if ((addr 0x3) || (len != 4 len != 8)) +goto out; + +offset = addr 0xf; +if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL len == 8) +goto out; + +mmio =mmio_dev-mmio[idx]; +entry = (addr - mmio-table_base_addr) / PCI_MSIX_ENTRY_SIZE; +r = copy_from_user(val, (void __user *)(mmio-table_base_va + +entry * PCI_MSIX_ENTRY_SIZE + offset), len); +if (r) +goto out; and return ret == 0? Yes. This operation should be handled by in-kernel MSI-X MMIO. So we return 0 in order to omit this action. We can add warning to it later. 
+out: +mutex_unlock(mmio_dev-lock); +return ret; +} + +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr, +int len, const void *val) +{ +struct kvm_msix_mmio_dev *mmio_dev = +container_of(this, struct kvm_msix_mmio_dev, table_dev); +struct kvm_msix_mmio *mmio; +int idx, entry, offset, ret = 0, r = 0; +gpa_t entry_base; +u32 old_ctrl, new_ctrl; +u32 *ctrl_pos; + +mutex_lock(mmio_dev-lock); +idx = get_mmio_table_index(mmio_dev, addr, len); +if (idx 0) { +ret = -EOPNOTSUPP; +goto out; +} +if ((addr 0x3) || (len != 4 len != 8)) +goto out; + +offset = addr 0xF; +if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL len == 8) +goto out; + +mmio =mmio_dev-mmio[idx]; +entry = (addr - mmio-table_base_addr) / PCI_MSIX_ENTRY_SIZE; +entry_base = mmio-table_base_va + entry * PCI_MSIX_ENTRY_SIZE; +ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL); + +if (get_user(old_ctrl, ctrl_pos)) +goto out; + +/* No allow writing to other fields when entry is unmasked */ +if (!(old_ctrl PCI_MSIX_ENTRY_CTRL_MASKBIT) +offset != PCI_MSIX_ENTRY_VECTOR_CTRL) +goto out; + +if (copy_to_user((void __user *)(entry_base + offset), val, len)) +goto out; + +if (get_user(new_ctrl, ctrl_pos)) +goto out; here, too. The same as above. + +if ((offset PCI_MSIX_ENTRY_VECTOR_CTRL len == 4) || +(offset PCI_MSIX_ENTRY_DATA len == 8)) +ret = -ENOTSYNC; goto out? No. This judgement only check if MSI data/address was touched. And the line below would check if we need to operate mask bit. Because in theory guest can use len=8 to modify MSI-X data and ctrl at the same time. -- regards Yang, Sheng +if (old_ctrl == new_ctrl) +goto out; +if (!(old_ctrl PCI_MSIX_ENTRY_CTRL_MASKBIT) +(new_ctrl PCI_MSIX_ENTRY_CTRL_MASKBIT)) +r = update_msix_mask_bit(mmio_dev-kvm,
[PATCH 2/3] KVM: Emulate MSI-X table in kernel
Then we can support mask bit operation of assigned devices now. Signed-off-by: Sheng Yang sh...@linux.intel.com --- arch/x86/kvm/Makefile|2 +- arch/x86/kvm/x86.c |8 +- include/linux/kvm.h | 21 include/linux/kvm_host.h | 25 virt/kvm/assigned-dev.c | 44 +++ virt/kvm/kvm_main.c | 38 ++- virt/kvm/msix_mmio.c | 286 ++ virt/kvm/msix_mmio.h | 25 8 files changed, 442 insertions(+), 7 deletions(-) create mode 100644 virt/kvm/msix_mmio.c create mode 100644 virt/kvm/msix_mmio.h diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index f15501f..3a0d851 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I. kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ coalesced_mmio.o irq_comm.o eventfd.o \ - assigned-dev.o) + assigned-dev.o msix_mmio.o) kvm-$(CONFIG_IOMMU_API)+= $(addprefix ../../../virt/kvm/, iommu.o) kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fa708c9..89bf12c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_X86_ROBUST_SINGLESTEP: case KVM_CAP_XSAVE: case KVM_CAP_ASYNC_PF: + case KVM_CAP_MSIX_MMIO: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -3807,6 +3808,7 @@ static int emulator_write_emulated_onepage(unsigned long addr, struct kvm_vcpu *vcpu) { gpa_t gpa; + int r; gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); @@ -3822,14 +3824,16 @@ static int emulator_write_emulated_onepage(unsigned long addr, mmio: trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); + r = vcpu_mmio_write(vcpu, gpa, bytes, val); /* * Is this MMIO handled locally? */ - if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) + if (!r) return X86EMUL_CONTINUE; vcpu-mmio_needed = 1; - vcpu-run-exit_reason = KVM_EXIT_MMIO; + vcpu-run-exit_reason = (r == -ENOTSYNC) ? 
+ KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO; vcpu-run-mmio.phys_addr = vcpu-mmio_phys_addr = gpa; vcpu-run-mmio.len = vcpu-mmio_size = bytes; vcpu-run-mmio.is_write = vcpu-mmio_is_write = 1; diff --git a/include/linux/kvm.h b/include/linux/kvm.h index ea2dc1a..ad9df4b 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -161,6 +161,7 @@ struct kvm_pit_config { #define KVM_EXIT_NMI 16 #define KVM_EXIT_INTERNAL_ERROR 17 #define KVM_EXIT_OSI 18 +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19 /* For KVM_EXIT_INTERNAL_ERROR */ #define KVM_INTERNAL_ERROR_EMULATION 1 @@ -541,6 +542,7 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_PPC_GET_PVINFO 57 #define KVM_CAP_PPC_IRQ_LEVEL 58 #define KVM_CAP_ASYNC_PF 59 +#define KVM_CAP_MSIX_MMIO 60 #ifdef KVM_CAP_IRQ_ROUTING @@ -672,6 +674,9 @@ struct kvm_clock_data { #define KVM_XEN_HVM_CONFIG_IOW(KVMIO, 0x7a, struct kvm_xen_hvm_config) #define KVM_SET_CLOCK _IOW(KVMIO, 0x7b, struct kvm_clock_data) #define KVM_GET_CLOCK _IOR(KVMIO, 0x7c, struct kvm_clock_data) +/* Available with KVM_CAP_MSIX_MMIO */ +#define KVM_REGISTER_MSIX_MMIO_IOW(KVMIO, 0x7d, struct kvm_msix_mmio_user) +#define KVM_UNREGISTER_MSIX_MMIO _IOW(KVMIO, 0x7e, struct kvm_msix_mmio_user) /* Available with KVM_CAP_PIT_STATE2 */ #define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct kvm_pit_state2) #define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2) @@ -795,4 +800,20 @@ struct kvm_assigned_msix_entry { __u16 padding[3]; }; +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV(1 0) + +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE (1 8) + +#define KVM_MSIX_MMIO_TYPE_DEV_MASK0x00ff +#define KVM_MSIX_MMIO_TYPE_BASE_MASK 0xff00 +struct kvm_msix_mmio_user { + __u32 dev_id; + __u16 type; + __u16 max_entries_nr; + __u64 base_addr; + __u64 base_va; + __u64 flags; + __u64 reserved[4]; +}; + #endif /* __LINUX_KVM_H */ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7d313e0..c10670c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -233,6 +233,27 @@ struct 
kvm_memslots { KVM_PRIVATE_MEM_SLOTS]; }; +#define KVM_MSIX_MMIO_MAX32 + +struct kvm_msix_mmio { + u32 dev_id; + u16 type; + u16 max_entries_nr; + u64 flags; + gpa_t table_base_addr; + hva_t table_base_va; + gpa_t pba_base_addr; + hva_t
Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
On Monday 17 January 2011 20:39:30 Marcelo Tosatti wrote: On Mon, Jan 17, 2011 at 08:18:22PM +0800, Sheng Yang wrote: + goto out; + + mmio = mmio_dev-mmio[idx]; + entry = (addr - mmio-table_base_addr) / PCI_MSIX_ENTRY_SIZE; + entry_base = mmio-table_base_va + entry * PCI_MSIX_ENTRY_SIZE; + ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL); + + if (get_user(old_ctrl, ctrl_pos)) + goto out; + + /* No allow writing to other fields when entry is unmasked */ + if (!(old_ctrl PCI_MSIX_ENTRY_CTRL_MASKBIT) + offset != PCI_MSIX_ENTRY_VECTOR_CTRL) + goto out; + + if (copy_to_user((void __user *)(entry_base + offset), val, len)) + goto out; Instead of copying to/from userspace (which is subject to swapin, unexpected values), you could include the guest written value in a kvm_run structure, along with address. Qemu-kvm would use that to synchronize its copy of the table, on KVM_EXIT_MSIX_ROUTING_UPDATE exit. We want to acelerate MSI-X mask bit accessing, which won't exit to userspace in the most condition. That's the cost we want to optimize. Also it's possible to userspace to read the correct value of MMIO(but mostly userspace can't write to it in order to prevent synchronize issue). Yes, i meant exit to userspace only when necessary, but instead of copying directly everytime record the value the guest has written in kvm_run and exit with KVM_EXIT_MSIX_ROUTING_UPDATE. + if (get_user(new_ctrl, ctrl_pos)) + goto out; + + if ((offset PCI_MSIX_ENTRY_VECTOR_CTRL len == 4) || + (offset PCI_MSIX_ENTRY_DATA len == 8)) + ret = -ENOTSYNC; + if (old_ctrl == new_ctrl) + goto out; + if (!(old_ctrl PCI_MSIX_ENTRY_CTRL_MASKBIT) + (new_ctrl PCI_MSIX_ENTRY_CTRL_MASKBIT)) + r = update_msix_mask_bit(mmio_dev-kvm, mmio, entry, 1); + else if ((old_ctrl PCI_MSIX_ENTRY_CTRL_MASKBIT) + !(new_ctrl PCI_MSIX_ENTRY_CTRL_MASKBIT)) + r = update_msix_mask_bit(mmio_dev-kvm, mmio, entry, 0); Then you rely on the kernel copy of the values to enable/disable irq. 
Yes, they are guest owned assigned IRQs. Any potential issue? Nothing prevents the kernel from enabling or disabling irq multiple times with this code (what prevents it is a qemu-kvm that behaves as expected). This is not very good. Perhaps the guest can only harm itself with that, but i'm not sure. MSI-X interrupts are not shared, so I think guest can only harm itself if it was doing it wrong. And also if an msix table page is swapped out guest will hang. + return r; +} This is not consistent with registration, where there are no checks regarding validity of assigned device id. So why is it necessary? I am not quite understand. We need to free mmio anyway, otherwise it would result in wrong mmio interception... If you return -EOPNOTSUPP in case kvm_find_assigned_dev fails in the read/write paths, there is no need to free anything. It may work with assigned devices, but the potential user of this is including vfio drivers and emulate devices. And I don't think it's a good idea to have registeration process but no free process... -- regards Yang, Sheng There is a lock ordering problem BTW: - read/write handlers: dev-lock - kvm-lock - vm_ioctl_deassign_device - kvm_free_msix_mmio: kvm-lock - dev-lock Good catch! Would fix it(and other comments of course). -- regards Yang, Sheng + +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm, + struct kvm_msix_mmio_user *mmio_user) +{ + struct kvm_msix_mmio mmio; + + mmio.dev_id = mmio_user-dev_id; + mmio.type = mmio_user-type; + + return kvm_free_msix_mmio(kvm, mmio); +} -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
On Thu, Jan 06, 2011 at 06:19:44PM +0800, Sheng Yang wrote: Then we can support mask bit operation of assigned devices now. Signed-off-by: Sheng Yang sh...@linux.intel.com --- arch/x86/kvm/Makefile|2 +- arch/x86/kvm/x86.c |8 +- include/linux/kvm.h | 21 include/linux/kvm_host.h | 25 virt/kvm/assigned-dev.c | 44 +++ virt/kvm/kvm_main.c | 38 ++- virt/kvm/msix_mmio.c | 284 ++ virt/kvm/msix_mmio.h | 25 8 files changed, 440 insertions(+), 7 deletions(-) create mode 100644 virt/kvm/msix_mmio.c create mode 100644 virt/kvm/msix_mmio.h diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c index ae72ae6..7444dcd 100644 --- a/virt/kvm/assigned-dev.c +++ b/virt/kvm/assigned-dev.c @@ -18,6 +18,7 @@ #include linux/interrupt.h #include linux/slab.h #include irq.h +#include msix_mmio.h static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, int assigned_dev_id) @@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm *kvm, kvm_deassign_irq(kvm, assigned_dev, assigned_dev-irq_requested_type); } +static void assigned_device_free_msix_mmio(struct kvm *kvm, + struct kvm_assigned_dev_kernel *adev) +{ + struct kvm_msix_mmio mmio; + + mmio.dev_id = adev-assigned_dev_id; + mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV | + KVM_MSIX_MMIO_TYPE_BASE_TABLE; + kvm_free_msix_mmio(kvm, mmio); +} + static void kvm_free_assigned_device(struct kvm *kvm, struct kvm_assigned_dev_kernel *assigned_dev) { kvm_free_assigned_irq(kvm, assigned_dev); + assigned_device_free_msix_mmio(kvm, assigned_dev); + __pci_reset_function(assigned_dev-dev); pci_restore_state(assigned_dev-dev); @@ -785,3 +799,33 @@ out: return r; } +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm, + int assigned_dev_id, int entry, bool mask) +{ + int r = -EFAULT; + struct kvm_assigned_dev_kernel *adev; + int i; + + if (!irqchip_in_kernel(kvm)) + return r; + + mutex_lock(kvm-lock); + adev = kvm_find_assigned_dev(kvm-arch.assigned_dev_head, + assigned_dev_id); + if (!adev) + goto out; 
+ + for (i = 0; i adev-entries_nr; i++) + if (adev-host_msix_entries[i].entry == entry) { + if (mask) + disable_irq_nosync( + adev-host_msix_entries[i].vector); + else + enable_irq(adev-host_msix_entries[i].vector); + r = 0; + break; + } Should check if the host irq is registered as MSIX type. +out: + mutex_unlock(kvm-lock); + return r; +} diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b1b6cbb..b7807c8 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -56,6 +56,7 @@ #include coalesced_mmio.h #include async_pf.h +#include msix_mmio.h #define CREATE_TRACE_POINTS #include trace/events/kvm.h @@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm) struct mm_struct *mm = kvm-mm; kvm_arch_sync_events(kvm); + kvm_unregister_msix_mmio_dev(kvm); spin_lock(kvm_lock); list_del(kvm-vm_list); spin_unlock(kvm_lock); @@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp, mutex_unlock(kvm-lock); break; #endif + case KVM_REGISTER_MSIX_MMIO: { + struct kvm_msix_mmio_user mmio_user; + + r = -EFAULT; + if (copy_from_user(mmio_user, argp, sizeof mmio_user)) + goto out; + r = kvm_vm_ioctl_register_msix_mmio(kvm, mmio_user); + break; + } + case KVM_UNREGISTER_MSIX_MMIO: { + struct kvm_msix_mmio_user mmio_user; + + r = -EFAULT; + if (copy_from_user(mmio_user, argp, sizeof mmio_user)) + goto out; + r = kvm_vm_ioctl_unregister_msix_mmio(kvm, mmio_user); + break; + } default: r = kvm_arch_vm_ioctl(filp, ioctl, arg); if (r == -ENOTTY) @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void) return r; } #endif + r = kvm_register_msix_mmio_dev(kvm); + if (r 0) { + kvm_put_kvm(kvm); + return r; + } + r = anon_inode_getfd(kvm-vm, kvm_vm_fops, kvm, O_RDWR); if (r 0) kvm_put_kvm(kvm); @@ -2223,14 +2249,18 @@ static void
Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
On Monday 17 January 2011 19:54:47 Marcelo Tosatti wrote: On Thu, Jan 06, 2011 at 06:19:44PM +0800, Sheng Yang wrote: Then we can support mask bit operation of assigned devices now. Signed-off-by: Sheng Yang sh...@linux.intel.com --- arch/x86/kvm/Makefile|2 +- arch/x86/kvm/x86.c |8 +- include/linux/kvm.h | 21 include/linux/kvm_host.h | 25 virt/kvm/assigned-dev.c | 44 +++ virt/kvm/kvm_main.c | 38 ++- virt/kvm/msix_mmio.c | 284 ++ virt/kvm/msix_mmio.h | 25 8 files changed, 440 insertions(+), 7 deletions(-) create mode 100644 virt/kvm/msix_mmio.c create mode 100644 virt/kvm/msix_mmio.h diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c index ae72ae6..7444dcd 100644 --- a/virt/kvm/assigned-dev.c +++ b/virt/kvm/assigned-dev.c @@ -18,6 +18,7 @@ #include linux/interrupt.h #include linux/slab.h #include irq.h +#include msix_mmio.h static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, int assigned_dev_id) @@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm *kvm, kvm_deassign_irq(kvm, assigned_dev, assigned_dev-irq_requested_type); } +static void assigned_device_free_msix_mmio(struct kvm *kvm, + struct kvm_assigned_dev_kernel *adev) +{ + struct kvm_msix_mmio mmio; + + mmio.dev_id = adev-assigned_dev_id; + mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV | + KVM_MSIX_MMIO_TYPE_BASE_TABLE; + kvm_free_msix_mmio(kvm, mmio); +} + static void kvm_free_assigned_device(struct kvm *kvm, struct kvm_assigned_dev_kernel *assigned_dev) { kvm_free_assigned_irq(kvm, assigned_dev); + assigned_device_free_msix_mmio(kvm, assigned_dev); + __pci_reset_function(assigned_dev-dev); pci_restore_state(assigned_dev-dev); @@ -785,3 +799,33 @@ out: return r; } +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm, + int assigned_dev_id, int entry, bool mask) +{ + int r = -EFAULT; + struct kvm_assigned_dev_kernel *adev; + int i; + + if (!irqchip_in_kernel(kvm)) + return r; + + mutex_lock(kvm-lock); + adev = 
kvm_find_assigned_dev(kvm-arch.assigned_dev_head, + assigned_dev_id); + if (!adev) + goto out; + + for (i = 0; i adev-entries_nr; i++) + if (adev-host_msix_entries[i].entry == entry) { + if (mask) + disable_irq_nosync( + adev-host_msix_entries[i].vector); + else + enable_irq(adev-host_msix_entries[i].vector); + r = 0; + break; + } Should check if the host irq is registered as MSIX type. +out: + mutex_unlock(kvm-lock); + return r; +} diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b1b6cbb..b7807c8 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -56,6 +56,7 @@ #include coalesced_mmio.h #include async_pf.h +#include msix_mmio.h #define CREATE_TRACE_POINTS #include trace/events/kvm.h @@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm) struct mm_struct *mm = kvm-mm; kvm_arch_sync_events(kvm); + kvm_unregister_msix_mmio_dev(kvm); spin_lock(kvm_lock); list_del(kvm-vm_list); spin_unlock(kvm_lock); @@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp, mutex_unlock(kvm-lock); break; #endif + case KVM_REGISTER_MSIX_MMIO: { + struct kvm_msix_mmio_user mmio_user; + + r = -EFAULT; + if (copy_from_user(mmio_user, argp, sizeof mmio_user)) + goto out; + r = kvm_vm_ioctl_register_msix_mmio(kvm, mmio_user); + break; + } + case KVM_UNREGISTER_MSIX_MMIO: { + struct kvm_msix_mmio_user mmio_user; + + r = -EFAULT; + if (copy_from_user(mmio_user, argp, sizeof mmio_user)) + goto out; + r = kvm_vm_ioctl_unregister_msix_mmio(kvm, mmio_user); + break; + } default: r = kvm_arch_vm_ioctl(filp, ioctl, arg); if (r == -ENOTTY) @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void) return r; } #endif + r = kvm_register_msix_mmio_dev(kvm); + if (r 0) { + kvm_put_kvm(kvm); + return r; + } + r =
Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
On 01/17/2011 02:18 PM, Sheng Yang wrote: + + if (copy_to_user((void __user *)(entry_base + offset), val, len)) + goto out; Instead of copying to/from userspace (which is subject to swapin, unexpected values), you could include the guest written value in a kvm_run structure, along with address. Qemu-kvm would use that to synchronize its copy of the table, on KVM_EXIT_MSIX_ROUTING_UPDATE exit. We want to accelerate MSI-X mask bit accessing, which won't exit to userspace in the most condition. That's the cost we want to optimize. Also it's possible to userspace to read the correct value of MMIO(but mostly userspace can't write to it in order to prevent synchronize issue). It's also good to have the values in just one place; using userspace makes it easy for both the kernel and userspace to see the values (and set them after migration, if/when we extend this to virtio). -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
On 01/06/2011 12:19 PM, Sheng Yang wrote: Then we can support mask bit operation of assigned devices now. +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm, + int assigned_dev_id, int entry, bool mask) +{ + int r = -EFAULT; + struct kvm_assigned_dev_kernel *adev; + int i; + + if (!irqchip_in_kernel(kvm)) + return r; + + mutex_lock(kvm-lock); + adev = kvm_find_assigned_dev(kvm-arch.assigned_dev_head, + assigned_dev_id); + if (!adev) + goto out; + + for (i = 0; i adev-entries_nr; i++) + if (adev-host_msix_entries[i].entry == entry) { + if (mask) + disable_irq_nosync( + adev-host_msix_entries[i].vector); Is it okay to call disable_irq_nosync() here? IIRC we don't check the mask bit on irq delivery, so we may forward an interrupt to the guest after the mask bit was set. What does pci say about the mask bit? when does it take effect? Another question is whether disable_irq_nosync() actually programs the device mask bit, or not. If it does, then it's slow, and it may be better to leave interrupts enabled but have an internal pending bit. If it doesn't program the mask bit, it's fine. 
+ else + enable_irq(adev-host_msix_entries[i].vector); + r = 0; + break; + } +out: + mutex_unlock(kvm-lock); + return r; +} + +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, + void *val) +{ + struct kvm_msix_mmio_dev *mmio_dev = + container_of(this, struct kvm_msix_mmio_dev, table_dev); + struct kvm_msix_mmio *mmio; + int idx, ret = 0, entry, offset, r; + + mutex_lock(mmio_dev-lock); + idx = get_mmio_table_index(mmio_dev, addr, len); + if (idx 0) { + ret = -EOPNOTSUPP; + goto out; + } + if ((addr 0x3) || (len != 4 len != 8)) + goto out; + + offset = addr 0xf; + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL len == 8) + goto out; + + mmio =mmio_dev-mmio[idx]; + entry = (addr - mmio-table_base_addr) / PCI_MSIX_ENTRY_SIZE; + r = copy_from_user(val, (void __user *)(mmio-table_base_va + + entry * PCI_MSIX_ENTRY_SIZE + offset), len); + if (r) + goto out; and return ret == 0? +out: + mutex_unlock(mmio_dev-lock); + return ret; +} + +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr, + int len, const void *val) +{ + struct kvm_msix_mmio_dev *mmio_dev = + container_of(this, struct kvm_msix_mmio_dev, table_dev); + struct kvm_msix_mmio *mmio; + int idx, entry, offset, ret = 0, r = 0; + gpa_t entry_base; + u32 old_ctrl, new_ctrl; + u32 *ctrl_pos; + + mutex_lock(mmio_dev-lock); + idx = get_mmio_table_index(mmio_dev, addr, len); + if (idx 0) { + ret = -EOPNOTSUPP; + goto out; + } + if ((addr 0x3) || (len != 4 len != 8)) + goto out; + + offset = addr 0xF; + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL len == 8) + goto out; + + mmio =mmio_dev-mmio[idx]; + entry = (addr - mmio-table_base_addr) / PCI_MSIX_ENTRY_SIZE; + entry_base = mmio-table_base_va + entry * PCI_MSIX_ENTRY_SIZE; + ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL); + + if (get_user(old_ctrl, ctrl_pos)) + goto out; + + /* No allow writing to other fields when entry is unmasked */ + if (!(old_ctrl PCI_MSIX_ENTRY_CTRL_MASKBIT) + offset != 
PCI_MSIX_ENTRY_VECTOR_CTRL) + goto out; + + if (copy_to_user((void __user *)(entry_base + offset), val, len)) + goto out; + + if (get_user(new_ctrl, ctrl_pos)) + goto out; here, too. + + if ((offset PCI_MSIX_ENTRY_VECTOR_CTRL len == 4) || + (offset PCI_MSIX_ENTRY_DATA len == 8)) + ret = -ENOTSYNC; goto out? + if (old_ctrl == new_ctrl) + goto out; + if (!(old_ctrl PCI_MSIX_ENTRY_CTRL_MASKBIT) + (new_ctrl PCI_MSIX_ENTRY_CTRL_MASKBIT)) + r = update_msix_mask_bit(mmio_dev-kvm, mmio, entry, 1); + else if ((old_ctrl PCI_MSIX_ENTRY_CTRL_MASKBIT) + !(new_ctrl PCI_MSIX_ENTRY_CTRL_MASKBIT)) + r = update_msix_mask_bit(mmio_dev-kvm, mmio, entry, 0); + if (r || ret) + ret = -ENOTSYNC; +out: + mutex_unlock(mmio_dev-lock); + return ret; +} + -- error compiling committee.c: too many arguments to
Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
On Mon, Jan 17, 2011 at 08:18:22PM +0800, Sheng Yang wrote: + goto out; + + mmio = mmio_dev-mmio[idx]; + entry = (addr - mmio-table_base_addr) / PCI_MSIX_ENTRY_SIZE; + entry_base = mmio-table_base_va + entry * PCI_MSIX_ENTRY_SIZE; + ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL); + + if (get_user(old_ctrl, ctrl_pos)) + goto out; + + /* No allow writing to other fields when entry is unmasked */ + if (!(old_ctrl PCI_MSIX_ENTRY_CTRL_MASKBIT) + offset != PCI_MSIX_ENTRY_VECTOR_CTRL) + goto out; + + if (copy_to_user((void __user *)(entry_base + offset), val, len)) + goto out; Instead of copying to/from userspace (which is subject to swapin, unexpected values), you could include the guest written value in a kvm_run structure, along with address. Qemu-kvm would use that to synchronize its copy of the table, on KVM_EXIT_MSIX_ROUTING_UPDATE exit. We want to acelerate MSI-X mask bit accessing, which won't exit to userspace in the most condition. That's the cost we want to optimize. Also it's possible to userspace to read the correct value of MMIO(but mostly userspace can't write to it in order to prevent synchronize issue). Yes, i meant exit to userspace only when necessary, but instead of copying directly everytime record the value the guest has written in kvm_run and exit with KVM_EXIT_MSIX_ROUTING_UPDATE. + if (get_user(new_ctrl, ctrl_pos)) + goto out; + + if ((offset PCI_MSIX_ENTRY_VECTOR_CTRL len == 4) || + (offset PCI_MSIX_ENTRY_DATA len == 8)) + ret = -ENOTSYNC; + if (old_ctrl == new_ctrl) + goto out; + if (!(old_ctrl PCI_MSIX_ENTRY_CTRL_MASKBIT) + (new_ctrl PCI_MSIX_ENTRY_CTRL_MASKBIT)) + r = update_msix_mask_bit(mmio_dev-kvm, mmio, entry, 1); + else if ((old_ctrl PCI_MSIX_ENTRY_CTRL_MASKBIT) + !(new_ctrl PCI_MSIX_ENTRY_CTRL_MASKBIT)) + r = update_msix_mask_bit(mmio_dev-kvm, mmio, entry, 0); Then you rely on the kernel copy of the values to enable/disable irq. Yes, they are guest owned assigned IRQs. Any potential issue? 
Nothing prevents the kernel from enabling or disabling irq multiple times with this code (what prevents it is a qemu-kvm that behaves as expected). This is not very good. Perhaps the guest can only harm itself with that, but i'm not sure. And also if an msix table page is swapped out guest will hang. + return r; +} This is not consistent with registration, where there are no checks regarding validity of assigned device id. So why is it necessary? I am not quite understand. We need to free mmio anyway, otherwise it would result in wrong mmio interception... If you return -EOPNOTSUPP in case kvm_find_assigned_dev fails in the read/write paths, there is no need to free anything. There is a lock ordering problem BTW: - read/write handlers: dev-lock - kvm-lock - vm_ioctl_deassign_device - kvm_free_msix_mmio: kvm-lock - dev-lock Good catch! Would fix it(and other comments of course). -- regards Yang, Sheng + +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm, + struct kvm_msix_mmio_user *mmio_user) +{ + struct kvm_msix_mmio mmio; + + mmio.dev_id = mmio_user-dev_id; + mmio.type = mmio_user-type; + + return kvm_free_msix_mmio(kvm, mmio); +} -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
On Mon, Jan 17, 2011 at 02:18:43PM +0200, Avi Kivity wrote: On 01/17/2011 02:18 PM, Sheng Yang wrote: + + if (copy_to_user((void __user *)(entry_base + offset), val, len)) + goto out; Instead of copying to/from userspace (which is subject to swapin, unexpected values), you could include the guest written value in a kvm_run structure, along with address. Qemu-kvm would use that to synchronize its copy of the table, on KVM_EXIT_MSIX_ROUTING_UPDATE exit. We want to accelerate MSI-X mask bit accessing, which won't exit to userspace in the most condition. That's the cost we want to optimize. Also it's possible to userspace to read the correct value of MMIO(but mostly userspace can't write to it in order to prevent synchronize issue). It's also good to have the values in just one place; using userspace makes it easy for both the kernel and userspace to see the values (and set them after migration, if/when we extend this to virtio). Right, that's an advantage, but: - How can userspace ever synchronize with updates by the kernel to the MSI-X entry? - Reading/writing to the userspace area must be done carefully, values must be validated before used. - Swapping issue (minor?). -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
On 01/17/2011 02:48 PM, Marcelo Tosatti wrote: On Mon, Jan 17, 2011 at 02:18:43PM +0200, Avi Kivity wrote: On 01/17/2011 02:18 PM, Sheng Yang wrote: + + if (copy_to_user((void __user *)(entry_base + offset), val, len)) + goto out; Instead of copying to/from userspace (which is subject to swapin, unexpected values), you could include the guest written value in a kvm_run structure, along with address. Qemu-kvm would use that to synchronize its copy of the table, on KVM_EXIT_MSIX_ROUTING_UPDATE exit. We want to acelerate MSI-X mask bit accessing, which won't exit to userspace in the most condition. That's the cost we want to optimize. Also it's possible to userspace to read the correct value of MMIO(but mostly userspace can't write to it in order to prevent synchronize issue). It's also good to have the values in just one place; using userspace makes it easy for both the kernel and userspace to see the values (and set them after migration, if/when we extend this to virtio). Right, thats an advantage, but: - How can userspace ever synchronize with updates by the kernel to the MSI-X entry? What a value is written by the guest, which kvm cannot handle itself (i.e. a change to anything other than the mask bit), we exit with the table and entry ids, so userspace can reread them. - Reading/writing to the userspace area must be done carefully, values must be validated before used. True every time... - Swapping issue (minor?). I don't see the issue... just like any part of qemu that may be swapped out, blocking the vcpu thread. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
On Mon, Jan 17, 2011 at 02:29:44PM +0200, Avi Kivity wrote: On 01/06/2011 12:19 PM, Sheng Yang wrote: Then we can support mask bit operation of assigned devices now. +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm, +int assigned_dev_id, int entry, bool mask) +{ +int r = -EFAULT; +struct kvm_assigned_dev_kernel *adev; +int i; + +if (!irqchip_in_kernel(kvm)) +return r; + +mutex_lock(kvm-lock); +adev = kvm_find_assigned_dev(kvm-arch.assigned_dev_head, + assigned_dev_id); +if (!adev) +goto out; + +for (i = 0; i adev-entries_nr; i++) +if (adev-host_msix_entries[i].entry == entry) { +if (mask) +disable_irq_nosync( +adev-host_msix_entries[i].vector); Is it okay to call disable_irq_nosync() here? IIRC we don't check the mask bit on irq delivery, so we may forward an interrupt to the guest after the mask bit was set. What does pci say about the mask bit? when does it take effect? It is only guaranteed to take effect on the next read. In practice, it takes effect earlier and guests rely on this as an optimization. Another question is whether disable_irq_nosync() actually programs the device mask bit, or not. If it does, then it's slow, and it may be better to leave interrupts enabled but have an internal pending bit. If it doesn't program the mask bit, it's fine. 
+else +enable_irq(adev-host_msix_entries[i].vector); +r = 0; +break; +} +out: +mutex_unlock(kvm-lock); +return r; +} + +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, +void *val) +{ +struct kvm_msix_mmio_dev *mmio_dev = +container_of(this, struct kvm_msix_mmio_dev, table_dev); +struct kvm_msix_mmio *mmio; +int idx, ret = 0, entry, offset, r; + +mutex_lock(mmio_dev-lock); +idx = get_mmio_table_index(mmio_dev, addr, len); +if (idx 0) { +ret = -EOPNOTSUPP; +goto out; +} +if ((addr 0x3) || (len != 4 len != 8)) +goto out; + +offset = addr 0xf; +if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL len == 8) +goto out; + +mmio =mmio_dev-mmio[idx]; +entry = (addr - mmio-table_base_addr) / PCI_MSIX_ENTRY_SIZE; +r = copy_from_user(val, (void __user *)(mmio-table_base_va + +entry * PCI_MSIX_ENTRY_SIZE + offset), len); +if (r) +goto out; and return ret == 0? +out: +mutex_unlock(mmio_dev-lock); +return ret; +} + +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr, +int len, const void *val) +{ +struct kvm_msix_mmio_dev *mmio_dev = +container_of(this, struct kvm_msix_mmio_dev, table_dev); +struct kvm_msix_mmio *mmio; +int idx, entry, offset, ret = 0, r = 0; +gpa_t entry_base; +u32 old_ctrl, new_ctrl; +u32 *ctrl_pos; + +mutex_lock(mmio_dev-lock); +idx = get_mmio_table_index(mmio_dev, addr, len); +if (idx 0) { +ret = -EOPNOTSUPP; +goto out; +} +if ((addr 0x3) || (len != 4 len != 8)) +goto out; + +offset = addr 0xF; +if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL len == 8) +goto out; + +mmio =mmio_dev-mmio[idx]; +entry = (addr - mmio-table_base_addr) / PCI_MSIX_ENTRY_SIZE; +entry_base = mmio-table_base_va + entry * PCI_MSIX_ENTRY_SIZE; +ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL); + +if (get_user(old_ctrl, ctrl_pos)) +goto out; + +/* No allow writing to other fields when entry is unmasked */ +if (!(old_ctrl PCI_MSIX_ENTRY_CTRL_MASKBIT) +offset != PCI_MSIX_ENTRY_VECTOR_CTRL) +goto out; + +if (copy_to_user((void __user 
*)(entry_base + offset), val, len)) +goto out; + +if (get_user(new_ctrl, ctrl_pos)) +goto out; here, too. + +if ((offset PCI_MSIX_ENTRY_VECTOR_CTRL len == 4) || +(offset PCI_MSIX_ENTRY_DATA len == 8)) +ret = -ENOTSYNC; goto out? +if (old_ctrl == new_ctrl) +goto out; +if (!(old_ctrl PCI_MSIX_ENTRY_CTRL_MASKBIT) +(new_ctrl PCI_MSIX_ENTRY_CTRL_MASKBIT)) +r = update_msix_mask_bit(mmio_dev-kvm, mmio, entry, 1); +else if ((old_ctrl PCI_MSIX_ENTRY_CTRL_MASKBIT) +!(new_ctrl PCI_MSIX_ENTRY_CTRL_MASKBIT)) +r = update_msix_mask_bit(mmio_dev-kvm, mmio, entry, 0); +if (r || ret) +ret = -ENOTSYNC; +out: +mutex_unlock(mmio_dev-lock); +return ret; +} + -- error
Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
On 2011-01-17 13:29, Avi Kivity wrote: On 01/06/2011 12:19 PM, Sheng Yang wrote: Then we can support mask bit operation of assigned devices now. +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm, +int assigned_dev_id, int entry, bool mask) +{ +int r = -EFAULT; +struct kvm_assigned_dev_kernel *adev; +int i; + +if (!irqchip_in_kernel(kvm)) +return r; + +mutex_lock(kvm-lock); +adev = kvm_find_assigned_dev(kvm-arch.assigned_dev_head, + assigned_dev_id); +if (!adev) +goto out; + +for (i = 0; i adev-entries_nr; i++) +if (adev-host_msix_entries[i].entry == entry) { +if (mask) +disable_irq_nosync( +adev-host_msix_entries[i].vector); Is it okay to call disable_irq_nosync() here? IIRC we don't check the mask bit on irq delivery, so we may forward an interrupt to the guest after the mask bit was set. What does pci say about the mask bit? when does it take effect? Another question is whether disable_irq_nosync() actually programs the device mask bit, or not. If it does, then it's slow, and it may be better to leave interrupts enabled but have an internal pending bit. If it doesn't program the mask bit, it's fine. disable_irq* works lazily, only disabling the line in software. If an IRQ actually occurs, it is marked pending and the line is left masked on handler return (that's one reason why masking via PCI config space is slower at macro level). Jan -- Siemens AG, Corporate Technology, CT T DE IT 1 Corporate Competence Center Embedded Linux -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
On Mon, Jan 17, 2011 at 02:51:37PM +0200, Avi Kivity wrote: On 01/17/2011 02:48 PM, Marcelo Tosatti wrote: On Mon, Jan 17, 2011 at 02:18:43PM +0200, Avi Kivity wrote: On 01/17/2011 02:18 PM, Sheng Yang wrote: + +if (copy_to_user((void __user *)(entry_base + offset), val, len)) +goto out; Instead of copying to/from userspace (which is subject to swapin, unexpected values), you could include the guest written value in a kvm_run structure, along with address. Qemu-kvm would use that to synchronize its copy of the table, on KVM_EXIT_MSIX_ROUTING_UPDATE exit. We want to acelerate MSI-X mask bit accessing, which won't exit to userspace in the most condition. That's the cost we want to optimize. Also it's possible to userspace to read the correct value of MMIO(but mostly userspace can't write to it in order to prevent synchronize issue). It's also good to have the values in just one place; using userspace makes it easy for both the kernel and userspace to see the values (and set them after migration, if/when we extend this to virtio). Right, thats an advantage, but: - How can userspace ever synchronize with updates by the kernel to the MSI-X entry? What a value is written by the guest, which kvm cannot handle itself (i.e. a change to anything other than the mask bit), we exit with the table and entry ids, so userspace can reread them. OK. But regarding access to the MSI-X entry in userspace, it can only be accessed safely wrt parallel updates by the kernel in the exit handler. Is the exit handler the only location where the MSI-X entry will be read or written to, in userspace? - Reading/writing to the userspace area must be done carefully, values must be validated before used. True every time... - Swapping issue (minor?). I don't see the issue... just like any part of qemu that may be swapped out, blocking the vcpu thread. 
-- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel
On 01/17/2011 05:52 PM, Marcelo Tosatti wrote: When a value is written by the guest, which kvm cannot handle itself (i.e. a change to anything other than the mask bit), we exit with the table and entry ids, so userspace can reread them. OK. But regarding access to the MSI-X entry in userspace, it can only be accessed safely wrt parallel updates by the kernel in the exit handler. Is the exit handler the only location where the MSI-X entry will be read or written to, in userspace? Yes. This should be documented, that the kernel owns the msix area as far as writes are concerned. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/3] KVM: Emulate MSI-X table in kernel
Then we can support mask bit operation of assigned devices now. Signed-off-by: Sheng Yang sh...@linux.intel.com --- arch/x86/kvm/Makefile|2 +- arch/x86/kvm/x86.c |8 +- include/linux/kvm.h | 21 include/linux/kvm_host.h | 25 virt/kvm/assigned-dev.c | 44 +++ virt/kvm/kvm_main.c | 38 ++- virt/kvm/msix_mmio.c | 284 ++ virt/kvm/msix_mmio.h | 25 8 files changed, 440 insertions(+), 7 deletions(-) create mode 100644 virt/kvm/msix_mmio.c create mode 100644 virt/kvm/msix_mmio.h diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index f15501f..3a0d851 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I. kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ coalesced_mmio.o irq_comm.o eventfd.o \ - assigned-dev.o) + assigned-dev.o msix_mmio.o) kvm-$(CONFIG_IOMMU_API)+= $(addprefix ../../../virt/kvm/, iommu.o) kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fa708c9..89bf12c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_X86_ROBUST_SINGLESTEP: case KVM_CAP_XSAVE: case KVM_CAP_ASYNC_PF: + case KVM_CAP_MSIX_MMIO: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -3807,6 +3808,7 @@ static int emulator_write_emulated_onepage(unsigned long addr, struct kvm_vcpu *vcpu) { gpa_t gpa; + int r; gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); @@ -3822,14 +3824,16 @@ static int emulator_write_emulated_onepage(unsigned long addr, mmio: trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); + r = vcpu_mmio_write(vcpu, gpa, bytes, val); /* * Is this MMIO handled locally? */ - if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) + if (!r) return X86EMUL_CONTINUE; vcpu-mmio_needed = 1; - vcpu-run-exit_reason = KVM_EXIT_MMIO; + vcpu-run-exit_reason = (r == -ENOTSYNC) ? 
+ KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO; vcpu-run-mmio.phys_addr = vcpu-mmio_phys_addr = gpa; vcpu-run-mmio.len = vcpu-mmio_size = bytes; vcpu-run-mmio.is_write = vcpu-mmio_is_write = 1; diff --git a/include/linux/kvm.h b/include/linux/kvm.h index ea2dc1a..ad9df4b 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -161,6 +161,7 @@ struct kvm_pit_config { #define KVM_EXIT_NMI 16 #define KVM_EXIT_INTERNAL_ERROR 17 #define KVM_EXIT_OSI 18 +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19 /* For KVM_EXIT_INTERNAL_ERROR */ #define KVM_INTERNAL_ERROR_EMULATION 1 @@ -541,6 +542,7 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_PPC_GET_PVINFO 57 #define KVM_CAP_PPC_IRQ_LEVEL 58 #define KVM_CAP_ASYNC_PF 59 +#define KVM_CAP_MSIX_MMIO 60 #ifdef KVM_CAP_IRQ_ROUTING @@ -672,6 +674,9 @@ struct kvm_clock_data { #define KVM_XEN_HVM_CONFIG_IOW(KVMIO, 0x7a, struct kvm_xen_hvm_config) #define KVM_SET_CLOCK _IOW(KVMIO, 0x7b, struct kvm_clock_data) #define KVM_GET_CLOCK _IOR(KVMIO, 0x7c, struct kvm_clock_data) +/* Available with KVM_CAP_MSIX_MMIO */ +#define KVM_REGISTER_MSIX_MMIO_IOW(KVMIO, 0x7d, struct kvm_msix_mmio_user) +#define KVM_UNREGISTER_MSIX_MMIO _IOW(KVMIO, 0x7e, struct kvm_msix_mmio_user) /* Available with KVM_CAP_PIT_STATE2 */ #define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct kvm_pit_state2) #define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2) @@ -795,4 +800,20 @@ struct kvm_assigned_msix_entry { __u16 padding[3]; }; +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV(1 0) + +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE (1 8) + +#define KVM_MSIX_MMIO_TYPE_DEV_MASK0x00ff +#define KVM_MSIX_MMIO_TYPE_BASE_MASK 0xff00 +struct kvm_msix_mmio_user { + __u32 dev_id; + __u16 type; + __u16 max_entries_nr; + __u64 base_addr; + __u64 base_va; + __u64 flags; + __u64 reserved[4]; +}; + #endif /* __LINUX_KVM_H */ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7d313e0..c10670c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -233,6 +233,27 @@ struct 
kvm_memslots { KVM_PRIVATE_MEM_SLOTS]; }; +#define KVM_MSIX_MMIO_MAX32 + +struct kvm_msix_mmio { + u32 dev_id; + u16 type; + u16 max_entries_nr; + u64 flags; + gpa_t table_base_addr; + hva_t table_base_va; + gpa_t pba_base_addr; + hva_t