[PATCH] kvm: qemu: allow kvm.h to include config.h
From: Avi Kivity <a...@redhat.com> Signed-off-by: Avi Kivity <a...@redhat.com> diff --git a/qemu/kvm.h b/qemu/kvm.h index eeed3dc..05a13ee 100644 --- a/qemu/kvm.h +++ b/qemu/kvm.h @@ -14,10 +14,10 @@ #ifndef QEMU_KVM_H #define QEMU_KVM_H -#ifdef KVM_UPSTREAM - #include "config.h" +#ifdef KVM_UPSTREAM + #ifdef CONFIG_KVM extern int kvm_allowed; -- To unsubscribe from this list: send the line "unsubscribe kvm-commits" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] kvm: qemu: Fix KVM includes in helper.c
From: Alexander Graf ag...@suse.de By default target-i386/helper.c does not include config.h, so no code in there knows if we're enabling KVM or not. This breaks the nested activation, as that's depending on the config options. This patch fixes compilation broken thanks to my nested SVM patches. Signed-off-by: Alexander Graf ag...@suse.de Signed-off-by: Avi Kivity a...@redhat.com diff --git a/qemu/qemu-kvm.h b/qemu/qemu-kvm.h index bd8a9e8..aec5286 100644 --- a/qemu/qemu-kvm.h +++ b/qemu/qemu-kvm.h @@ -161,6 +161,7 @@ int qemu_kvm_has_sync_mmu(void); void kvm_init_vcpu(CPUState *env); #else #define kvm_enabled() (0) +#define kvm_nested 0 #define qemu_kvm_irqchip_in_kernel() (0) #define qemu_kvm_pit_in_kernel() (0) #define kvm_has_sync_mmu() (0) -- To unsubscribe from this list: send the line unsubscribe kvm-commits in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] kvm: qemu: regenerate bios for smp boot hang fix
From: Avi Kivity <a...@redhat.com> Signed-off-by: Avi Kivity <a...@redhat.com> diff --git a/qemu/pc-bios/bios.bin b/qemu/pc-bios/bios.bin index 35fffec..768d8f0 100644 Binary files a/qemu/pc-bios/bios.bin and b/qemu/pc-bios/bios.bin differ -- To unsubscribe from this list: send the line "unsubscribe kvm-commits" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] kvm: bios: prevent compiler from caching cpu count while starting up smp
From: Avi Kivity <a...@redhat.com> Other cpus are updating the count in parallel; if we get bad timing we might not notice them starting up and hang. Signed-off-by: Avi Kivity <a...@redhat.com> diff --git a/bios/rombios32.c b/bios/rombios32.c index cceaede..321563d 100755 --- a/bios/rombios32.c +++ b/bios/rombios32.c @@ -633,7 +633,7 @@ void smp_probe(void) #ifndef BX_QEMU delay_ms(10); #else - while (cmos_readb(0x5f) + 1 != smp_cpus) + while (cmos_readb(0x5f) + 1 != readw(&smp_cpus)) ; #endif } -- To unsubscribe from this list: send the line "unsubscribe kvm-commits" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Revert KVM: SVM: Accelerate nested SVM by emulating parts of GIF=0
From: Avi Kivity a...@redhat.com This reverts commit 037b6e2531d844994cf79bd4190853427c6af2ac (and 7b8052aecd9c533661493d1140cbec0e1ab311d3 as well). It causes hangs. Signed-off-by: Avi Kivity a...@redhat.com diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 6577934..f53be7e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -78,11 +78,6 @@ static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb, void *arg2, void *opaque); static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code); -static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run); -static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run); -static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run); -static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run); - static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) { @@ -1496,50 +1491,6 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override) nested_svm_exit_handled_real); } -static int nested_svm_emulate(struct vcpu_svm *svm, struct kvm_run *kvm_run) -{ - int er; - u32 opcode = 0; - unsigned long rip; - unsigned long rip_linear; - - svm-vmcb-save.rax = svm-vcpu.arch.regs[VCPU_REGS_RAX]; - svm-vmcb-save.rsp = svm-vcpu.arch.regs[VCPU_REGS_RSP]; - svm-vmcb-save.rip = svm-vcpu.arch.regs[VCPU_REGS_RIP]; - rip = svm-vcpu.arch.regs[VCPU_REGS_RIP]; - rip_linear = rip + svm_seg(svm-vcpu, VCPU_SREG_CS)-base; - - er = emulator_read_std(rip_linear, (void *)opcode, 3, svm-vcpu); - if (er != X86EMUL_CONTINUE) - return er; - er = EMULATE_FAIL; - - switch (opcode) { - case 0xda010f: - vmload_interception(svm, kvm_run); - er = EMULATE_DONE; - break; - case 0xd8010f: - vmrun_interception(svm, kvm_run); - er = EMULATE_DONE; - break; - case 0xdb010f: - vmsave_interception(svm, kvm_run); - er = EMULATE_DONE; - break; - case 0xdc010f: - stgi_interception(svm, kvm_run); - er = EMULATE_DONE; - 
break; - default: - nsvm_printk(NSVM: Opcode %x unknown\n, opcode); - } - - nsvm_printk(NSVM: svm emul at 0x%lx - %d\n, rip, er); - - return er; -} - static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1, void *arg2, void *opaque) { @@ -1635,9 +1586,6 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) kvm_mmu_reset_context(svm-vcpu); kvm_mmu_load(svm-vcpu); - /* KVM calls vmsave after vmrun, so let's run it now if we can */ - nested_svm_emulate(svm, NULL); - return 0; } @@ -1848,8 +1796,6 @@ static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - int loopcount = 0; - if (nested_svm_check_permissions(svm)) return 1; @@ -1862,23 +1808,6 @@ static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) svm_clear_vintr(svm); svm-vmcb-control.int_ctl = ~V_IRQ_MASK; - /* Let's try to emulate as many instructions as possible in GIF=0 */ - - while (++loopcount 100) { - int er; - - er = emulate_instruction(svm-vcpu, kvm_run, 0, 0, 0); - nsvm_printk(NSVM: emulating at 0x%lx - %d\n, svm-vcpu.arch.regs[VCPU_REGS_RIP], er); - - /* So we can now emulate the SVM instructions that most probably - occur at the end of the codepath */ - if (er != EMULATE_DONE) { - while (true) - if (nested_svm_emulate(svm, kvm_run) == EMULATE_FAIL) - break; - break; - } - } return 1; } -- To unsubscribe from this list: send the line unsubscribe kvm-commits in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] kvm: external module: fix build w/ --with-patched-kernel on Ubuntu 8.10
From: Nolan no...@sigbus.net And presumably any other distribution that puts only symlinks in /lib/modules/kernel/build/... Signed-off-by: Nolan Leake no...@sigbus.net Signed-off-by: Avi Kivity a...@redhat.com diff --git a/kernel/Makefile b/kernel/Makefile index 8315e3d..6bf474b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -65,12 +65,12 @@ headers-new = $(LINUX)/arch/$(ARCH_DIR)/include/asm/./kvm*.h \ header-sync: rm -rf $T - rsync -R \ + rsync -R -L \ $(LINUX)/./include/linux/kvm*.h \ $(if $(wildcard $(headers-old)), $(headers-old)) \ $T/ $(if $(wildcard $(headers-new)), \ - rsync -R \ + rsync -R -L \ $(wildcard $(headers-new)) \ $T/include/asm-$(ARCH_DIR)/) -- To unsubscribe from this list: send the line unsubscribe kvm-commits in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] kvm: testsuite: fix smptest.flat makefile rules
From: Avi Kivity a...@redhat.com smptest.flat was not getting built Signed-off-by: Avi Kivity a...@redhat.com diff --git a/user/config-x86-common.mak b/user/config-x86-common.mak index edbf6e4..315091a 100644 --- a/user/config-x86-common.mak +++ b/user/config-x86-common.mak @@ -20,7 +20,7 @@ FLATLIBS = test/lib/libcflat.a $(libgcc) tests-common = $(TEST_DIR)/bootstrap \ $(TEST_DIR)/vmexit.flat $(TEST_DIR)/tsc.flat \ - $(TEST_DIR)/smp.flat $(TEST_DIR)/port80.flat \ + $(TEST_DIR)/smptest.flat $(TEST_DIR)/port80.flat \ $(TEST_DIR)/realmode.flat test_cases: $(tests-common) $(tests) @@ -43,7 +43,7 @@ $(TEST_DIR)/vmexit.flat: $(cstart.o) $(TEST_DIR)/vmexit.o $(TEST_DIR)/test32.flat: $(TEST_DIR)/test32.o -$(TEST_DIR)/smp.flat: $(cstart.o) $(TEST_DIR)/smptest.o +$(TEST_DIR)/smptest.flat: $(cstart.o) $(TEST_DIR)/smptest.o $(TEST_DIR)/emulator.flat: $(cstart.o) $(TEST_DIR)/vm.o $(TEST_DIR)/print.o -- To unsubscribe from this list: send the line unsubscribe kvm-commits in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] kvm: testsuite: avoid clobbering ebx
From: Avi Kivity a...@redhat.com Some compilers don't enjoy that. Signed-off-by: Avi Kivity a...@redhat.com diff --git a/user/test/x86/vmexit.c b/user/test/x86/vmexit.c index f6341cc..0662f34 100644 --- a/user/test/x86/vmexit.c +++ b/user/test/x86/vmexit.c @@ -18,6 +18,12 @@ static inline unsigned long long rdtsc() #define N (1 22) +#ifdef __x86_64__ +# define R r +#else +# define R e +#endif + int main() { int i; @@ -25,7 +31,8 @@ int main() t1 = rdtsc(); for (i = 0; i N; ++i) - asm volatile (cpuid : : : eax, ebx, ecx, edx); + asm volatile (push %%Rbx; cpuid; pop %%Rbx + : : : eax, ecx, edx); t2 = rdtsc(); printf(vmexit latency: %d\n, (int)((t2 - t1) / N)); return 0; -- To unsubscribe from this list: send the line unsubscribe kvm-commits in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] KVM: Cosmetic commit to reconcile upstream and local changes
From: Avi Kivity a...@redhat.com Signed-off-by: Avi Kivity a...@redhat.com diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index dfc6442..36d2a50 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -86,12 +86,9 @@ extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, extern int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs); extern int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt); - extern int kvmppc_booke_init(void); extern void kvmppc_booke_exit(void); extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu); -extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu); - #endif /* __POWERPC_KVM_PPC_H__ */ diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index a8b9304..9050491 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -389,7 +389,7 @@ static void emergency_vmx_disable_all(void) * We can't take any locks and we may be on an inconsistent * state, so we use NMIs as IPIs to tell the other CPUs to disable * VMX and halt. -* +* * For safety, we will avoid running the nmi_shootdown_cpus() * stuff unnecessarily, but we don't have a way to check * if other CPUs have VMX enabled. So we will call it only if the -- To unsubscribe from this list: send the line unsubscribe kvm-commits in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/8] KVM: Use kvm_free_assigned_irq() for free irq
On Tuesday 23 December 2008 23:18:43 Marcelo Tosatti wrote: Hi Sheng, On Tue, Dec 23, 2008 at 04:00:25PM +0800, Sheng Yang wrote: Which is more convenient... Signed-off-by: Sheng Yang sh...@linux.intel.com --- virt/kvm/kvm_main.c | 10 ++ 1 files changed, 2 insertions(+), 8 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ffd261d..cd84b3e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -284,11 +284,7 @@ static int assigned_device_update_intx(struct kvm *kvm, return 0; if (irqchip_in_kernel(kvm)) { - if (!msi2intx - adev-irq_requested_type KVM_ASSIGNED_DEV_HOST_MSI) { - free_irq(adev-host_irq, (void *)kvm); - pci_disable_msi(adev-dev); - } + kvm_free_assigned_irq(kvm, adev); if (!capable(CAP_SYS_RAWIO)) return -EPERM; @@ -339,9 +335,7 @@ static int assigned_device_update_msi(struct kvm *kvm, if (irqchip_in_kernel(kvm)) { if (!msi2intx) { - if (adev-irq_requested_type - KVM_ASSIGNED_DEV_HOST_INTX) - free_irq(adev-host_irq, (void *)adev); + kvm_free_assigned_irq(kvm, adev); r = pci_enable_msi(adev-dev); if (r) Regarding kvm_free_assigned_irq and assigned_device_update_msi/update_intx: if (cancel_work_sync(assigned_dev-interrupt_work)) /* We had pending work. That means we will have to take * care of kvm_put_kvm. */ kvm_put_kvm(kvm); free_irq(assigned_dev-host_irq, (void *)assigned_dev); What prevents the host IRQ from being triggered between kvm_put_kvm and free_irq? Also, if the kvm_put_kvm(kvm) from kvm_assigned_dev_interrupt_work_handler happens to be the last one, can't this happen: - kvm_assigned_dev_interrupt_work_handler - kvm_put_kvm - kvm_destroy_vm - kvm_arch_destroy_vm - kvm_free_all_assigned_devices - kvm_free_assigned_device - kvm_free_assigned_irq - cancel_work_sync(assigned_dev-interrupt_work) deadlock. Nice catch! I've updated the patchset to address this, take a look? 
:) -- regards Yang, Sheng -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 09/15] KVM: Split IOAPIC structure
Prepared for reuse ioapic_redir_entry for MSI. Signed-off-by: Sheng Yang sh...@linux.intel.com --- include/linux/kvm_types.h | 17 + virt/kvm/ioapic.c |6 +++--- virt/kvm/ioapic.h | 17 + 3 files changed, 21 insertions(+), 19 deletions(-) diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 9b6f395..f07de1a 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -53,4 +53,21 @@ struct kvm_pio_request { int rep; }; +union kvm_ioapic_redirect_entry { + u64 bits; + struct { + u8 vector; + u8 delivery_mode:3; + u8 dest_mode:1; + u8 delivery_status:1; + u8 polarity:1; + u8 remote_irr:1; + u8 trig_mode:1; + u8 mask:1; + u8 reserve:7; + u8 reserved[4]; + u8 dest_id; + } fields; +}; + #endif /* __KVM_TYPES_H__ */ diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 23b81cf..ebb2ab5 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -85,7 +85,7 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic, static void ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx) { - union ioapic_redir_entry *pent; + union kvm_ioapic_redirect_entry *pent; pent = ioapic-redirtbl[idx]; @@ -272,7 +272,7 @@ void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) { u32 old_irr = ioapic-irr; u32 mask = 1 irq; - union ioapic_redir_entry entry; + union kvm_ioapic_redirect_entry entry; if (irq = 0 irq IOAPIC_NUM_PINS) { entry = ioapic-redirtbl[irq]; @@ -291,7 +291,7 @@ void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level) static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int gsi, int trigger_mode) { - union ioapic_redir_entry *ent; + union kvm_ioapic_redirect_entry *ent; ent = ioapic-redirtbl[gsi]; diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h index 49c9581..ee5b0bd 100644 --- a/virt/kvm/ioapic.h +++ b/virt/kvm/ioapic.h @@ -40,22 +40,7 @@ struct kvm_ioapic { u32 id; u32 irr; u32 pad; - union ioapic_redir_entry { - u64 bits; - struct { - u8 vector; - u8 delivery_mode:3; - u8 
dest_mode:1; - u8 delivery_status:1; - u8 polarity:1; - u8 remote_irr:1; - u8 trig_mode:1; - u8 mask:1; - u8 reserve:7; - u8 reserved[4]; - u8 dest_id; - } fields; - } redirtbl[IOAPIC_NUM_PINS]; + union kvm_ioapic_redirect_entry redirtbl[IOAPIC_NUM_PINS]; struct kvm_io_device dev; struct kvm *kvm; void (*ack_notifier)(void *opaque, int irq); -- 1.5.4.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 02/15] KVM: Use kvm_free_assigned_irq() for free irq
Which is more convenient... Signed-off-by: Sheng Yang sh...@linux.intel.com --- virt/kvm/kvm_main.c | 10 ++ 1 files changed, 2 insertions(+), 8 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ffd261d..cd84b3e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -284,11 +284,7 @@ static int assigned_device_update_intx(struct kvm *kvm, return 0; if (irqchip_in_kernel(kvm)) { - if (!msi2intx - adev-irq_requested_type KVM_ASSIGNED_DEV_HOST_MSI) { - free_irq(adev-host_irq, (void *)kvm); - pci_disable_msi(adev-dev); - } + kvm_free_assigned_irq(kvm, adev); if (!capable(CAP_SYS_RAWIO)) return -EPERM; @@ -339,9 +335,7 @@ static int assigned_device_update_msi(struct kvm *kvm, if (irqchip_in_kernel(kvm)) { if (!msi2intx) { - if (adev-irq_requested_type - KVM_ASSIGNED_DEV_HOST_INTX) - free_irq(adev-host_irq, (void *)adev); + kvm_free_assigned_irq(kvm, adev); r = pci_enable_msi(adev-dev); if (r) -- 1.5.4.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 01/15] KVM: Add MSI_ACTION flag for assigned irq
For MSI disable feature later. Notice I changed ABI here, but due to no userspace patch, I think it's OK. Signed-off-by: Sheng Yang sh...@linux.intel.com --- include/linux/kvm.h |3 ++- 1 files changed, 2 insertions(+), 1 deletions(-) diff --git a/include/linux/kvm.h b/include/linux/kvm.h index ef7f98e..5b965f6 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -544,6 +544,7 @@ struct kvm_assigned_irq { #define KVM_DEV_ASSIGN_ENABLE_IOMMU(1 0) -#define KVM_DEV_IRQ_ASSIGN_ENABLE_MSI (1 0) +#define KVM_DEV_IRQ_ASSIGN_MSI_ACTION (1 0) +#define KVM_DEV_IRQ_ASSIGN_ENABLE_MSI (1 1) #endif -- 1.5.4.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 03/15] KVM: Add support to disable MSI for assigned device
MSI is always enabled by default for msi2intx=1. But if msi2intx=0, we have to disable MSI if guest require to do so. The patch also discard unnecessary msi2intx judgment if guest want to update MSI state. Signed-off-by: Sheng Yang sh...@linux.intel.com --- virt/kvm/kvm_main.c | 12 ++-- 1 files changed, 10 insertions(+), 2 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index cd84b3e..111738b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -328,6 +328,15 @@ static int assigned_device_update_msi(struct kvm *kvm, adev-irq_requested_type = ~KVM_ASSIGNED_DEV_GUEST_MSI; adev-guest_irq = airq-guest_irq; adev-ack_notifier.gsi = airq-guest_irq; + } else { + /* +* Guest require to disable device MSI, we disable MSI and +* re-enable INTx by default again. Notice it's only for +* non-msi2intx. +*/ + kvm_free_assigned_irq(kvm, adev); + assigned_device_update_intx(kvm, adev, airq); + return 0; } if (adev-irq_requested_type KVM_ASSIGNED_DEV_HOST_MSI) @@ -399,8 +408,7 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, } } - if ((!msi2intx -(assigned_irq-flags KVM_DEV_IRQ_ASSIGN_ENABLE_MSI)) || + if ((assigned_irq-flags KVM_DEV_IRQ_ASSIGN_MSI_ACTION) || (msi2intx match-dev-msi_enabled)) { #ifdef CONFIG_X86 r = assigned_device_update_msi(kvm, match, assigned_irq); -- 1.5.4.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 14/15] KVM: Using kfifo for irq recording
For MSI-X, we have to deal with multiply IRQ with same IRQ handler, so it's necessary to record the IRQ that trigger the IRQ handler. And this one is also useful for fixing kvm_free_assigned_irq(). Signed-off-by: Sheng Yang sh...@linux.intel.com --- include/linux/kvm_host.h |4 virt/kvm/kvm_main.c | 30 +++--- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index fbf102c..84b11d5 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -17,6 +17,7 @@ #include linux/preempt.h #include linux/marker.h #include linux/msi.h +#include linux/kfifo.h #include asm/signal.h #include linux/kvm.h @@ -313,6 +314,9 @@ struct kvm_assigned_dev_kernel { int host_irq; bool host_irq_disabled; int guest_irq; +#define KVM_ASSIGNED_DEV_IRQ_FIFO_LEN 0x100 + struct kfifo *irq_fifo; + spinlock_t irq_fifo_lock; #define KVM_ASSIGNED_DEV_GUEST_INTX(1 0) #define KVM_ASSIGNED_DEV_GUEST_MSI (1 1) #define KVM_ASSIGNED_DEV_HOST_INTX (1 8) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a51e630..1863942 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -99,6 +99,8 @@ static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *h static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) { struct kvm_assigned_dev_kernel *assigned_dev; + int irq; + u32 gsi; assigned_dev = container_of(work, struct kvm_assigned_dev_kernel, interrupt_work); @@ -109,14 +111,22 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) */ mutex_lock(assigned_dev-kvm-lock); - kvm_set_irq(assigned_dev-kvm, assigned_dev-irq_source_id, - assigned_dev-guest_irq, 1); +handle_irq: + kfifo_get(assigned_dev-irq_fifo, + (unsigned char *)irq, sizeof(int)); + + gsi = assigned_dev-guest_irq; + + kvm_set_irq(assigned_dev-kvm, assigned_dev-irq_source_id, gsi, 1); if (assigned_dev-irq_requested_type KVM_ASSIGNED_DEV_GUEST_MSI) { enable_irq(assigned_dev-host_irq); 
assigned_dev-host_irq_disabled = false; } + if (kfifo_len(assigned_dev-irq_fifo) != 0) + goto handle_irq; + mutex_unlock(assigned_dev-kvm-lock); kvm_put_kvm(assigned_dev-kvm); } @@ -128,6 +138,9 @@ static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) kvm_get_kvm(assigned_dev-kvm); + kfifo_put(assigned_dev-irq_fifo, + (unsigned char *)irq, sizeof(int)); + schedule_work(assigned_dev-interrupt_work); disable_irq_nosync(irq); @@ -201,6 +214,7 @@ static void kvm_free_assigned_device(struct kvm *kvm, pci_dev_put(assigned_dev-dev); list_del(assigned_dev-list); + kfifo_free(assigned_dev-irq_fifo); kfree(assigned_dev); } @@ -448,15 +462,25 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, list_add(match-list, kvm-arch.assigned_dev_head); + spin_lock_init(match-irq_fifo_lock); + match-irq_fifo = kfifo_alloc(sizeof(unsigned char) * + KVM_ASSIGNED_DEV_IRQ_FIFO_LEN, + GFP_KERNEL | __GFP_ZERO, + match-irq_fifo_lock); + if (!match-irq_fifo) + goto out_list_del; + if (assigned_dev-flags KVM_DEV_ASSIGN_ENABLE_IOMMU) { r = kvm_iommu_map_guest(kvm, match); if (r) - goto out_list_del; + goto out_fifo_del; } out: mutex_unlock(kvm-lock); return r; +out_fifo_del: + kfifo_free(match-irq_fifo); out_list_del: list_del(match-list); pci_release_regions(dev); -- 1.5.4.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 07/15] KVM: Using ioapic_irqchip() macro for kvm_set_irq
Signed-off-by: Sheng Yang sh...@linux.intel.com --- virt/kvm/irq_comm.c |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index abfab46..47243ef 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -39,7 +39,7 @@ void kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level) * IOAPIC. So set the bit in both. The guest will ignore * writes to the unused one. */ - kvm_ioapic_set_irq(kvm-arch.vioapic, irq, !!(*irq_state)); + kvm_ioapic_set_irq(ioapic_irqchip(kvm), irq, !!(*irq_state)); #ifdef CONFIG_X86 kvm_pic_set_irq(pic_irqchip(kvm), irq, !!(*irq_state)); #endif -- 1.5.4.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 12/15] KVM: Update intr delivery func to accept unsigned long* bitmap
Would be used with bit ops, and would be easily extended if KVM_MAX_VCPUS is increased. Signed-off-by: Sheng Yang sh...@linux.intel.com --- arch/x86/kvm/lapic.c |8 include/linux/kvm_host.h |2 +- virt/kvm/ioapic.c|4 ++-- virt/kvm/ioapic.h|4 ++-- virt/kvm/irq_comm.c |6 +++--- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index afac68c..c1e4935 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -403,7 +403,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, } static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector, - unsigned long bitmap) + unsigned long *bitmap) { int last; int next; @@ -415,7 +415,7 @@ static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector, do { if (++next == KVM_MAX_VCPUS) next = 0; - if (kvm-vcpus[next] == NULL || !test_bit(next, bitmap)) + if (kvm-vcpus[next] == NULL || !test_bit(next, bitmap)) continue; apic = kvm-vcpus[next]-arch.apic; if (apic apic_enabled(apic)) @@ -431,7 +431,7 @@ static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector, } struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, - unsigned long bitmap) + unsigned long *bitmap) { struct kvm_lapic *apic; @@ -502,7 +502,7 @@ static void apic_send_ipi(struct kvm_lapic *apic) } if (delivery_mode == APIC_DM_LOWEST) { - target = kvm_get_lowest_prio_vcpu(vcpu-kvm, vector, lpr_map); + target = kvm_get_lowest_prio_vcpu(vcpu-kvm, vector, lpr_map); if (target != NULL) __apic_accept_irq(target-arch.apic, delivery_mode, vector, level, trig_mode); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4f92317..fbf102c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -332,7 +332,7 @@ struct kvm_gsi_msg { void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic, union kvm_ioapic_redirect_entry *entry, - u32 *deliver_bitmask); + unsigned long *deliver_bitmask); void kvm_set_irq(struct kvm 
*kvm, int irq_source_id, u32 gsi, int level); void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi); void kvm_register_irq_ack_notifier(struct kvm *kvm, diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index ebd5ba6..164a746 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -154,7 +154,7 @@ static void ioapic_inj_nmi(struct kvm_vcpu *vcpu) } void kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, -u8 dest_mode, u32 *mask) +u8 dest_mode, unsigned long *mask) { int i; struct kvm *kvm = ioapic-kvm; @@ -195,7 +195,7 @@ void kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq) { union kvm_ioapic_redirect_entry entry = ioapic-redirtbl[irq]; - u32 deliver_bitmask; + unsigned long deliver_bitmask; struct kvm_vcpu *vcpu; int vcpu_id, r = 0; diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h index e107dbb..c418a7f 100644 --- a/virt/kvm/ioapic.h +++ b/virt/kvm/ioapic.h @@ -65,12 +65,12 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) } struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, - unsigned long bitmap); + unsigned long *bitmap); void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode); int kvm_ioapic_init(struct kvm *kvm); void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); void kvm_ioapic_reset(struct kvm_ioapic *ioapic); void kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, -u8 dest_mode, u32 *mask); +u8 dest_mode, unsigned long *mask); #endif diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index 1949587..e74d679 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -31,7 +31,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic, union kvm_ioapic_redirect_entry *entry, - u32 *deliver_bitmask) + unsigned long *deliver_bitmask) { struct kvm_vcpu *vcpu; @@ -41,7 +41,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic, switch
[PATCH 06/15] KVM: Improve MSI dispatch function
Prepare to merge with kvm_set_irq(). Signed-off-by: Sheng Yang sh...@linux.intel.com --- virt/kvm/kvm_main.c |8 1 files changed, 4 insertions(+), 4 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 3494861..599257e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -87,7 +87,7 @@ static bool kvm_rebooting; #ifdef KVM_CAP_DEVICE_ASSIGNMENT #ifdef CONFIG_X86 -static void assigned_device_msi_dispatch(struct kvm_assigned_dev_kernel *dev) +static void assigned_device_msi_dispatch(struct kvm_assigned_dev_kernel *dev, u32 gsi) { int vcpu_id; struct kvm_vcpu *vcpu; @@ -99,7 +99,7 @@ static void assigned_device_msi_dispatch(struct kvm_assigned_dev_kernel *dev) BUG_ON(!ioapic); mutex_lock(dev-kvm-gsi_msg_lock); - gsi_msg = kvm_find_gsi_msg(dev-kvm, dev-guest_irq); + gsi_msg = kvm_find_gsi_msg(dev-kvm, gsi); if (!gsi_msg) { printk(KERN_WARNING kvm: fail to find correlated gsi_msg\n); return; @@ -143,7 +143,7 @@ static void assigned_device_msi_dispatch(struct kvm_assigned_dev_kernel *dev) } } #else -static void assigned_device_msi_dispatch(struct kvm_assigned_dev_kernel *dev) {} +static void assigned_device_msi_dispatch(struct kvm_assigned_dev_kernel *dev, u32 gsi) {} #endif static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, @@ -178,7 +178,7 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) assigned_dev-guest_irq, 1); else if (assigned_dev-irq_requested_type KVM_ASSIGNED_DEV_GUEST_MSI) { - assigned_device_msi_dispatch(assigned_dev); + assigned_device_msi_dispatch(assigned_dev, assigned_dev-guest_irq); enable_irq(assigned_dev-host_irq); assigned_dev-host_irq_disabled = false; } -- 1.5.4.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 13/15] KVM: bit ops for deliver_bitmap
It's also convenient when we extend KVM supported vcpu number in the future. Signed-off-by: Sheng Yang sh...@linux.intel.com --- arch/x86/kvm/lapic.c |7 --- virt/kvm/ioapic.c| 24 +--- virt/kvm/irq_comm.c | 16 3 files changed, 25 insertions(+), 22 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index c1e4935..359e02c 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -477,9 +477,10 @@ static void apic_send_ipi(struct kvm_lapic *apic) struct kvm_vcpu *target; struct kvm_vcpu *vcpu; - unsigned long lpr_map = 0; + DECLARE_BITMAP(lpr_map, KVM_MAX_VCPUS); int i; + bitmap_zero(lpr_map, KVM_MAX_VCPUS); apic_debug(icr_high 0x%x, icr_low 0x%x, short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n, @@ -494,7 +495,7 @@ static void apic_send_ipi(struct kvm_lapic *apic) if (vcpu-arch.apic apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) { if (delivery_mode == APIC_DM_LOWEST) - set_bit(vcpu-vcpu_id, lpr_map); + set_bit(vcpu-vcpu_id, lpr_map); else __apic_accept_irq(vcpu-arch.apic, delivery_mode, vector, level, trig_mode); @@ -502,7 +503,7 @@ static void apic_send_ipi(struct kvm_lapic *apic) } if (delivery_mode == APIC_DM_LOWEST) { - target = kvm_get_lowest_prio_vcpu(vcpu-kvm, vector, lpr_map); + target = kvm_get_lowest_prio_vcpu(vcpu-kvm, vector, lpr_map); if (target != NULL) __apic_accept_irq(target-arch.apic, delivery_mode, vector, level, trig_mode); diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index 164a746..bf83f5e 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -195,7 +195,7 @@ void kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq) { union kvm_ioapic_redirect_entry entry = ioapic-redirtbl[irq]; - unsigned long deliver_bitmask; + DECLARE_BITMAP(deliver_bitmask, KVM_MAX_VCPUS); struct kvm_vcpu *vcpu; int vcpu_id, r = 0; @@ -205,22 +205,24 @@ static int ioapic_deliver(struct kvm_ioapic 
*ioapic, int irq) entry.fields.delivery_mode, entry.fields.vector, entry.fields.trig_mode); - kvm_get_intr_delivery_bitmask(ioapic, entry, deliver_bitmask); - if (!deliver_bitmask) { - ioapic_debug(no target on destination\n); - return 0; - } + bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS); /* Always delivery PIT interrupt to vcpu 0 */ #ifdef CONFIG_X86 if (irq == 0) - deliver_bitmask = 1 0; + set_bit(0, deliver_bitmask); + else #endif + kvm_get_intr_delivery_bitmask(ioapic, entry, deliver_bitmask); + + if (find_first_bit(deliver_bitmask, KVM_MAX_VCPUS) = KVM_MAX_VCPUS) { + ioapic_debug(no target on destination\n); + return 0; + } - for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) { - if (!(deliver_bitmask (1 vcpu_id))) - continue; - deliver_bitmask = ~(1 vcpu_id); + while ((vcpu_id = find_first_bit(deliver_bitmask, KVM_MAX_VCPUS)) +KVM_MAX_VCPUS) { + clear_bit(vcpu_id, deliver_bitmask); vcpu = ioapic-kvm-vcpus[vcpu_id]; if (vcpu) { if (entry.fields.delivery_mode == diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index e74d679..ecda2c1 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -42,7 +42,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic, case IOAPIC_LOWEST_PRIORITY: vcpu = kvm_get_lowest_prio_vcpu(ioapic-kvm, entry-fields.vector, deliver_bitmask); - *deliver_bitmask = 1 vcpu-vcpu_id; + set_bit(vcpu-vcpu_id, deliver_bitmask); break; case IOAPIC_FIXED: case IOAPIC_NMI: @@ -63,11 +63,12 @@ void kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 gsi, int level) struct kvm_ioapic *ioapic = ioapic_irqchip(kvm); struct kvm_gsi_msg *gsi_msg; union kvm_ioapic_redirect_entry entry; - unsigned long deliver_bitmask; + DECLARE_BITMAP(deliver_bitmask, KVM_MAX_VCPUS); BUG_ON(!ioapic); #endif + bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS); if (!(gsi KVM_GSI_MSG_MASK)) { int irq = gsi; @@ -111,16 +112,15 @@ void
[PATCH 08/15] KVM: Merge MSI handling to kvm_set_irq
Using kvm_set_irq to handle all interrupt injection. Signed-off-by: Sheng Yang sh...@linux.intel.com --- include/linux/kvm_host.h |2 +- virt/kvm/irq_comm.c | 98 +++--- virt/kvm/kvm_main.c | 77 +++- 3 files changed, 90 insertions(+), 87 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index aa2606b..5b671b6 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -330,7 +330,7 @@ struct kvm_gsi_msg { struct hlist_node link; }; -void kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level); +void kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 gsi, int level); void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi); void kvm_register_irq_ack_notifier(struct kvm *kvm, struct kvm_irq_ack_notifier *kian); diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index 47243ef..63cdf01 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -20,28 +20,96 @@ */ #include linux/kvm_host.h + +#ifdef CONFIG_X86 +#include asm/msidef.h +#endif + #include irq.h #include ioapic.h /* This should be called with the kvm-lock mutex held */ -void kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level) +void kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 gsi, int level) { - unsigned long *irq_state = (unsigned long *)kvm-arch.irq_states[irq]; - - /* Logical OR for level trig interrupt */ - if (level) - set_bit(irq_source_id, irq_state); - else - clear_bit(irq_source_id, irq_state); - - /* Not possible to detect if the guest uses the PIC or the -* IOAPIC. So set the bit in both. The guest will ignore -* writes to the unused one. 
-*/ - kvm_ioapic_set_irq(ioapic_irqchip(kvm), irq, !!(*irq_state)); + unsigned long *irq_state; +#ifdef CONFIG_X86 + int vcpu_id; + struct kvm_vcpu *vcpu; + struct kvm_ioapic *ioapic = ioapic_irqchip(kvm); + struct kvm_gsi_msg *gsi_msg; + int dest_id, vector, dest_mode, trig_mode, delivery_mode; + u32 deliver_bitmask; + + BUG_ON(!ioapic); +#endif + + if (!(gsi KVM_GSI_MSG_MASK)) { + int irq = gsi; + + irq_state = (unsigned long *)kvm-arch.irq_states[irq]; + + /* Logical OR for level trig interrupt */ + if (level) + set_bit(irq_source_id, irq_state); + else + clear_bit(irq_source_id, irq_state); + + /* Not possible to detect if the guest uses the PIC or the +* IOAPIC. So set the bit in both. The guest will ignore +* writes to the unused one. +*/ + kvm_ioapic_set_irq(ioapic, irq, !!(*irq_state)); #ifdef CONFIG_X86 - kvm_pic_set_irq(pic_irqchip(kvm), irq, !!(*irq_state)); + kvm_pic_set_irq(pic_irqchip(kvm), irq, !!(*irq_state)); +#endif + return; + } + +#ifdef CONFIG_X86 + mutex_lock(kvm-gsi_msg_lock); + gsi_msg = kvm_find_gsi_msg(kvm, gsi); + mutex_unlock(kvm-gsi_msg_lock); + if (!gsi_msg) { + printk(KERN_WARNING kvm: fail to find correlated gsi_msg\n); + return; + } + + dest_id = (gsi_msg-msg.address_lo MSI_ADDR_DEST_ID_MASK) +MSI_ADDR_DEST_ID_SHIFT; + vector = (gsi_msg-msg.data MSI_DATA_VECTOR_MASK) +MSI_DATA_VECTOR_SHIFT; + dest_mode = test_bit(MSI_ADDR_DEST_MODE_SHIFT, + (unsigned long *)gsi_msg-msg.address_lo); + trig_mode = test_bit(MSI_DATA_TRIGGER_SHIFT, + (unsigned long *)gsi_msg-msg.data); + delivery_mode = test_bit(MSI_DATA_DELIVERY_MODE_SHIFT, + (unsigned long *)gsi_msg-msg.data); + deliver_bitmask = kvm_ioapic_get_delivery_bitmask(ioapic, + dest_id, dest_mode); + /* IOAPIC delivery mode value is the same as MSI here */ + switch (delivery_mode) { + case IOAPIC_LOWEST_PRIORITY: + vcpu = kvm_get_lowest_prio_vcpu(ioapic-kvm, vector, + deliver_bitmask); + if (vcpu != NULL) + kvm_apic_set_irq(vcpu, vector, trig_mode); + else + printk(KERN_INFO kvm: null lowest 
priority vcpu!\n); + break; + case IOAPIC_FIXED: + for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) { + if (!(deliver_bitmask (1 vcpu_id))) + continue; + deliver_bitmask = ~(1 vcpu_id); + vcpu = ioapic-kvm-vcpus[vcpu_id]; + if (vcpu) + kvm_apic_set_irq(vcpu, vector,
[PATCH 11/15] KVM: Change API of kvm_ioapic_get_delivery_bitmask
In order to use with bit ops. Signed-off-by: Sheng Yang sh...@linux.intel.com --- virt/kvm/ioapic.c | 17 - virt/kvm/ioapic.h |4 ++-- virt/kvm/irq_comm.c |5 +++-- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index af9f5de..ebd5ba6 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -153,22 +153,22 @@ static void ioapic_inj_nmi(struct kvm_vcpu *vcpu) kvm_vcpu_kick(vcpu); } -u32 kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, - u8 dest_mode) +void kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, +u8 dest_mode, u32 *mask) { - u32 mask = 0; int i; struct kvm *kvm = ioapic-kvm; struct kvm_vcpu *vcpu; ioapic_debug(dest %d dest_mode %d\n, dest, dest_mode); + *mask = 0; if (dest_mode == 0) { /* Physical mode. */ if (dest == 0xFF) { /* Broadcast. */ for (i = 0; i KVM_MAX_VCPUS; ++i) if (kvm-vcpus[i] kvm-vcpus[i]-arch.apic) - mask |= 1 i; - return mask; + *mask |= 1 i; + return; } for (i = 0; i KVM_MAX_VCPUS; ++i) { vcpu = kvm-vcpus[i]; @@ -176,7 +176,7 @@ u32 kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, continue; if (kvm_apic_match_physical_addr(vcpu-arch.apic, dest)) { if (vcpu-arch.apic) - mask = 1 i; + *mask = 1 i; break; } } @@ -187,10 +187,9 @@ u32 kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, continue; if (vcpu-arch.apic kvm_apic_match_logical_addr(vcpu-arch.apic, dest)) - mask |= 1 vcpu-vcpu_id; + *mask |= 1 vcpu-vcpu_id; } - ioapic_debug(mask %x\n, mask); - return mask; + ioapic_debug(mask %x\n, *mask); } static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq) diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h index ee5b0bd..e107dbb 100644 --- a/virt/kvm/ioapic.h +++ b/virt/kvm/ioapic.h @@ -70,7 +70,7 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode); int kvm_ioapic_init(struct kvm *kvm); void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level); void kvm_ioapic_reset(struct 
kvm_ioapic *ioapic); -u32 kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, - u8 dest_mode); +void kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, +u8 dest_mode, u32 *mask); #endif diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index d89d8b2..1949587 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -35,8 +35,9 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic, { struct kvm_vcpu *vcpu; - *deliver_bitmask = kvm_ioapic_get_delivery_bitmask(ioapic, - entry-fields.dest_id, entry-fields.dest_mode); + kvm_ioapic_get_delivery_bitmask(ioapic, entry-fields.dest_id, + entry-fields.dest_mode, + deliver_bitmask); switch (entry-fields.delivery_mode) { case IOAPIC_LOWEST_PRIORITY: vcpu = kvm_get_lowest_prio_vcpu(ioapic-kvm, -- 1.5.4.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/15] Device assignment MSI enhancement
Hi Avi and Marcelo Merry Xmas! And here is the v2 of patchset. Target at 2.6.29 for it contained a lot of fix and improvement of current device assignment and MSI feature. Change from V1: Addressed Marcelo's comments, and: 1. Fix racy in kvm_free_assigned_irq(). In case to do this, I fetch one patch (irq_fifo) from original MSI-X patchset. Indeed a nice catch of Marcelo. :) 2. Unified kvm_set_irq() with ioapic_deliver(). It didn't save much, but duplicate is always bothering, and I have modified bitmask for vcpu to a real bitmap (maybe not all, just what I have seen). And for V1: 1. Add gsi_msg mapping mechanism, which gsi can used to indicated a MSI interrupt.(Notice API/ABI changed a little, but we don't have userspace patch now, so it should be OK.) 2. Provide MSI disable capability. arch/x86/kvm/lapic.c | 11 ++- include/linux/kvm.h | 15 +++- include/linux/kvm_host.h | 26 +- include/linux/kvm_types.h | 17 virt/kvm/ioapic.c | 117 ++--- virt/kvm/ioapic.h | 23 + virt/kvm/irq_comm.c | 184 --- virt/kvm/kvm_main.c | 212 - 8 files changed, 415 insertions(+), 190 deletions(-) Sorry for the patchset size, it's too easy to grow fast, and I am a little too lazy to split them into more batches in the Xmas... :) -- regards Yang, Sheng -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 05/15] KVM: Using gsi_msg mapping for MSI device assignment
Convert MSI userspace interface to support gsi_msg mapping(and nobody should be the user of the old interface...). Signed-off-by: Sheng Yang sh...@linux.intel.com --- include/linux/kvm_host.h |1 - virt/kvm/kvm_main.c | 35 ++- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 0e5741a..aa2606b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -313,7 +313,6 @@ struct kvm_assigned_dev_kernel { int host_irq; bool host_irq_disabled; int guest_irq; - struct msi_msg guest_msi; #define KVM_ASSIGNED_DEV_GUEST_INTX(1 0) #define KVM_ASSIGNED_DEV_GUEST_MSI (1 1) #define KVM_ASSIGNED_DEV_HOST_INTX (1 8) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 26bccf9..3494861 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -92,20 +92,30 @@ static void assigned_device_msi_dispatch(struct kvm_assigned_dev_kernel *dev) int vcpu_id; struct kvm_vcpu *vcpu; struct kvm_ioapic *ioapic = ioapic_irqchip(dev-kvm); - int dest_id = (dev-guest_msi.address_lo MSI_ADDR_DEST_ID_MASK) -MSI_ADDR_DEST_ID_SHIFT; - int vector = (dev-guest_msi.data MSI_DATA_VECTOR_MASK) -MSI_DATA_VECTOR_SHIFT; - int dest_mode = test_bit(MSI_ADDR_DEST_MODE_SHIFT, - (unsigned long *)dev-guest_msi.address_lo); - int trig_mode = test_bit(MSI_DATA_TRIGGER_SHIFT, - (unsigned long *)dev-guest_msi.data); - int delivery_mode = test_bit(MSI_DATA_DELIVERY_MODE_SHIFT, - (unsigned long *)dev-guest_msi.data); + struct kvm_gsi_msg *gsi_msg; + int dest_id, vector, dest_mode, trig_mode, delivery_mode; u32 deliver_bitmask; BUG_ON(!ioapic); + mutex_lock(dev-kvm-gsi_msg_lock); + gsi_msg = kvm_find_gsi_msg(dev-kvm, dev-guest_irq); + if (!gsi_msg) { + printk(KERN_WARNING kvm: fail to find correlated gsi_msg\n); + return; + } + mutex_unlock(dev-kvm-gsi_msg_lock); + + dest_id = (gsi_msg-msg.address_lo MSI_ADDR_DEST_ID_MASK) +MSI_ADDR_DEST_ID_SHIFT; + vector = (gsi_msg-msg.data MSI_DATA_VECTOR_MASK) +MSI_DATA_VECTOR_SHIFT; + 
dest_mode = test_bit(MSI_ADDR_DEST_MODE_SHIFT, + (unsigned long *)gsi_msg-msg.address_lo); + trig_mode = test_bit(MSI_DATA_TRIGGER_SHIFT, + (unsigned long *)gsi_msg-msg.data); + delivery_mode = test_bit(MSI_DATA_DELIVERY_MODE_SHIFT, + (unsigned long *)gsi_msg-msg.data); deliver_bitmask = kvm_ioapic_get_delivery_bitmask(ioapic, dest_id, dest_mode); /* IOAPIC delivery mode value is the same as MSI here */ @@ -316,17 +326,16 @@ static int assigned_device_update_msi(struct kvm *kvm, { int r; + adev-guest_irq = airq-guest_irq; + if (airq-flags KVM_DEV_IRQ_ASSIGN_ENABLE_MSI) { /* x86 don't care upper address of guest msi message addr */ adev-irq_requested_type |= KVM_ASSIGNED_DEV_GUEST_MSI; adev-irq_requested_type = ~KVM_ASSIGNED_DEV_GUEST_INTX; - adev-guest_msi.address_lo = airq-guest_msi.addr_lo; - adev-guest_msi.data = airq-guest_msi.data; adev-ack_notifier.gsi = -1; } else if (msi2intx) { adev-irq_requested_type |= KVM_ASSIGNED_DEV_GUEST_INTX; adev-irq_requested_type = ~KVM_ASSIGNED_DEV_GUEST_MSI; - adev-guest_irq = airq-guest_irq; adev-ack_notifier.gsi = airq-guest_irq; } else { /* -- 1.5.4.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 10/15] KVM: Unify the delivery of IOAPIC and MSI
Duplicate code is always bothering... Signed-off-by: Sheng Yang sh...@linux.intel.com --- include/linux/kvm_host.h |3 ++ virt/kvm/ioapic.c| 84 +- virt/kvm/irq_comm.c | 75 3 files changed, 79 insertions(+), 83 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5b671b6..4f92317 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -330,6 +330,9 @@ struct kvm_gsi_msg { struct hlist_node link; }; +void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic, + union kvm_ioapic_redirect_entry *entry, + u32 *deliver_bitmask); void kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 gsi, int level); void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi); void kvm_register_irq_ack_notifier(struct kvm *kvm, diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c index ebb2ab5..af9f5de 100644 --- a/virt/kvm/ioapic.c +++ b/virt/kvm/ioapic.c @@ -195,75 +195,53 @@ u32 kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest, static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq) { - u8 dest = ioapic-redirtbl[irq].fields.dest_id; - u8 dest_mode = ioapic-redirtbl[irq].fields.dest_mode; - u8 delivery_mode = ioapic-redirtbl[irq].fields.delivery_mode; - u8 vector = ioapic-redirtbl[irq].fields.vector; - u8 trig_mode = ioapic-redirtbl[irq].fields.trig_mode; + union kvm_ioapic_redirect_entry entry = ioapic-redirtbl[irq]; u32 deliver_bitmask; struct kvm_vcpu *vcpu; int vcpu_id, r = 0; ioapic_debug(dest=%x dest_mode=%x delivery_mode=%x vector=%x trig_mode=%x\n, -dest, dest_mode, delivery_mode, vector, trig_mode); +entry.fields.dest, entry.fields.dest_mode, +entry.fields.delivery_mode, entry.fields.vector, +entry.fields.trig_mode); - deliver_bitmask = kvm_ioapic_get_delivery_bitmask(ioapic, dest, - dest_mode); + kvm_get_intr_delivery_bitmask(ioapic, entry, deliver_bitmask); if (!deliver_bitmask) { ioapic_debug(no target on destination\n); return 0; } - switch (delivery_mode) { - case IOAPIC_LOWEST_PRIORITY: - vcpu = 
kvm_get_lowest_prio_vcpu(ioapic-kvm, vector, - deliver_bitmask); + /* Always delivery PIT interrupt to vcpu 0 */ #ifdef CONFIG_X86 - if (irq == 0) - vcpu = ioapic-kvm-vcpus[0]; + if (irq == 0) + deliver_bitmask = 1 0; #endif - if (vcpu != NULL) - r = ioapic_inj_irq(ioapic, vcpu, vector, - trig_mode, delivery_mode); - else - ioapic_debug(null lowest prio vcpu: -mask=%x vector=%x delivery_mode=%x\n, -deliver_bitmask, vector, IOAPIC_LOWEST_PRIORITY); - break; - case IOAPIC_FIXED: -#ifdef CONFIG_X86 - if (irq == 0) - deliver_bitmask = 1; -#endif - for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) { - if (!(deliver_bitmask (1 vcpu_id))) - continue; - deliver_bitmask = ~(1 vcpu_id); - vcpu = ioapic-kvm-vcpus[vcpu_id]; - if (vcpu) { - r = ioapic_inj_irq(ioapic, vcpu, vector, - trig_mode, delivery_mode); - } - } - break; - case IOAPIC_NMI: - for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) { - if (!(deliver_bitmask (1 vcpu_id))) - continue; - deliver_bitmask = ~(1 vcpu_id); - vcpu = ioapic-kvm-vcpus[vcpu_id]; - if (vcpu) + + for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) { + if (!(deliver_bitmask (1 vcpu_id))) + continue; + deliver_bitmask = ~(1 vcpu_id); + vcpu = ioapic-kvm-vcpus[vcpu_id]; + if (vcpu) { + if (entry.fields.delivery_mode == + IOAPIC_LOWEST_PRIORITY || + entry.fields.delivery_mode == IOAPIC_FIXED) + r = ioapic_inj_irq(ioapic, vcpu, + entry.fields.vector, + entry.fields.trig_mode, +
Re: [PATCH 0/15] Device assignment MSI enhancement
On Thursday 25 December 2008 17:09:24 Sheng Yang wrote: Hi Avi and Marcelo Merry Xmas! And here is the v2 of patchset. Target at 2.6.29 for it contained a lot of fix and improvement of current device assignment and MSI feature. Change from V1: Addressed Marcelo's comments, and: 1. Fix racy in kvm_free_assigned_irq(). In case to do this, I fetch one patch (irq_fifo) from original MSI-X patchset. Indeed a nice catch of Marcelo. :) 2. Unified kvm_set_irq() with ioapic_deliver(). It didn't save much, but duplicate is always bothering, and I have modified bitmask for vcpu to a real bitmap (maybe not all, just what I have seen). Forgot to mention, I didn't change API for guest to disable MSI which is a part of Marcelo's comments, for I think single interface named update with some flags represent the current bit state is enough for now... -- regards Yang, Sheng And for V1: 1. Add gsi_msg mapping mechanism, which gsi can used to indicated a MSI interrupt.(Notice API/ABI changed a little, but we don't have userspace patch now, so it should be OK.) 2. Provide MSI disable capability. arch/x86/kvm/lapic.c | 11 ++- include/linux/kvm.h | 15 +++- include/linux/kvm_host.h | 26 +- include/linux/kvm_types.h | 17 virt/kvm/ioapic.c | 117 ++--- virt/kvm/ioapic.h | 23 + virt/kvm/irq_comm.c | 184 --- virt/kvm/kvm_main.c | 212 - 8 files changed, 415 insertions(+), 190 deletions(-) Sorry for the patchset size, it's too easy to grow fast, and I am a little too lazy to split them into more batches in the Xmas... :) -- regards Yang, Sheng -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Nested KVM
Alexander Graf wrote: Avi, could you please apply that patch for kvm-82 too, so we get something working out? I'll take a closer look at what's broken exactly later on. I'll just revert the emulation loop patch. We can reapply it once we fix the problem. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[ANNOUNCE] kvm-82 release
This release adds support for nested virtualization, a feature which allows you to run kvm (and possibly other hypervisors) inside a guest. This is an experimental feature and is only available on AMD hosts. There are fixes included for a couple of minor vulnerabilities: one for the slirp stack (-net user), which is not usually used in production, and another in the vnc server, which allows malicious users to cause a VM to hang. Changes from kvm-81: - merge qemu-svn - uuid support - fix CVE-2007-5729 (slirp vulnerability) - fix CVE-2008-2382 (vnc denial of service) - better scsi support - pci subsystem id for pci devices - this will cause Windows guest to rediscover hardware - improved I/O parallelism - ppc kvm support - hpet support - not fully integrated yet - monitor 'info status' command - merge bochs-bios-cvs - uuid support - prepare for S3 sleep - merge vgabios-cvs - much improved guest debugging (Jan Kiszka) - both debugger in guest and debugger in host - fix kvm makefile for separate object dir (Andi Kleen) - nested svm (Alexander Graf) - run kvm in kvm in kvm... - fix ia64 register and stack access from userspace (Jes Sorensen) - don't treat a global pte as global if cr4.pge is clear - fixes Vista x86 smp failure on boot - properly lock virtual i8259 interrupt controller - fix large host pages invlpg/resync - fixes oops when using host hugetlbfs - fix vmload instruction misemulated as lidt Notes: If you use the modules bundled with kvm-82, you can use any version of Linux from 2.6.16 upwards. You may also use kvm-81 userspace with the kvm modules provided by Linux 2.6.25 or above. Some features may only be available in newer releases. http://kvm.qumranet.com -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[ kvm-Bugs-2168011 ] kvm_host.h:128: error: field 'mmu_notifier' has incomplete type
Bugs item #2168011, was opened at 2008-10-15 11:53 Message generated for change (Comment added) made by wg1 You can respond by visiting: https://sourceforge.net/tracker/?func=detailatid=893831aid=2168011group_id=180599 Please note that this message will contain a full copy of the comment thread, including the initial issue submission, for this request, not just the latest update. Category: None Group: None Status: Pending Resolution: Fixed Priority: 5 Private: No Submitted By: Stephane Bakhos (nuitari3) Assigned to: Nobody/Anonymous (nobody) Summary: kvm_host.h:128: error: field 'mmu_notifier' has incomplete Initial Comment: When compiling kvm-77/76 on a 2.6.27 kernel with AMD IOMMU activated I get: kvm_host.h:128: error: field 'mmu_notifier' has incomplete type -- Comment By: Wolfram Gloger (wg1) Date: 2008-12-25 11:21 Message: It would only print a warning for those old kernels, not break them. -- Comment By: Avi Kivity (avik) Date: 2008-12-24 17:27 Message: Well, very old kernels don't have kvm support at all, and this would break them. -- Comment By: Wolfram Gloger (wg1) Date: 2008-12-24 16:53 Message: Oops, you are correct of course. CONFIG_KVM set and all is well. May I suggest the following patch so this doesn't bite people so easily. 
--- configure.orig 2008-12-14 14:16:27.0 +0100 +++ configure 2008-12-24 16:46:03.0 +0100 @@ -134,6 +134,19 @@ fi fi +if [ -e $kerneldir/.config ]; then +if egrep -q ^CONFIG_KVM=(y|m) $kerneldir/.config; then + : +else + echo Warning: kernel not configured for KVM + echo kvm kernel modules may not build correctly +fi +else +echo Error: kernel .config not found +echo Please make sure your kernel is configured +exit 1 +fi + #configure user dir (cd user; ./configure --prefix=$prefix --kerneldir=$libkvm_kerneldir \ --arch=$arch --processor=$processor \ -- Comment By: Avi Kivity (avik) Date: 2008-12-24 14:49 Message: You should enable the host kernel's kvm modules even if you don't plan to use them, so they will select functionality like mmu notifiers. -- Comment By: Wolfram Gloger (wg1) Date: 2008-12-15 19:11 Message: Sorry, cannot seem to attach a file (I looked hard!), so here the patch inline (applies to kvm-81, too): --- kernel/include/linux/kvm_host.h.orig2008-11-12 13:23:58.0 +0100 +++ kernel/include/linux/kvm_host.h 2008-11-15 21:08:02.0 +0100 @@ -46,6 +46,7 @@ * the COPYING file in the top-level directory. */ +#include linux/mmu_notifier.h #include linux/types.h #include linux/hardirq.h #include linux/list.h --- kernel/include/linux/mmu_notifier.h.orig2008-12-15 18:31:52.0 +0100 +++ kernel/include/linux/mmu_notifier.h 2008-11-15 21:19:08.0 +0100 @@ -0,0 +1,6 @@ +#ifndef _LINUX_MMU_NOTIFIER_H +#define _LINUX_MMU_NOTIFIER_H + +struct mmu_notifier {}; + +#endif -- Comment By: Wolfram Gloger (wg1) Date: 2008-12-15 19:00 Message: I'm seeing this too, with kvm-79 and now kvm-81 on Linux-2.6.27.7 and Linux-2.6.27.9. I'm surprised that such a FTBS is not more prevalent.. For now, I have helped myself with the attached patch. -- Comment By: Stephane Bakhos (nuitari3) Date: 2008-10-15 16:18 Message: It looks like I was wrong in saying that AMD IOMMU was the cause. When I complied again I used make -j5 and it looks like it just skipped the kernel modules. 
-- Comment By: Stephane Bakhos (nuitari3) Date: 2008-10-15 12:11 Message: It looks like I was wrong in saying that AMD IOMMU was the cause. When I complied again I used make -j5 and it looks like it just skipped the kernel modules. -- You can respond by visiting: https://sourceforge.net/tracker/?func=detailatid=893831aid=2168011group_id=180599 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/4] KVM: Using kfifo for irq recording
Sheng Yang wrote: For MSI-X, we have to deal with multiply IRQ with same IRQ handler, so it's necessary to record the IRQ that trigger the IRQ handler. Does MSI-X disallowing coalescing two requests into one interrupt? Or can we still coalesce interrupts (perhaps by recording them as a (irq, cpu) pair?) @@ -313,6 +314,9 @@ struct kvm_assigned_dev_kernel { int host_irq; bool host_irq_disabled; int guest_irq; +#define KVM_ASSIGNED_DEV_IRQ_FIFO_LEN 0x100 + struct kfifo *irq_fifo; + spinlock_t irq_fifo_lock; #define KVM_ASSIGNED_DEV_GUEST_INTX(1 0) What if it runs out? What does real hardware do? I'm sure it doesn't have a 100-entry queue. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/4] KVM: Using kfifo for irq recording
On Thursday 25 December 2008 19:07:22 Avi Kivity wrote: Sheng Yang wrote: For MSI-X, we have to deal with multiply IRQ with same IRQ handler, so it's necessary to record the IRQ that trigger the IRQ handler. Does MSI-X disallowing coalescing two requests into one interrupt? Or can we still coalesce interrupts (perhaps by recording them as a (irq, cpu) pair?) Disallow? Not quite understand. PCI spec said OS don't need to ensure the sequence they handled is the same as they happened. This struct is used just because we lost information of irq after schedule_work... @@ -313,6 +314,9 @@ struct kvm_assigned_dev_kernel { int host_irq; bool host_irq_disabled; int guest_irq; +#define KVM_ASSIGNED_DEV_IRQ_FIFO_LEN 0x100 + struct kfifo *irq_fifo; + spinlock_t irq_fifo_lock; #define KVM_ASSIGNED_DEV_GUEST_INTX(1 0) What if it runs out? What does real hardware do? I'm sure it doesn't have a 100-entry queue. 0x100 is just a simple number which I thought different interrupts of same MSI-X device can happen at same period(indeed it's 0x100/sizeof(int)). Maybe not that many. And it just used by work function later to find what guest vector is, and then inject the correlated interrupt to the guest. If hardware device driver also postpone the work, I think it also need something like this. -- regards Yang, Sheng -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/5] KVM: MMU: Add for_each_shadow_entry(), a simpler alternative to walk_shadow()
Using a for_each loop style removes the need to write callback and nasty casts. Implement the walk_shadow() using the for_each_shadow_entry(). Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/mmu.c | 69 +--- 1 files changed, 49 insertions(+), 20 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3b86df6..3248a3e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -150,6 +150,20 @@ struct kvm_shadow_walk { u64 addr, u64 *spte, int level); }; +struct kvm_shadow_walk_iterator { + u64 addr; + hpa_t shadow_addr; + int level; + u64 *sptep; + unsigned index; +}; + +#define for_each_shadow_entry(_vcpu, _addr, _walker)\ + for (shadow_walk_init((_walker), _vcpu, _addr);\ +shadow_walk_okay((_walker)); \ +shadow_walk_next((_walker))) + + struct kvm_unsync_walk { int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk); }; @@ -1254,33 +1268,48 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, return sp; } +static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, +struct kvm_vcpu *vcpu, u64 addr) +{ + iterator-addr = addr; + iterator-shadow_addr = vcpu-arch.mmu.root_hpa; + iterator-level = vcpu-arch.mmu.shadow_root_level; + if (iterator-level == PT32E_ROOT_LEVEL) { + iterator-shadow_addr + = vcpu-arch.mmu.pae_root[(addr 30) 3]; + iterator-shadow_addr = PT64_BASE_ADDR_MASK; + --iterator-level; + if (!iterator-shadow_addr) + iterator-level = 0; + } +} + +static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) +{ + if (iterator-level PT_PAGE_TABLE_LEVEL) + return false; + iterator-index = SHADOW_PT_INDEX(iterator-addr, iterator-level); + iterator-sptep = ((u64 *)__va(iterator-shadow_addr)) + iterator-index; + return true; +} + +static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) +{ + iterator-shadow_addr = *iterator-sptep PT64_BASE_ADDR_MASK; + --iterator-level; +} + static int walk_shadow(struct kvm_shadow_walk *walker, struct kvm_vcpu *vcpu, u64 addr) { - 
hpa_t shadow_addr; - int level; + struct kvm_shadow_walk_iterator iterator; int r; - u64 *sptep; - unsigned index; - - shadow_addr = vcpu-arch.mmu.root_hpa; - level = vcpu-arch.mmu.shadow_root_level; - if (level == PT32E_ROOT_LEVEL) { - shadow_addr = vcpu-arch.mmu.pae_root[(addr 30) 3]; - shadow_addr = PT64_BASE_ADDR_MASK; - if (!shadow_addr) - return 1; - --level; - } - while (level = PT_PAGE_TABLE_LEVEL) { - index = SHADOW_PT_INDEX(addr, level); - sptep = ((u64 *)__va(shadow_addr)) + index; - r = walker-entry(walker, vcpu, addr, sptep, level); + for_each_shadow_entry(vcpu, addr, iterator) { + r = walker-entry(walker, vcpu, addr, + iterator.sptep, iterator.level); if (r) return r; - shadow_addr = *sptep PT64_BASE_ADDR_MASK; - --level; } return 0; } -- 1.6.0.6 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/5] KVM: MMU: Use for_each_shadow_entry() in __direct_map()
Eliminating a callback and a useless structure. Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/mmu.c | 83 ++- 1 files changed, 29 insertions(+), 54 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3248a3e..b4b79b0 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1841,67 +1841,42 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) { } -struct direct_shadow_walk { - struct kvm_shadow_walk walker; - pfn_t pfn; - int write; - int largepage; - int pt_write; -}; - -static int direct_map_entry(struct kvm_shadow_walk *_walk, - struct kvm_vcpu *vcpu, - u64 addr, u64 *sptep, int level) +static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, + int largepage, gfn_t gfn, pfn_t pfn) { - struct direct_shadow_walk *walk = - container_of(_walk, struct direct_shadow_walk, walker); + struct kvm_shadow_walk_iterator iterator; struct kvm_mmu_page *sp; + int pt_write = 0; gfn_t pseudo_gfn; - gfn_t gfn = addr PAGE_SHIFT; - - if (level == PT_PAGE_TABLE_LEVEL - || (walk-largepage level == PT_DIRECTORY_LEVEL)) { - mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL, -0, walk-write, 1, walk-pt_write, -walk-largepage, 0, gfn, walk-pfn, false); - ++vcpu-stat.pf_fixed; - return 1; - } - if (*sptep == shadow_trap_nonpresent_pte) { - pseudo_gfn = (addr PT64_DIR_BASE_ADDR_MASK) PAGE_SHIFT; - sp = kvm_mmu_get_page(vcpu, pseudo_gfn, (gva_t)addr, level - 1, - 1, ACC_ALL, sptep); - if (!sp) { - pgprintk(nonpaging_map: ENOMEM\n); - kvm_release_pfn_clean(walk-pfn); - return -ENOMEM; + for_each_shadow_entry(vcpu, (u64)gfn PAGE_SHIFT, iterator) { + if (iterator.level == PT_PAGE_TABLE_LEVEL + || (largepage iterator.level == PT_DIRECTORY_LEVEL)) { + mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, +0, write, 1, pt_write, +largepage, 0, gfn, pfn, false); + ++vcpu-stat.pf_fixed; + break; } - set_shadow_pte(sptep, - __pa(sp-spt) - | PT_PRESENT_MASK | PT_WRITABLE_MASK - | shadow_user_mask | shadow_x_mask); - } - return 0; -} + if (*iterator.sptep == 
shadow_trap_nonpresent_pte) { + pseudo_gfn = (iterator.addr PT64_DIR_BASE_ADDR_MASK) PAGE_SHIFT; + sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, + iterator.level - 1, + 1, ACC_ALL, iterator.sptep); + if (!sp) { + pgprintk(nonpaging_map: ENOMEM\n); + kvm_release_pfn_clean(pfn); + return -ENOMEM; + } -static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, - int largepage, gfn_t gfn, pfn_t pfn) -{ - int r; - struct direct_shadow_walk walker = { - .walker = { .entry = direct_map_entry, }, - .pfn = pfn, - .largepage = largepage, - .write = write, - .pt_write = 0, - }; - - r = walk_shadow(walker.walker, vcpu, gfn PAGE_SHIFT); - if (r 0) - return r; - return walker.pt_write; + set_shadow_pte(iterator.sptep, + __pa(sp-spt) + | PT_PRESENT_MASK | PT_WRITABLE_MASK + | shadow_user_mask | shadow_x_mask); + } + } + return pt_write; } static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) -- 1.6.0.6 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/5] KVM: MMU: Replace walk_shadow() by for_each_shadow_entry() in invlpg()
Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/paging_tmpl.h | 81 +-- 1 files changed, 32 insertions(+), 49 deletions(-) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 69c7e33..46b68f9 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -25,7 +25,6 @@ #if PTTYPE == 64 #define pt_element_t u64 #define guest_walker guest_walker64 - #define shadow_walker shadow_walker64 #define FNAME(name) paging##64_##name #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK @@ -42,7 +41,6 @@ #elif PTTYPE == 32 #define pt_element_t u32 #define guest_walker guest_walker32 - #define shadow_walker shadow_walker32 #define FNAME(name) paging##32_##name #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK @@ -73,18 +71,6 @@ struct guest_walker { u32 error_code; }; -struct shadow_walker { - struct kvm_shadow_walk walker; - struct guest_walker *guest_walker; - int user_fault; - int write_fault; - int largepage; - int *ptwrite; - pfn_t pfn; - u64 *sptep; - gpa_t pte_gpa; -}; - static gfn_t gpte_to_gfn(pt_element_t gpte) { return (gpte PT_BASE_ADDR_MASK) PAGE_SHIFT; @@ -453,54 +439,52 @@ out_unlock: return 0; } -static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw, - struct kvm_vcpu *vcpu, u64 addr, - u64 *sptep, int level) +static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) { - struct shadow_walker *sw = - container_of(_sw, struct shadow_walker, walker); + struct kvm_shadow_walk_iterator iterator; + pt_element_t gpte; + gpa_t pte_gpa = -1; + int level; + u64 *sptep; + + spin_lock(vcpu-kvm-mmu_lock); - /* FIXME: properly handle invlpg on large guest pages */ - if (level == PT_PAGE_TABLE_LEVEL || - ((level == PT_DIRECTORY_LEVEL) is_large_pte(*sptep))) { - struct kvm_mmu_page *sp = page_header(__pa(sptep)); + for_each_shadow_entry(vcpu, gva, iterator) { + level = iterator.level; + sptep = iterator.sptep; - sw-pte_gpa = 
(sp-gfn PAGE_SHIFT); - sw-pte_gpa += (sptep - sp-spt) * sizeof(pt_element_t); + /* FIXME: properly handle invlpg on large guest pages */ + if (level == PT_PAGE_TABLE_LEVEL || + ((level == PT_DIRECTORY_LEVEL) is_large_pte(*sptep))) { + struct kvm_mmu_page *sp = page_header(__pa(sptep)); - if (is_shadow_present_pte(*sptep)) { - rmap_remove(vcpu-kvm, sptep); - if (is_large_pte(*sptep)) - --vcpu-kvm-stat.lpages; + pte_gpa = (sp-gfn PAGE_SHIFT); + pte_gpa += (sptep - sp-spt) * sizeof(pt_element_t); + + if (is_shadow_present_pte(*sptep)) { + rmap_remove(vcpu-kvm, sptep); + if (is_large_pte(*sptep)) + --vcpu-kvm-stat.lpages; + } + set_shadow_pte(sptep, shadow_trap_nonpresent_pte); + break; } - set_shadow_pte(sptep, shadow_trap_nonpresent_pte); - return 1; - } - if (!is_shadow_present_pte(*sptep)) - return 1; - return 0; -} -static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) -{ - pt_element_t gpte; - struct shadow_walker walker = { - .walker = { .entry = FNAME(shadow_invlpg_entry), }, - .pte_gpa = -1, - }; + if (!is_shadow_present_pte(*sptep)) + break; + } - spin_lock(vcpu-kvm-mmu_lock); - walk_shadow(walker.walker, vcpu, gva); spin_unlock(vcpu-kvm-mmu_lock); - if (walker.pte_gpa == -1) + + if (pte_gpa == -1) return; - if (kvm_read_guest_atomic(vcpu-kvm, walker.pte_gpa, gpte, + if (kvm_read_guest_atomic(vcpu-kvm, pte_gpa, gpte, sizeof(pt_element_t))) return; if (is_present_pte(gpte) (gpte PT_ACCESSED_MASK)) { if (mmu_topup_memory_caches(vcpu)) return; - kvm_mmu_pte_write(vcpu, walker.pte_gpa, (const u8 *)gpte, + kvm_mmu_pte_write(vcpu, pte_gpa, (const u8 *)gpte, sizeof(pt_element_t), 0); } } @@ -607,7 +591,6 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) #undef pt_element_t #undef
[PATCH 0/5] for_each_shadow_entry
This patchset replaces walk_shadow(), which calls a callback for each shadow pte that maps a guest virtual address, by an equivalent for_each style construct. Benefits are fewer thunks and smaller code. Please review. Avi Kivity (5): KVM: MMU: Add for_each_shadow_entry(), a simpler alternative to walk_shadow() KVM: MMU: Use for_each_shadow_entry() in __direct_map() KVM: MMU: Replace walk_shadow() by for_each_shadow_entry() in fetch() KVM: MMU: Replace walk_shadow() by for_each_shadow_entry() in invlpg() KVM: MMU: Drop walk_shadow() arch/x86/kvm/mmu.c | 150 ++- arch/x86/kvm/paging_tmpl.h | 209 +++- 2 files changed, 157 insertions(+), 202 deletions(-) -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/5] KVM: MMU: Replace walk_shadow() by for_each_shadow_entry() in fetch()
Effectively reverting to the pre walk_shadow() version -- but now with the reusable for_each(). Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/paging_tmpl.h | 128 1 files changed, 58 insertions(+), 70 deletions(-) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 9fd78b6..69c7e33 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -283,91 +283,79 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, /* * Fetch a shadow pte for a specific level in the paging hierarchy. */ -static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw, - struct kvm_vcpu *vcpu, u64 addr, - u64 *sptep, int level) +static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, +struct guest_walker *gw, +int user_fault, int write_fault, int largepage, +int *ptwrite, pfn_t pfn) { - struct shadow_walker *sw = - container_of(_sw, struct shadow_walker, walker); - struct guest_walker *gw = sw-guest_walker; unsigned access = gw-pt_access; struct kvm_mmu_page *shadow_page; - u64 spte; + u64 spte, *sptep; int metaphysical; gfn_t table_gfn; int r; + int level; pt_element_t curr_pte; + struct kvm_shadow_walk_iterator iterator; - if (level == PT_PAGE_TABLE_LEVEL - || (sw-largepage level == PT_DIRECTORY_LEVEL)) { - mmu_set_spte(vcpu, sptep, access, gw-pte_access access, -sw-user_fault, sw-write_fault, -gw-ptes[gw-level-1] PT_DIRTY_MASK, -sw-ptwrite, sw-largepage, -gw-ptes[gw-level-1] PT_GLOBAL_MASK, -gw-gfn, sw-pfn, false); - sw-sptep = sptep; - return 1; - } - - if (is_shadow_present_pte(*sptep) !is_large_pte(*sptep)) - return 0; - - if (is_large_pte(*sptep)) { - set_shadow_pte(sptep, shadow_trap_nonpresent_pte); - kvm_flush_remote_tlbs(vcpu-kvm); - rmap_remove(vcpu-kvm, sptep); - } + if (!is_present_pte(gw-ptes[gw-level - 1])) + return NULL; - if (level == PT_DIRECTORY_LEVEL gw-level == PT_DIRECTORY_LEVEL) { - metaphysical = 1; - if (!is_dirty_pte(gw-ptes[level - 1])) - access = ~ACC_WRITE_MASK; - table_gfn = 
gpte_to_gfn(gw-ptes[level - 1]); - } else { - metaphysical = 0; - table_gfn = gw-table_gfn[level - 2]; - } - shadow_page = kvm_mmu_get_page(vcpu, table_gfn, (gva_t)addr, level-1, - metaphysical, access, sptep); - if (!metaphysical) { - r = kvm_read_guest_atomic(vcpu-kvm, gw-pte_gpa[level - 2], - curr_pte, sizeof(curr_pte)); - if (r || curr_pte != gw-ptes[level - 2]) { - kvm_mmu_put_page(shadow_page, sptep); - kvm_release_pfn_clean(sw-pfn); - sw-sptep = NULL; - return 1; + for_each_shadow_entry(vcpu, addr, iterator) { + level = iterator.level; + sptep = iterator.sptep; + if (level == PT_PAGE_TABLE_LEVEL + || (largepage level == PT_DIRECTORY_LEVEL)) { + mmu_set_spte(vcpu, sptep, access, +gw-pte_access access, +user_fault, write_fault, +gw-ptes[gw-level-1] PT_DIRTY_MASK, +ptwrite, largepage, +gw-ptes[gw-level-1] PT_GLOBAL_MASK, +gw-gfn, pfn, false); + break; } - } - spte = __pa(shadow_page-spt) | PT_PRESENT_MASK | PT_ACCESSED_MASK - | PT_WRITABLE_MASK | PT_USER_MASK; - *sptep = spte; - return 0; -} + if (is_shadow_present_pte(*sptep) !is_large_pte(*sptep)) + continue; -static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, -struct guest_walker *guest_walker, -int user_fault, int write_fault, int largepage, -int *ptwrite, pfn_t pfn) -{ - struct shadow_walker walker = { - .walker = { .entry = FNAME(shadow_walk_entry), }, - .guest_walker = guest_walker, - .user_fault = user_fault, - .write_fault = write_fault, - .largepage = largepage, - .ptwrite = ptwrite, -
[PATCH 5/5] KVM: MMU: Drop walk_shadow()
No longer used. Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/mmu.c | 20 1 files changed, 0 insertions(+), 20 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b4b79b0..31ebe69 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -145,11 +145,6 @@ struct kvm_rmap_desc { struct kvm_rmap_desc *more; }; -struct kvm_shadow_walk { - int (*entry)(struct kvm_shadow_walk *walk, struct kvm_vcpu *vcpu, -u64 addr, u64 *spte, int level); -}; - struct kvm_shadow_walk_iterator { u64 addr; hpa_t shadow_addr; @@ -1299,21 +1294,6 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) --iterator-level; } -static int walk_shadow(struct kvm_shadow_walk *walker, - struct kvm_vcpu *vcpu, u64 addr) -{ - struct kvm_shadow_walk_iterator iterator; - int r; - - for_each_shadow_entry(vcpu, addr, iterator) { - r = walker-entry(walker, vcpu, addr, - iterator.sptep, iterator.level); - if (r) - return r; - } - return 0; -} - static void kvm_mmu_page_unlink_children(struct kvm *kvm, struct kvm_mmu_page *sp) { -- 1.6.0.6 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/4] KVM: Using kfifo for irq recording
Sheng Yang wrote: On Thursday 25 December 2008 19:07:22 Avi Kivity wrote: Sheng Yang wrote: For MSI-X, we have to deal with multiple IRQs with the same IRQ handler, so it's necessary to record the IRQ that triggers the IRQ handler. Does MSI-X disallow coalescing two requests into one interrupt? Or can we still coalesce interrupts (perhaps by recording them as a (irq, cpu) pair?) Disallow? Not quite understand. PCI spec said OS don't need to ensure the sequence they handled is the same as they happened. This struct is used just because we lost information of irq after schedule_work... Why can't we store this information in a bitmap? There are a limited number of irqs. The only reason I can think of for using a fifo is if we want to preserve the number and ordering of interrupts. Is there another reason? @@ -313,6 +314,9 @@ struct kvm_assigned_dev_kernel { int host_irq; bool host_irq_disabled; int guest_irq; +#define KVM_ASSIGNED_DEV_IRQ_FIFO_LEN 0x100 + struct kfifo *irq_fifo; + spinlock_t irq_fifo_lock; #define KVM_ASSIGNED_DEV_GUEST_INTX(1 0) What if it runs out? What does real hardware do? I'm sure it doesn't have a 100-entry queue. 0x100 is just a simple number which I thought different interrupts of same MSI-X device can happen at same period(indeed it's 0x100/sizeof(int)). Maybe not that many. And it just used by work function later to find what guest vector is, and then inject the correlated interrupt to the guest. Maybe it's better to do the conversion immediately, so we can store the information in a structure that's not prone to overflow. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Nested KVM
On 25.12.2008, at 10:59, Avi Kivity a...@redhat.com wrote: Alexander Graf wrote: Avi, could you please apply that patch for kvm-82 too, so we get something working out? I'll take a closer look at what's broken exactly later on. I'll just revert the emulation loop patch. We can reapply it once we fix the problem. Sounds good. It was rather meant as a draft/rfc anyways :-). Alex -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [ANNOUNCE] kvm-82 release
When building KVM-82 on F10 I get the following errors: make[2]: Entering directory `/usr/src/kernels/2.6.27.9-159.fc10.x86_64' LD /opt/kvm-82/kernel/x86/built-in.o CC [M] /opt/kvm-82/kernel/x86/svm.o In file included from /opt/kvm-82/kernel/x86/external-module-compat.h:10, from command-line:2: /opt/kvm-82/kernel/x86/../external-module-compat-comm.h:587: error: conflicting types for 'hrtimer_add_expires_ns' include/linux/hrtimer.h:245: error: previous definition of 'hrtimer_add_expires_ns' was here /opt/kvm-82/kernel/x86/../external-module-compat-comm.h:592: error: conflicting types for 'hrtimer_get_expires' include/linux/hrtimer.h:250: error: previous definition of 'hrtimer_get_expires' was here /opt/kvm-82/kernel/x86/../external-module-compat-comm.h:597: error: conflicting types for 'hrtimer_get_expires_ns' include/linux/hrtimer.h:260: error: previous definition of 'hrtimer_get_expires_ns' was here /opt/kvm-82/kernel/x86/../external-module-compat-comm.h:602: error: conflicting types for 'hrtimer_start_expires' include/linux/hrtimer.h:341: error: previous definition of 'hrtimer_start_expires' was here make[4]: *** [/opt/kvm-82/kernel/x86/svm.o] Error 1 make[3]: *** [/opt/kvm-82/kernel/x86] Error 2 make[2]: *** [_module_/opt/kvm-82/kernel] Error 2 make[2]: Leaving directory `/usr/src/kernels/2.6.27.9-159.fc10.x86_64' make[1]: *** [all] Error 2 make[1]: Leaving directory `/opt/kvm-82/kernel' make: *** [kernel] Error 2 Has anyone else seen this? Mark Bidewell On Thu, Dec 25, 2008 at 5:11 AM, Avi Kivity a...@redhat.com wrote: This release adds support for nested virtualization, a feature which allows you to run kvm (and possibly other hypervisors) inside a guest. This is an experimental feature and is only available on AMD hosts. There are fixes included for a couple of minor vulnerabilities: one for the slirp stack (-net user), which is not usually used in production, and another in the vnc server, which allows malicious users to cause a VM to hang. 
Changes from kvm-81: - merge qemu-svn - uuid support - fix CVE-2007-5729 (slirp vulnerability) - fix CVE-2008-2382 (vnc denial of service) - better scsi support - pci subsystem id for pci devices - this will cause Windows guest to rediscover hardware - improved I/O parallelism - ppc kvm support - hpet support - not fully integrated yet - monitor 'info status' command - merge bochs-bios-cvs - uuid support - prepare for S3 sleep - merge vgabios-cvs - much improved guest debugging (Jan Kiszka) - both debugger in guest and debugger in host - fix kvm makefile for separate object dir (Andi Kleen) - nested svm (Alexander Graf) - run kvm in kvm in kvm... - fix ia64 register and stack access from userspace (Jes Sorensen) - don't treat a global pte as global if cr4.pge is clear - fixes Vista x86 smp failure on boot - properly lock virtual i8259 interrupt controller - fix large host pages invlpg/resync - fixes oops when using host hugetlbfs - fix vmload instruction misemulated as lidt Notes: If you use the modules bundled with kvm-82, you can use any version of Linux from 2.6.16 upwards. You may also use kvm-81 userspace with the kvm modules provided by Linux 2.6.25 or above. Some features may only be available in newer releases. http://kvm.qumranet.com -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
kvm vmload/vmsave vs tss.ist
kvm performance is largely dependent on the frequency and cost of switches between guest and host mode. The cost of a switch is greatly influenced by the amount of state we have to load and save. One of the optimizations that kvm makes in order to reduce the cost is to partition the guest state into two; let's call the two parts kernel state and user state. The kernel state consists of registers that are used for general kernel execution, for example the general purpose registers. User state consists of registers that are only used in user mode (or in the transition to user mode). When switching from guest to host, we only save and reload the kernel state, delaying reloading of user state until we actually need to switch to user mode. Since many exits are satisfied entirely in the kernel, we can avoid switching user state entirely. In effect the host kernel runs with some of the cpu registers containing guest values. The mechanism used for deferring state switch is PREEMPT_NOTIFIERS, introduced in 2.6.23 IIRC. Now, AMD SVM instructions also partition register state into two. The VMRUN instruction, which is used to switch to guest mode, loads and saves registers corresponding to kernel state. The VMLOAD and VMSAVE instructions load and save user state registers. The exact registers managed by VMLOAD and VMSAVE are: FS GS TR LDTR KernelGSBase STAR LSTAR CSTAR SFMASK SYSENTER_CS SYSENTER_ESP SYSENTER_EIP None of these registers are ever touched in 64-bit kernel mode, except gs.base (which we can save/restore manually), and TR. The only part of the TSS (pointed to by the TR) used in 64-bit mode are the seven Interrupt Stack Table (IST) entries. These are used to provide known-good stacks for critical exceptions. These critical exceptions are: debug, nmi, double fault, stack fault, and machine check. Because of this one detail, kvm must execute vmload/vmsave on every guest/host switch. Hardware architects, give yourself a pat on the back. 
The impact is even greater when using nested virtualization, since we must trap on two additional instructions on every switch. I would like to remove this limitation. I see several ways to go about it: 1. Drop the use of IST This would reduce the (perceived) reliability of the kernel and would probably not be welcomed. 2. Introduce a config item for dropping IST, and have kvm defer vmload/vmsave depending on the configuration This would pose a dilemma for kitchen sink distro kernels: kvm performance or maximum reliability? 3. Switch off IST when the first VM is created, switch it back on when the last VM is destroyed Most likely no additional code would need to be modified. It could be made conditional if someone wants to retain IST even while kvm is active. We already have hooks in place and know where the host IST is. I favor this option. 4. Some other brilliant idea? Might be even better than option 3. hpa/Ingo, any opinions? -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: kvm vmload/vmsave vs tss.ist
* Avi Kivity a...@redhat.com wrote: I would like to remove this limitation. I see several ways to go about it: 1. Drop the use of IST This would reduce the (perceived) reliability of the kernel and would probably not be welcomed. hpa/Ingo, any opinions? i think we should actually do #1 unconditionally. ISTs are bad for the native kernel too. They have various nasty complications in the stack walker (and hence they _reduce_ reliability in practice), and they are non-preemptible as well. Plus we have the maximum-stack-footprint ftrace plugin now, which can remove any perception about how bad the worst-case stack footprint is in practice. If it ever becomes an issue we could also soft-switch to a larger (per CPU) exception stack from the exception handlers themselves. The architectural stack footprint of the various critical exceptions are calculatable and low - so we could switch away and get almost the kind of separation that ISTs give. There's no deep reason to actually make use of hw switched ISTs. So feel free to send a patch that just standardizes the critical exceptions to use the regular kernel stack. (I havent actually tried this but it should be relatively simple to implement. Roadblocks are possible.) Ingo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[ kvm-Bugs-2466584 ] Guest/Host serial ports no longer working
Bugs item #2466584, was opened at 2008-12-25 10:26 Message generated for change (Tracker Item Submitted) made by Item Submitter You can respond by visiting: https://sourceforge.net/tracker/?func=detailatid=893831aid=2466584group_id=180599 Please note that this message will contain a full copy of the comment thread, including the initial issue submission, for this request, not just the latest update. Category: kernel Group: None Status: Open Resolution: None Priority: 5 Private: No Submitted By: AndrewB (andybaumhauer) Assigned to: Nobody/Anonymous (nobody) Summary: Guest/Host serial ports no longer working Initial Comment: Host: Linux, Fedora 10 2.6.27.9-159.fc10.x86_64 on Intel Core 2 Duo (Q6600) running KVM (kvm-74-10.fc10.x86_64) Guest: Windows XP Home, using qemu-kvm -M pc -m 768 -smp 2 -boot c -hda /home/vm/Windows_XP_Home.img -net nic,macaddr=00:16:3e:10:23:ee,vlan=0 -net user,vlan=0,script=,ifname=virbr0 -std-vga -soundhw es1370 -localtime -serial /dev/ttyS0 Error messages when quitting Guest OS: kvm_run: Unknown error 524 kvm_run returned -524 Summary: On Fedora 8 and Fedora 9 using KVM-65 and earlier, connections from Windows COM1 to /dev/ttyS0 worked. On Fedora 10 and KVM-74 serial connection between guest and host no longer works. How to reproduce: On host OS, use GTKTerm to access the serial port /dev/ttyS0 with a loopback adapter attached to the port. You will see characters echo'ed back. On Guest OS, use option -serial stdio and Hyperterm to access COM1, and you will see characters from Hyperterm on the Host OS terminal that started qemu-kvm (so we know that the Guest OS can send data out of KVM). On Fedora 10, if you regress to KVM-65 (from Fedora 9 repository), by: rpm -e kvm-74-10.fc10.x86_64 rpm -ihv --force gnutls-2.0.4-2.fc9.x86_64.rpm rpm -ihv kvm-65-1.fc9.x86_64.rpm Now option -serial /dev/ttyS0 will operate as expected, and Hyperterm will see characters echo'ed from the loopback hardware. 
-- You can respond by visiting: https://sourceforge.net/tracker/?func=detailatid=893831aid=2466584group_id=180599 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: kvm vmload/vmsave vs tss.ist
Ingo Molnar wrote: i think we should actually do #1 unconditionally. ISTs are bad for the native kernel too. They have various nasty complications in the stack walker (and hence they _reduce_ reliability in practice), and they are non-preemptible as well. Plus we have the maximum-stack-footprint ftrace plugin now, which can remove any perception about how bad the worst-case stack footprint is in practice. If it ever becomes an issue we could also soft-switch to a larger (per CPU) exception stack from the exception handlers themselves. The architectural stack footprint of the various critical exceptions are calculatable and low - so we could switch away and get almost the kind of separation that ISTs give. There's no deep reason to actually make use of hw switched ISTs. So feel free to send a patch that just standardizes the critical exceptions to use the regular kernel stack. (I haven't actually tried this but it should be relatively simple to implement. Roadblocks are possible.) Certainly. There is provision for a debug stack that can be larger than the normal exception stack. This is used for vectors 1 and 3. If we wish to preserve this, we need to do manual stack switching. Currently DEBUG_STKSZ is 8K, the same as the normal stack (compared to 4K for the other exception stacks). Do we need to implement stack switching for debug vectors? -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] x86: drop the use of the tss interrupt stack table (IST)
The IST is the only thing that requires a valid TSS while running in kernel mode. Dropping its use unlocks an optimization opportunity for kvm: if we don't need a valid TSS while in kernel mode we can defer the use of the VMLOAD/VMSAVE instructions until the next context switch, reducing the executions of these costly instructions by a nice factor. Kernel reliability should also be improved since interrupt paths are simplified. Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/include/asm/desc.h | 12 - arch/x86/include/asm/page_64.h |7 --- arch/x86/include/asm/processor.h | 11 arch/x86/kernel/cpu/common.c | 34 - arch/x86/kernel/dumpstack_64.c | 96 -- arch/x86/kernel/entry_64.S | 17 ++- arch/x86/kernel/traps.c | 12 ++-- 7 files changed, 10 insertions(+), 179 deletions(-) diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index e6b82b1..0465c75 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -369,18 +369,6 @@ static inline void set_task_gate(unsigned int n, unsigned int gdt_entry) _set_gate(n, GATE_TASK, (void *)0, 0, 0, (gdt_entry3)); } -static inline void set_intr_gate_ist(int n, void *addr, unsigned ist) -{ - BUG_ON((unsigned)n 0xFF); - _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS); -} - -static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist) -{ - BUG_ON((unsigned)n 0xFF); - _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS); -} - #else /* * GET_DESC_BASE reads the descriptor base of the specified segment. 
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index 5ebca29..7c89095 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -16,13 +16,6 @@ #define IRQSTACK_ORDER 2 #define IRQSTACKSIZE (PAGE_SIZE IRQSTACK_ORDER) -#define STACKFAULT_STACK 1 -#define DOUBLEFAULT_STACK 2 -#define NMI_STACK 3 -#define DEBUG_STACK 4 -#define MCE_STACK 5 -#define N_EXCEPTION_STACKS 5 /* hw limit: 7 */ - #define PUD_PAGE_SIZE (_AC(1, UL) PUD_SHIFT) #define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 5ca01e3..4ef899c 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -273,13 +273,6 @@ struct tss_struct { DECLARE_PER_CPU(struct tss_struct, init_tss); -/* - * Save the original ist values for checking stack pointers during debugging - */ -struct orig_ist { - unsigned long ist[7]; -}; - #defineMXCSR_DEFAULT 0x1f80 struct i387_fsave_struct { @@ -372,10 +365,6 @@ union thread_xstate { struct xsave_struct xsave; }; -#ifdef CONFIG_X86_64 -DECLARE_PER_CPU(struct orig_ist, orig_ist); -#endif - extern void print_cpu_info(struct cpuinfo_x86 *); extern unsigned int xstate_size; extern void free_thread_xstate(struct task_struct *); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b9c9ea0..8563c51 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -903,9 +903,6 @@ void __cpuinit pda_init(int cpu) } } -char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + - DEBUG_STKSZ] __page_aligned_bss; - extern asmlinkage void ignore_sysret(void); /* May not be marked __init: used by software suspend */ @@ -931,12 +928,6 @@ void syscall_init(void) unsigned long kernel_eflags; -/* - * Copies of the original ist values from the tss are only accessed during - * debugging, no special alignment required. 
- */ -DEFINE_PER_CPU(struct orig_ist, orig_ist); - #else /* Make sure %fs is initialized properly in idle threads */ @@ -960,17 +951,13 @@ void __cpuinit cpu_init(void) { int cpu = stack_smp_processor_id(); struct tss_struct *t = per_cpu(init_tss, cpu); - struct orig_ist *orig_ist = per_cpu(orig_ist, cpu); unsigned long v; - char *estacks = NULL; struct task_struct *me; int i; /* CPU 0 is initialised in head64.c */ if (cpu != 0) pda_init(cpu); - else - estacks = boot_exception_stacks; me = current; @@ -1000,27 +987,6 @@ void __cpuinit cpu_init(void) if (cpu != 0 x2apic) enable_x2apic(); - /* -* set up and load the per-CPU TSS -*/ - if (!orig_ist-ist[0]) { - static const unsigned int order[N_EXCEPTION_STACKS] = { - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, - [DEBUG_STACK - 1] = DEBUG_STACK_ORDER - }; - for (v = 0; v N_EXCEPTION_STACKS; v++) { - if (cpu) { - estacks = (char *)__get_free_pages(GFP_ATOMIC,
Re: kvm vmload/vmsave vs tss.ist
* Avi Kivity a...@redhat.com wrote: Ingo Molnar wrote: i think we should actually do #1 unconditionally. ISTs are bad for the native kernel too. They have various nasty complications in the stack walker (and hence they _reduce_ reliability in practice), and they are non-preemptible as well. Plus we have the maximum-stack-footprint ftrace plugin now, which can remove any perception about how bad the worst-case stack footprint is in practice. If it ever becomes an issue we could also soft-switch to a larger (per CPU) exception stack from the exception handlers themselves. The architectural stack footprint of the various critical exceptions are calculatable and low - so we could switch away and get almost the kind of separation that ISTs give. There's no deep reason to actually make use of hw switched ISTs. So feel free to send a patch that just standardizes the critical exceptions to use the regular kernel stack. (I havent actually tried this but it should be relatively simple to implement. Roadblocks are possible.) Certainly. There is provision for a debug stack that can be larger than the normal exception stack. This is used for vectors 1 and 3. If we wish to preserve this, we need to to manual stack switching. Currently DEBUG_STKSZ is 8K, the same as the normal stack (compared to 4K for the other execption stacks). Do we need to implement stack switching for debug vectors? i'd suggest to reuse the irq-stacks for this. Right now on 64-bit we've got the following stack layout: 8K process stacks, a 16K IRQ stack on each CPU, shared by all IRQs. Then we have the IST stacks with weird sizes: debug:8K, the others: 4K. Then all the unnecessary IST complications can be removed. If nesting ever becomes an issue, the IRQ stack size can be doubled to 32K. This way we save some small amount of RAM too (right now the IST stacks take up 28K of RAM per CPU), and reduce complexity and fragility quite visibly. 
And help KVM ;-) Ingo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[ kvm-Bugs-2001121 ] Windows 2003 x64 - SESSION5_INITIALIZATION_FAILED
Bugs item #2001121, was opened at 2008-06-23 21:09 Message generated for change (Comment added) made by masc82 You can respond by visiting: https://sourceforge.net/tracker/?func=detailatid=893831aid=2001121group_id=180599 Please note that this message will contain a full copy of the comment thread, including the initial issue submission, for this request, not just the latest update. Category: intel Group: None Status: Open Resolution: None Priority: 5 Private: No Submitted By: Andreas 'ac0v' Specht (ac0v) Assigned to: Nobody/Anonymous (nobody) Summary: Windows 2003 x64 - SESSION5_INITIALIZATION_FAILED Initial Comment: Host Machine: CPU:2x Intel(R) Xeon(R) CPU E5405 @ 2.00GHz Kernel: Linux version 2.6.25-gentoo-r4 Arch: x86_64 KVM:tried kvm-69 and kvm-70 Guest System: tried Windows 2003 x64 and Windows 2003 x64 with slipstreamed Service Pack 2 Hi, I get a BSoD (see attachment) while installing Windows 2003 x64 which contains the error message SESSION5_INITIALIZATION_FAILED Serial log is empty. I start my KVM via this command: kvm -hda /dev/lvg1/sap-test -boot d -cdrom /srv/install/iso/windows/2003-server-x64.iso -vnc :4 -m 3048 -smp 4 -daemonize Using -no-kvm or the -no-kvm-pit switch doesn't help and shows only the message Setup is starting Windows. The -no-kvm-irqchip switch has no effect (same BSoD). Any Ideas? Regards, Andreas 'ac0v' Specht -- Comment By: MaSc82 (masc82) Date: 2008-12-25 17:35 Message: Updated to 2.6.28 including kvm modules, which seem to work very well with kvm81, at the same time supporting win2003 x64, so all mentioned issues are resolved for me, but only when using the kvm modules of linux kernel 2.6.28. -- Comment By: MaSc82 (masc82) Date: 2008-12-22 16:58 Message: I've got the same issue with kvm-81 and Linux version 2.6.27-gentoo-r7. The problem does not occur when using the kvm modules coming with the kernel, but these (probably older?) modules still have bugs with smp and block device virtio (temporary freeze).. 
Can anyone shed some light on this, please? -- You can respond by visiting: https://sourceforge.net/tracker/?func=detailatid=893831aid=2001121group_id=180599 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: kvm vmload/vmsave vs tss.ist
* Ingo Molnar mi...@elte.hu wrote: i'd suggest to reuse the irq-stacks for this. Right now on 64-bit we've got the following stack layout: 8K process stacks, a 16K IRQ stack on each CPU, shared by all IRQs. Then we have the IST stacks with weird sizes: debug:8K, the others: 4K. this has to be done carefully though, as there's a subtle detail here: right now the pda_irqcount and the pda_irqstackptr logic in entry_64.S is not re-entry safe and relies on IRQs being off. If critical exceptions are moved to the IRQ stack then %rsp switching to the IRQ stack has to be done atomically: instead of using the pda_irqcount check the %rsp value itself should be checked against pda_irqstackptr - if it's within that 16K range then we are already on the IRQ stack and do not need to switch to it but can just use the current %rsp. Ingo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: kvm vmload/vmsave vs tss.ist
Ingo Molnar wrote: * Ingo Molnar mi...@elte.hu wrote: i'd suggest to reuse the irq-stacks for this. Right now on 64-bit we've got the following stack layout: 8K process stacks, a 16K IRQ stack on each CPU, shared by all IRQs. Then we have the IST stacks with weird sizes: debug:8K, the others: 4K. this has to be done carefully though, as there's a subtle detail here: right now the pda_irqcount and the pda_irqstackptr logic in entry_64.S is not re-entry safe and relies on IRQs being off. If critical exceptions are moved to the IRQ stack then %rsp switching to the IRQ stack has to be done atomically: instead of using the pda_irqcount check the %rsp value itself should be checked against pda_irqstackptr - if it's within that 16K range then we are already on the IRQ stack and do not need to switch to it but can just use the current %rsp. I think it's enough to switch %rsp before incrementing irqcount, no? -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: kvm vmload/vmsave vs tss.ist
* Avi Kivity a...@redhat.com wrote: Ingo Molnar wrote: * Ingo Molnar mi...@elte.hu wrote: i'd suggest to reuse the irq-stacks for this. Right now on 64-bit we've got the following stack layout: 8K process stacks, a 16K IRQ stack on each CPU, shared by all IRQs. Then we have the IST stacks with weird sizes: debug:8K, the others: 4K. this has to be done carefully though, as there's a subtle detail here: right now the pda_irqcount and the pda_irqstackptr logic in entry_64.S is not re-entry safe and relies on IRQs being off. If critical exceptions are moved to the IRQ stack then %rsp switching to the IRQ stack has to be done atomically: instead of using the pda_irqcount check the %rsp value itself should be checked against pda_irqstackptr - if it's within that 16K range then we are already on the IRQ stack and do not need to switch to it but can just use the current %rsp. I think it's enough to switch %rsp before incrementing irqcount, no? no - that would introduce a small race: if an exception (say an NMI or MCE, or a debug trap) happens in that small window then the exception context thinks that it's on the IRQ stack already, and would use the task stack. So if we want to move them to IRQ stacks all the time, we have to check that condition atomically - the safest way of which is to check RSP against the (static) pda:[irqstackptr-16K+64..irqstackptr] range. Ingo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: kvm vmload/vmsave vs tss.ist
Ingo Molnar wrote: I think it's enough to switch %rsp before incrementing irqcount, no? no - that would introduce a small race: if an exception (say an NMI or MCE, or a debug trap) happens in that small window then the exception context thinks that it's on the IRQ stack already, and would use the task stack. I'm suggesting check irqcount if (wasnt_in_irq) rsp = irqstack ++irqcount If the NMI happens before the increment, we'll switch the stack unconditionally, and if the NMI happens after the increment, then we won't switch the stack, but we're guaranteed to be on the irqstack anyway. The window size is negative :) Similarly, the exit path should be oldstack_reg = oldstack; --irqcount; rsp = oldstack_register; To guarantee that by the time we decrement irqcount, we don't need the stack anymore. -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [ANNOUNCE] kvm-82 release
Mark Bidewell mark.bidewell at alumni.clemson.edu writes: When building KVM-82 on F10 I get the following errors: make[2]: Entering directory `/usr/src/kernels/2.6.27.9-159.fc10.x86_64' LD /opt/kvm-82/kernel/x86/built-in.o CC [M] /opt/kvm-82/kernel/x86/svm.o In file included from /opt/kvm-82/kernel/x86/external-module-compat.h:10, from command-line:2: /opt/kvm-82/kernel/x86/../external-module-compat-comm.h:587: error: conflicting types for 'hrtimer_add_expires_ns' include/linux/hrtimer.h:245: error: previous definition of 'hrtimer_add_expires_ns' was here ... make: *** [kernel] Error 2 Has anyone else seen this? the same problem exists with the latest stock kernel on fedora 9. you may comment out the conflicting definitions in kvm-82/kernel/external-module-compat-comm.h to fix the build problem. bye, Andreas Winkelbauer -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: kvm vmload/vmsave vs tss.ist
* Avi Kivity a...@redhat.com wrote: Ingo Molnar wrote: I think it's enough to switch %rsp before incrementing irqcount, no? no - that would introduce a small race: if an exception (say an NMI or MCE, or a debug trap) happens in that small window then the exception context thinks that it's on the IRQ stack already, and would use the task stack. I'm suggesting check irqcount if (wasnt_in_irq) rsp = irqstack ++irqcount If the NMI happens before the increment, we'll switch the stack unconditionally, and if the NMI happens after the increment, then we won't switch the stack, but we're guaranteed to be on the irqstack anyway. The window size is negative :) Similarly, the exit path should be oldstack_reg = oldstack; --irqcount; rsp = oldstack_register; To guarantee that by the time we decrement irqcount, we don't need the stack anymore. agreed, something like this would work too. My suggestion, to eliminate irqcount altogether and just check RSP against the known-irqstack-range, could result in slightly smaller (and thus faster) code, but it's a marginal difference at best. Ingo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: kvm vmload/vmsave vs tss.ist
Avi Kivity wrote: I'm suggesting check irqcount if (wasnt_in_irq) rsp = irqstack ++irqcount If the NMI happens before the increment, we'll switch the stack unconditionally, and if the NMI happens after the increment, then we won't switch the stack, but we're guaranteed to be on the irqstack anyway. The window size is negative :) Similarly, the exit path should be oldstack_reg = oldstack; --irqcount; rsp = oldstack_register; To guarantee that by the time we decrement irqcount, we don't need the stack anymore. On the other hand, checking %rsp allows us to drop irqcount completely, so maybe it's better. -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Randomly freezing guests - Workaround?
Hi all, 3 weeks ago I asked for help in #kvm. I have two AMD64 machines running up to 10 guests (each) via kvm/libvirt. Some of my guest machines (most of them Debian etch/lenny) randomly froze. Sometimes this happened after several days, sometimes shortly after the guest had been started. Someone at #kvm told me to switch the current clocksource (which had been kvm-clock) to acpi_pm. My guests did not freeze since then! Thanks for your support and this excellent piece of software! Regards, Mario -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [ANNOUNCE] kvm-82 release
Avi Kivity wrote: This release adds support for nested virtualization, a feature which allows you to run kvm (and possibly other hypervisors) inside a guest. This is an experimental feature and is only available on AMD hosts. There are fixes included for a couple of minor vulnerabilities: one for the slirp stack (-net user), which is not usually used in production, and another in the vnc server, which allows malicious users to cause a VM to hang. on centos-5, kernel/include-compat/asm/msr-index.h gives dozens of such warnings during compile: In file included from /home/robot/rpm/BUILD/kvm-kmod-82/_kmod_build_/kernel/include/asm/kvm_host.h:65, from /home/robot/rpm/BUILD/kvm-kmod-82/_kmod_build_/kernel/include/linux/kvm_host.h:67, from /home/robot/rpm/BUILD/kvm-kmod-82/_kmod_build_/kernel/x86/lapic.c:60: /home/robot/rpm/BUILD/kvm-kmod-82/_kmod_build_/kernel/include-compat/asm/msr-index.h:304:1: warning: MSR_P4_U2L_ESCR0 redefined In file included from include/asm/processor.h:16, from /home/robot/rpm/BUILD/kvm-kmod-82/_kmod_build_/kernel/include/asm/kvm_para.h:89, from /home/robot/rpm/BUILD/kvm-kmod-82/_kmod_build_/kernel/include/linux/kvm_para.h:63, from /home/robot/rpm/BUILD/kvm-kmod-82/_kmod_build_/kernel/x86/../external-module-compat-comm.h:14, from /home/robot/rpm/BUILD/kvm-kmod-82/_kmod_build_/kernel/x86/external-module-compat.h:9, from command line:1: include/asm/msr.h:407:1: warning: this is the location of the previous definition In file included from /home/robot/rpm/BUILD/kvm-kmod-82/_kmod_build_/kernel/include/asm/kvm_host.h:65, from /home/robot/rpm/BUILD/kvm-kmod-82/_kmod_build_/kernel/include/linux/kvm_host.h:67, from /home/robot/rpm/BUILD/kvm-kmod-82/_kmod_build_/kernel/x86/lapic.c:60: /home/robot/rpm/BUILD/kvm-kmod-82/_kmod_build_/kernel/include-compat/asm/msr-index.h:305:1: warning: MSR_P4_U2L_ESCR1 redefined In file included from include/asm/processor.h:16, from /home/robot/rpm/BUILD/kvm-kmod-82/_kmod_build_/kernel/include/asm/kvm_para.h:89, from 
/home/robot/rpm/BUILD/kvm-kmod-82/_kmod_build_/kernel/include/linux/kvm_para.h:63, from /home/robot/rpm/BUILD/kvm-kmod-82/_kmod_build_/kernel/x86/../external-module-compat-comm.h:14, from /home/robot/rpm/BUILD/kvm-kmod-82/_kmod_build_/kernel/x86/external-module-compat.h:9, -- Levente Si vis pacem para bellum! -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] Re: [PATCH 1/5] re-register whole area upon lfb unmap.
2008/12/17 Anthony Liguori anth...@codemonkey.ws: Glauber Costa wrote: set phys_offset correctly for the whole vga area when unmapping linear vram (for vga optimization). We first register the old pieces as unassigned memory, to make things easier for kvm (and possibly other slot based implementations in the future). Replacing the region directly would make the slot management significantly more complex. This change worries me because it involves explicitly unassigning slots and then assigning a new, bigger slot. This is not necessary for TCG. It suggests to me that there's a bug in the kvm slot code and that we're changing QEMU to work around it. That will mean there may be other places in the code that are completely valid, but exercise this bug. Or is this purely an optimization? It also changes the semantics because IO callbacks are now passed offsets from region starts instead of absolute addresses. I'm not able to tell if the change is for good or for bad though. Cheers -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/3] Remove interrupt stack table usage from x86_64 kernel
The interrupt stack table (IST) mechanism is the only thing preventing kvm from deferring saving and reloading of some significant state. It is also somewhat complicated. Remove it by switching the special exceptions to use the normal irqstack. Avi Kivity (3): x86: drop the use of the tss interrupt stack table (IST) x86: Remove pda.irqcount x86: Switch critical exceptions and NMI to irqstack arch/x86/include/asm/desc.h | 12 - arch/x86/include/asm/page_64.h |7 --- arch/x86/include/asm/pda.h |2 +- arch/x86/include/asm/processor.h | 11 arch/x86/kernel/asm-offsets_64.c |1 - arch/x86/kernel/cpu/common.c | 35 -- arch/x86/kernel/dumpstack_64.c | 96 -- arch/x86/kernel/entry_64.S | 49 --- arch/x86/kernel/traps.c | 12 ++-- 9 files changed, 27 insertions(+), 198 deletions(-) -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/3] x86: Remove pda.irqcount
pda.irqcount is used to test whether we need to switch to an irqstack or not. We can do without it, however, by testing %rsp directly: if it's already within the irqstack range we don't need to stacks. This makes switching the nmi handler to use the irqstack easier. Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/include/asm/pda.h |2 +- arch/x86/kernel/asm-offsets_64.c |1 - arch/x86/kernel/cpu/common.c |1 - arch/x86/kernel/entry_64.S | 29 + 4 files changed, 14 insertions(+), 19 deletions(-) diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h index 2fbfff8..2099610 100644 --- a/arch/x86/include/asm/pda.h +++ b/arch/x86/include/asm/pda.h @@ -14,7 +14,7 @@ struct x8664_pda { address */ unsigned long kernelstack; /* 16 top of kernel stack for current */ unsigned long oldrsp; /* 24 user rsp for system call */ - int irqcount; /* 32 Irq nesting counter. Starts -1 */ + int unused; /* 32 for rent */ unsigned int cpunumber; /* 36 Logical CPU number */ #ifdef CONFIG_CC_STACKPROTECTOR unsigned long stack_canary; /* 40 stack canary value */ diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 7fcf63d..779d010 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -50,7 +50,6 @@ int main(void) ENTRY(kernelstack); ENTRY(oldrsp); ENTRY(pcurrent); - ENTRY(irqcount); ENTRY(cpunumber); ENTRY(irqstackptr); ENTRY(data_offset); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 8563c51..6313d03 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -877,7 +877,6 @@ void __cpuinit pda_init(int cpu) mb(); pda-cpunumber = cpu; - pda-irqcount = -1; pda-kernelstack = (unsigned long)stack_thread_info() - PDA_STACKOFFSET + THREAD_SIZE; pda-active_mm = init_mm; diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 8c882e1..245fecd 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -627,6 +627,15 @@ 
END(stub_rt_sigreturn) vector already pushed) */ #define XCPT_FRAME _frame ORIG_RAX + .macro enter_irqstack scratch + mov %gs:pda_irqstackptr, \scratch + sub %rsp, \scratch + cmp $IRQSTACKSIZE-64, \scratch + jbe 1234f + mov %gs:pda_irqstackptr, %rsp +1234: + .endm + /* * Interrupt entry/exit. * @@ -655,14 +664,7 @@ END(stub_rt_sigreturn) testl $3,CS(%rdi) je 1f SWAPGS - /* irqcount is used to check if a CPU is already on an interrupt - stack or not. While this is essentially redundant with preempt_count - it is a little cheaper to use a separate counter in the PDA - (short of moving irq_enter into assembly, which would be too - much work) */ -1: incl%gs:pda_irqcount - cmoveq %gs:pda_irqstackptr,%rsp - push%rbp# backlink for old unwinder +1: enter_irqstack %rax /* * We entered an interrupt context - irqs are off: */ @@ -677,7 +679,6 @@ ENTRY(common_interrupt) ret_from_intr: DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - decl %gs:pda_irqcount leaveq CFI_DEF_CFA_REGISTERrsp CFI_ADJUST_CFA_OFFSET -8 @@ -1325,14 +1326,12 @@ ENTRY(call_softirq) CFI_REL_OFFSET rbp,0 mov %rsp,%rbp CFI_DEF_CFA_REGISTER rbp - incl %gs:pda_irqcount - cmove %gs:pda_irqstackptr,%rsp + enter_irqstack %rax push %rbp # backlink for old unwinder call __do_softirq leaveq CFI_DEF_CFA_REGISTERrsp CFI_ADJUST_CFA_OFFSET -8 - decl %gs:pda_irqcount ret CFI_ENDPROC ENDPROC(call_softirq) @@ -1369,15 +1368,13 @@ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) movq %rdi, %rsp# we don't return, adjust the stack frame CFI_ENDPROC CFI_DEFAULT_STACK -11:incl %gs:pda_irqcount - movq %rsp,%rbp +11:movq %rsp,%rbp CFI_DEF_CFA_REGISTER rbp - cmovzq %gs:pda_irqstackptr,%rsp + enter_irqstack %rax pushq %rbp # backlink for old unwinder call xen_evtchn_do_upcall popq %rsp CFI_DEF_CFA_REGISTER rsp - decl %gs:pda_irqcount jmp error_exit CFI_ENDPROC END(do_hypervisor_callback) -- 1.6.0.6 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to 
majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/3] x86: Switch critical exceptions and NMI to irqstack
With the special exception stacks gone, the irqstack is a much safer place than the regular task stacks. Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kernel/entry_64.S |3 +++ 1 files changed, 3 insertions(+), 0 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 245fecd..8f40593 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -917,7 +917,10 @@ END(spurious_interrupt) movq %rsp,%rdi movq ORIG_RAX(%rsp),%rsi movq $-1,ORIG_RAX(%rsp) + mov %rsp, %rbp + enter_irqstack %rax call \sym + mov %rbp, %rsp DISABLE_INTERRUPTS(CLBR_NONE) .if \irqtrace TRACE_IRQS_OFF -- 1.6.0.6 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/3] x86: drop the use of the tss interrupt stack table (IST)
The IST is the only thing that requires a valid TSS while running in kernel mode. Dropping its use unlocks an optimization opportunity for kvm: if we don't need a valid TSS while in kernel mode we can defer the use of the VMLOAD/VMSAVE instructions until the next context switch, reducing the executions of these costly instructions by a nice factor. Kernel reliability should also be improved since interrupt paths are simplified. Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/include/asm/desc.h | 12 - arch/x86/include/asm/page_64.h |7 --- arch/x86/include/asm/processor.h | 11 arch/x86/kernel/cpu/common.c | 34 - arch/x86/kernel/dumpstack_64.c | 96 -- arch/x86/kernel/entry_64.S | 17 ++- arch/x86/kernel/traps.c | 12 ++-- 7 files changed, 10 insertions(+), 179 deletions(-) diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index e6b82b1..0465c75 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -369,18 +369,6 @@ static inline void set_task_gate(unsigned int n, unsigned int gdt_entry) _set_gate(n, GATE_TASK, (void *)0, 0, 0, (gdt_entry3)); } -static inline void set_intr_gate_ist(int n, void *addr, unsigned ist) -{ - BUG_ON((unsigned)n 0xFF); - _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS); -} - -static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist) -{ - BUG_ON((unsigned)n 0xFF); - _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS); -} - #else /* * GET_DESC_BASE reads the descriptor base of the specified segment. 
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index 5ebca29..7c89095 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -16,13 +16,6 @@ #define IRQSTACK_ORDER 2 #define IRQSTACKSIZE (PAGE_SIZE IRQSTACK_ORDER) -#define STACKFAULT_STACK 1 -#define DOUBLEFAULT_STACK 2 -#define NMI_STACK 3 -#define DEBUG_STACK 4 -#define MCE_STACK 5 -#define N_EXCEPTION_STACKS 5 /* hw limit: 7 */ - #define PUD_PAGE_SIZE (_AC(1, UL) PUD_SHIFT) #define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 5ca01e3..4ef899c 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -273,13 +273,6 @@ struct tss_struct { DECLARE_PER_CPU(struct tss_struct, init_tss); -/* - * Save the original ist values for checking stack pointers during debugging - */ -struct orig_ist { - unsigned long ist[7]; -}; - #defineMXCSR_DEFAULT 0x1f80 struct i387_fsave_struct { @@ -372,10 +365,6 @@ union thread_xstate { struct xsave_struct xsave; }; -#ifdef CONFIG_X86_64 -DECLARE_PER_CPU(struct orig_ist, orig_ist); -#endif - extern void print_cpu_info(struct cpuinfo_x86 *); extern unsigned int xstate_size; extern void free_thread_xstate(struct task_struct *); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b9c9ea0..8563c51 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -903,9 +903,6 @@ void __cpuinit pda_init(int cpu) } } -char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + - DEBUG_STKSZ] __page_aligned_bss; - extern asmlinkage void ignore_sysret(void); /* May not be marked __init: used by software suspend */ @@ -931,12 +928,6 @@ void syscall_init(void) unsigned long kernel_eflags; -/* - * Copies of the original ist values from the tss are only accessed during - * debugging, no special alignment required. 
- */ -DEFINE_PER_CPU(struct orig_ist, orig_ist); - #else /* Make sure %fs is initialized properly in idle threads */ @@ -960,17 +951,13 @@ void __cpuinit cpu_init(void) { int cpu = stack_smp_processor_id(); struct tss_struct *t = per_cpu(init_tss, cpu); - struct orig_ist *orig_ist = per_cpu(orig_ist, cpu); unsigned long v; - char *estacks = NULL; struct task_struct *me; int i; /* CPU 0 is initialised in head64.c */ if (cpu != 0) pda_init(cpu); - else - estacks = boot_exception_stacks; me = current; @@ -1000,27 +987,6 @@ void __cpuinit cpu_init(void) if (cpu != 0 x2apic) enable_x2apic(); - /* -* set up and load the per-CPU TSS -*/ - if (!orig_ist-ist[0]) { - static const unsigned int order[N_EXCEPTION_STACKS] = { - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, - [DEBUG_STACK - 1] = DEBUG_STACK_ORDER - }; - for (v = 0; v N_EXCEPTION_STACKS; v++) { - if (cpu) { - estacks = (char *)__get_free_pages(GFP_ATOMIC,
Re: [PATCH 1/4] KVM: Using kfifo for irq recording
On Thursday 25 December 2008 21:26:29 Avi Kivity wrote: Sheng Yang wrote: On Thursday 25 December 2008 19:07:22 Avi Kivity wrote: Sheng Yang wrote: For MSI-X, we have to deal with multiple IRQs with the same IRQ handler, so it's necessary to record the IRQ that triggers the IRQ handler. Does MSI-X disallow coalescing two requests into one interrupt? Or can we still coalesce interrupts (perhaps by recording them as a (irq, cpu) pair?) Disallow? Not quite understand. PCI spec said OS don't need to ensure the sequence they handled is the same as they happened. This struct is used just because we lost information of irq after schedule_work... Why can't we store this information in a bitmap? There are a limited number of irqs. The only reason I can think of for using a fifo is if we want to preserve the number and ordering of interrupts. Is there another reason? Well, I just think using fifo is more generic and unify the logic of three type of interrupt easily, something seems more elegant. @@ -313,6 +314,9 @@ struct kvm_assigned_dev_kernel { int host_irq; bool host_irq_disabled; int guest_irq; +#define KVM_ASSIGNED_DEV_IRQ_FIFO_LEN0x100 + struct kfifo *irq_fifo; + spinlock_t irq_fifo_lock; #define KVM_ASSIGNED_DEV_GUEST_INTX (1 0) What if it runs out? What does real hardware do? I'm sure it doesn't have a 100-entry queue. 0x100 is just a simple number which I thought different interrupts of same MSI-X device can happen at same period(indeed it's 0x100/sizeof(int)). Maybe not that many. And it just used by work function later to find what guest vector is, and then inject the correlated interrupt to the guest. Maybe it's better to do the conversion immediately, so we can store the information in a structure that's not prone to overflow. OK. I would give a bitmap to kvm struct with gsi_msg which is unable to overflow. 
-- regards Yang, Sheng -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 14/15] KVM: Replace host_irq_disable with a new flag
(I discard irq_fifo and change a method to fix this problem) We can reused the field state later. Signed-off-by: Sheng Yang sh...@linux.intel.com --- include/linux/kvm_host.h |3 ++- virt/kvm/kvm_main.c |8 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index fbf102c..58e4b7e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -311,13 +311,14 @@ struct kvm_assigned_dev_kernel { int host_busnr; int host_devfn; int host_irq; - bool host_irq_disabled; int guest_irq; #define KVM_ASSIGNED_DEV_GUEST_INTX(1 0) #define KVM_ASSIGNED_DEV_GUEST_MSI (1 1) #define KVM_ASSIGNED_DEV_HOST_INTX (1 8) #define KVM_ASSIGNED_DEV_HOST_MSI (1 9) unsigned long irq_requested_type; +#define KVM_ASSIGNED_DEV_HOST_IRQ_DISABLED (1 0) + unsigned long state; int irq_source_id; struct pci_dev *dev; struct kvm *kvm; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a51e630..065af2d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -114,7 +114,7 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) if (assigned_dev-irq_requested_type KVM_ASSIGNED_DEV_GUEST_MSI) { enable_irq(assigned_dev-host_irq); - assigned_dev-host_irq_disabled = false; + assigned_dev-state = ~KVM_ASSIGNED_DEV_HOST_IRQ_DISABLED; } mutex_unlock(assigned_dev-kvm-lock); @@ -131,7 +131,7 @@ static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) schedule_work(assigned_dev-interrupt_work); disable_irq_nosync(irq); - assigned_dev-host_irq_disabled = true; + assigned_dev-state |= KVM_ASSIGNED_DEV_HOST_IRQ_DISABLED; return IRQ_HANDLED; } @@ -152,9 +152,9 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) /* The guest irq may be shared so this ack may be * from another device. 
*/ - if (dev-host_irq_disabled) { + if (dev-state KVM_ASSIGNED_DEV_HOST_IRQ_DISABLED) { enable_irq(dev-host_irq); - dev-host_irq_disabled = false; + dev-state = ~KVM_ASSIGNED_DEV_HOST_IRQ_DISABLED; } } -- 1.5.4.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 15/15] KVM: Fix racy in kvm_free_assigned_irq
Thanks to Marcelo's observation, the following code has a potential issue: if (cancel_work_sync(assigned_dev-interrupt_work)) kvm_put_kvm(kvm); In fact, cancel_work_sync() would return true if either the work struct is only scheduled or the callback of the work struct is executed. This code only considers the former situation. Also, we have a window between cancel_work_sync() and free_irq. This patch fixes both of them. Signed-off-by: Sheng Yang sh...@linux.intel.com --- include/linux/kvm_host.h |1 + virt/kvm/kvm_main.c | 34 ++ 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 58e4b7e..e0775b9 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -318,6 +318,7 @@ struct kvm_assigned_dev_kernel { #define KVM_ASSIGNED_DEV_HOST_MSI (1 9) unsigned long irq_requested_type; #define KVM_ASSIGNED_DEV_HOST_IRQ_DISABLED (1 0) +#define KVM_ASSIGNED_DEV_IRQ_GOT_KVM (1 1) unsigned long state; int irq_source_id; struct pci_dev *dev; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 065af2d..9ffa601 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -119,6 +119,7 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) mutex_unlock(assigned_dev-kvm-lock); kvm_put_kvm(assigned_dev-kvm); + assigned_dev-state = ~KVM_ASSIGNED_DEV_IRQ_GOT_KVM; } static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) @@ -126,7 +127,15 @@ static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) struct kvm_assigned_dev_kernel *assigned_dev = (struct kvm_assigned_dev_kernel *) dev_id; + /* +* In kvm_free_device_irq, cancel_work_sync return true if: +* 1. work is scheduled, and then cancelled. +* 2. work callback is executed. +* +* We need to call kvm_put_kvm() for the former, but not the later. 
+*/ kvm_get_kvm(assigned_dev-kvm); + assigned_dev-state |= KVM_ASSIGNED_DEV_IRQ_GOT_KVM; schedule_work(assigned_dev-interrupt_work); @@ -173,10 +182,27 @@ static void kvm_free_assigned_irq(struct kvm *kvm, if (!assigned_dev-irq_requested_type) return; - if (cancel_work_sync(assigned_dev-interrupt_work)) - /* We had pending work. That means we will have to take -* care of kvm_put_kvm. -*/ + /* +* We need to ensure: kvm_put_kvm() paired with kvm_get_kvm() in +* kvm_assigned_dev_intr, and no more interrupt after we cancelled +* current one. +* +* Here we have two possiblities for cancel_work_sync() return true: +* 1. The work is scheduled, but callback haven't been called. We need +* to call kvm_put_kvm() here. And IRQ is already disabled without +* doubt. +* +* 2. The callback have executed, here we don't need to call +* kvm_put_kvm(), but we may need to disable irq(e.g. for MSI). +* +* We judge the two condition according assigned_dev-state. And we +* disable irq here anyway, and it may resulted in IRQ nested disable, +* but it's fine, for we are going to free it. +*/ + disable_irq_nosync(assigned_dev-host_irq); + + if (cancel_work_sync(assigned_dev-interrupt_work) + assigned_dev-state KVM_ASSIGNED_DEV_IRQ_GOT_KVM) kvm_put_kvm(kvm); free_irq(assigned_dev-host_irq, (void *)assigned_dev); -- 1.5.4.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] KVM: userspace: Remove duplicated functionality for cpuid processing
host_cpuid is now available in target-i386/helper.c. Remove the duplicated code now in kvm-specific code. Signed-off-by: Amit Shah amit.s...@redhat.com --- qemu/qemu-kvm-x86.c | 70 --- 1 files changed, 0 insertions(+), 70 deletions(-) diff --git a/qemu/qemu-kvm-x86.c b/qemu/qemu-kvm-x86.c index aa36be8..1bf86e1 100644 --- a/qemu/qemu-kvm-x86.c +++ b/qemu/qemu-kvm-x86.c @@ -451,39 +451,6 @@ void kvm_arch_save_regs(CPUState *env) } } -static void host_cpuid(uint32_t function, uint32_t *eax, uint32_t *ebx, - uint32_t *ecx, uint32_t *edx) -{ -uint32_t vec[4]; - -#ifdef __x86_64__ -asm volatile(cpuid -: =a(vec[0]), =b(vec[1]), - =c(vec[2]), =d(vec[3]) -: 0(function) : cc); -#else -asm volatile(pusha \n\t -cpuid \n\t -mov %%eax, 0(%1) \n\t -mov %%ebx, 4(%1) \n\t -mov %%ecx, 8(%1) \n\t -mov %%edx, 12(%1) \n\t -popa -: : a(function), S(vec) -: memory, cc); -#endif - -if (eax) - *eax = vec[0]; -if (ebx) - *ebx = vec[1]; -if (ecx) - *ecx = vec[2]; -if (edx) - *edx = vec[3]; -} - - static void do_cpuid_ent(struct kvm_cpuid_entry *e, uint32_t function, CPUState *env) { @@ -494,43 +461,6 @@ static void do_cpuid_ent(struct kvm_cpuid_entry *e, uint32_t function, e-ebx = env-regs[R_EBX]; e-ecx = env-regs[R_ECX]; e-edx = env-regs[R_EDX]; -if (function == 0x8001) { - uint32_t h_eax, h_edx; - - host_cpuid(function, h_eax, NULL, NULL, h_edx); - - // long mode - if ((h_edx 0x2000) == 0 || !lm_capable_kernel) - e-edx = ~0x2000u; - // syscall - if ((h_edx 0x0800) == 0) - e-edx = ~0x0800u; - // nx - if ((h_edx 0x0010) == 0) - e-edx = ~0x0010u; - // svm - if (!kvm_nested (e-ecx 4)) - e-ecx = ~4u; -} -// sysenter isn't supported on compatibility mode on AMD. and syscall -// isn't supported in compatibility mode on Intel. so advertise the -// actuall cpu, and say goodbye to migration between different vendors -// is you use compatibility mode. 
-if (function == 0) { - uint32_t bcd[3]; - - host_cpuid(0, NULL, bcd[0], bcd[1], bcd[2]); - e-ebx = bcd[0]; - e-ecx = bcd[1]; - e-edx = bcd[2]; -} -// Hypervisor present bit for Microsoft guests -if (function == 1) - e-ecx |= (1u 31); - -// 3dnow isn't properly emulated yet -if (function == 0x8001) - e-edx = ~0xc000; } struct kvm_para_features { -- 1.5.4.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html