[COMMIT master] KVM: Convert irq notifiers lists to RCU locking
From: Gleb Natapov g...@redhat.com Use RCU locking for mask/ack notifiers lists. Signed-off-by: Gleb Natapov g...@redhat.com Signed-off-by: Avi Kivity a...@redhat.com diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index f019725..6c94614 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -183,19 +183,19 @@ void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) rcu_read_lock(); gsi = rcu_dereference(kvm-irq_routing)-chip[irqchip][pin]; - rcu_read_unlock(); - if (gsi != -1) - hlist_for_each_entry(kian, n, kvm-irq_ack_notifier_list, link) + hlist_for_each_entry_rcu(kian, n, kvm-irq_ack_notifier_list, +link) if (kian-gsi == gsi) kian-irq_acked(kian); + rcu_read_unlock(); } void kvm_register_irq_ack_notifier(struct kvm *kvm, struct kvm_irq_ack_notifier *kian) { mutex_lock(kvm-irq_lock); - hlist_add_head(kian-link, kvm-irq_ack_notifier_list); + hlist_add_head_rcu(kian-link, kvm-irq_ack_notifier_list); mutex_unlock(kvm-irq_lock); } @@ -203,8 +203,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm, struct kvm_irq_ack_notifier *kian) { mutex_lock(kvm-irq_lock); - hlist_del_init(kian-link); + hlist_del_init_rcu(kian-link); mutex_unlock(kvm-irq_lock); + synchronize_rcu(); } int kvm_request_irq_source_id(struct kvm *kvm) @@ -257,7 +258,7 @@ void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq, { mutex_lock(kvm-irq_lock); kimn-irq = irq; - hlist_add_head(kimn-link, kvm-mask_notifier_list); + hlist_add_head_rcu(kimn-link, kvm-mask_notifier_list); mutex_unlock(kvm-irq_lock); } @@ -265,8 +266,9 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, struct kvm_irq_mask_notifier *kimn) { mutex_lock(kvm-irq_lock); - hlist_del(kimn-link); + hlist_del_rcu(kimn-link); mutex_unlock(kvm-irq_lock); + synchronize_rcu(); } void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask) @@ -274,11 +276,11 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask) struct kvm_irq_mask_notifier *kimn; struct hlist_node 
*n; - WARN_ON(!mutex_is_locked(kvm-irq_lock)); - - hlist_for_each_entry(kimn, n, kvm-mask_notifier_list, link) + rcu_read_lock(); + hlist_for_each_entry_rcu(kimn, n, kvm-mask_notifier_list, link) if (kimn-irq == irq) kimn-func(kimn, mask); + rcu_read_unlock(); } void kvm_free_irq_routing(struct kvm *kvm) -- To unsubscribe from this list: send the line unsubscribe kvm-commits in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[COMMIT master] KVM: Maintain back mapping from irqchip/pin to gsi
From: Gleb Natapov g...@redhat.com Maintain back mapping from irqchip/pin to gsi to speedup interrupt acknowledgment notifications. [avi: build fix on non-x86/ia64] Signed-off-by: Gleb Natapov g...@redhat.com Signed-off-by: Avi Kivity a...@redhat.com diff --git a/arch/ia64/include/asm/kvm.h b/arch/ia64/include/asm/kvm.h index 18a7e49..bc90c75 100644 --- a/arch/ia64/include/asm/kvm.h +++ b/arch/ia64/include/asm/kvm.h @@ -60,6 +60,7 @@ struct kvm_ioapic_state { #define KVM_IRQCHIP_PIC_MASTER 0 #define KVM_IRQCHIP_PIC_SLAVE1 #define KVM_IRQCHIP_IOAPIC 2 +#define KVM_NR_IRQCHIPS 3 #define KVM_CONTEXT_SIZE 8*1024 diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index 4a5fe91..f02e87a 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -79,6 +79,7 @@ struct kvm_ioapic_state { #define KVM_IRQCHIP_PIC_MASTER 0 #define KVM_IRQCHIP_PIC_SLAVE1 #define KVM_IRQCHIP_IOAPIC 2 +#define KVM_NR_IRQCHIPS 3 /* for KVM_GET_REGS and KVM_SET_REGS */ struct kvm_regs { diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 802c080..b8db809 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -132,7 +132,10 @@ struct kvm_kernel_irq_routing_entry { struct hlist_node link; }; +#ifdef __KVM_HAVE_IOAPIC + struct kvm_irq_routing_table { + int chip[KVM_NR_IRQCHIPS][KVM_IOAPIC_NUM_PINS]; struct kvm_kernel_irq_routing_entry *rt_entries; u32 nr_rt_entries; /* @@ -142,6 +145,12 @@ struct kvm_irq_routing_table { struct hlist_head map[0]; }; +#else + +struct kvm_irq_routing_table {}; + +#endif + struct kvm { spinlock_t mmu_lock; spinlock_t requests_lock; diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index 81950f6..59cf8da 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -175,25 +175,16 @@ void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) { struct kvm_irq_ack_notifier *kian; struct hlist_node *n; - unsigned gsi = pin; - int i; + int gsi; trace_kvm_ack_irq(irqchip, pin); - for (i 
= 0; i kvm-irq_routing-nr_rt_entries; i++) { - struct kvm_kernel_irq_routing_entry *e; - e = kvm-irq_routing-rt_entries[i]; - if (e-type == KVM_IRQ_ROUTING_IRQCHIP - e-irqchip.irqchip == irqchip - e-irqchip.pin == pin) { - gsi = e-gsi; - break; - } - } - - hlist_for_each_entry(kian, n, kvm-arch.irq_ack_notifier_list, link) - if (kian-gsi == gsi) - kian-irq_acked(kian); + gsi = kvm-irq_routing-chip[irqchip][pin]; + if (gsi != -1) + hlist_for_each_entry(kian, n, kvm-arch.irq_ack_notifier_list, +link) + if (kian-gsi == gsi) + kian-irq_acked(kian); } void kvm_register_irq_ack_notifier(struct kvm *kvm, @@ -332,6 +323,9 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt, } e-irqchip.irqchip = ue-u.irqchip.irqchip; e-irqchip.pin = ue-u.irqchip.pin + delta; + if (e-irqchip.pin = KVM_IOAPIC_NUM_PINS) + goto out; + rt-chip[ue-u.irqchip.irqchip][e-irqchip.pin] = ue-gsi; break; case KVM_IRQ_ROUTING_MSI: e-set = kvm_set_msi; @@ -356,7 +350,7 @@ int kvm_set_irq_routing(struct kvm *kvm, unsigned flags) { struct kvm_irq_routing_table *new, *old; - u32 i, nr_rt_entries = 0; + u32 i, j, nr_rt_entries = 0; int r; for (i = 0; i nr; ++i) { @@ -377,6 +371,9 @@ int kvm_set_irq_routing(struct kvm *kvm, new-rt_entries = (void *)new-map[nr_rt_entries]; new-nr_rt_entries = nr_rt_entries; + for (i = 0; i 3; i++) + for (j = 0; j KVM_IOAPIC_NUM_PINS; j++) + new-chip[i][j] = -1; for (i = 0; i nr; ++i) { r = -EINVAL; -- To unsubscribe from this list: send the line unsubscribe kvm-commits in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[COMMIT master] Revert KVM: x86 emulator: Report unhandled instructions
From: Avi Kivity a...@redhat.com This reverts commit ea67fbbcf346a15b1e8e18cff7c64c248972b961. Unhandled instructions can and do occur in normal runs. This needs to be made optional so as not to spam the logs. Signed-off-by: Avi Kivity a...@redhat.com diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 15593e8..0644d3d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2194,7 +2194,6 @@ writeback: done: if (rc == X86EMUL_UNHANDLEABLE) { - kvm_report_emulation_failure(ctxt->vcpu, "unhandled instruction"); c->eip = saved_eip; return -1; } @@ -2468,7 +2467,7 @@ twobyte_insn: goto writeback; cannot_emulate: - kvm_report_emulation_failure(ctxt->vcpu, "unhandled instruction"); + DPRINTF("Cannot emulate %02x\n", c->b); c->eip = saved_eip; return -1; } -- To unsubscribe from this list: send the line "unsubscribe kvm-commits" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[COMMIT master] KVM: Move IO APIC to its own lock
From: Gleb Natapov g...@redhat.com The allows removal of irq_lock from the injection path. Signed-off-by: Gleb Natapov g...@redhat.com Signed-off-by: Avi Kivity a...@redhat.com diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 0ad09f0..4a98314 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -851,8 +851,7 @@ static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, r = 0; switch (chip-chip_id) { case KVM_IRQCHIP_IOAPIC: - memcpy(chip-chip.ioapic, ioapic_irqchip(kvm), - sizeof(struct kvm_ioapic_state)); + r = kvm_get_ioapic(kvm, chip-chip.ioapic); break; default: r = -EINVAL; @@ -868,9 +867,7 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) r = 0; switch (chip-chip_id) { case KVM_IRQCHIP_IOAPIC: - memcpy(ioapic_irqchip(kvm), - chip-chip.ioapic, - sizeof(struct kvm_ioapic_state)); + r = kvm_set_ioapic(kvm, chip-chip.ioapic); break; default: r = -EINVAL; diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index ccc941a..d057c0c 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -38,7 +38,15 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq) s-isr_ack |= (1 irq); if (s != s-pics_state-pics[0]) irq += 8; + /* +* We are dropping lock while calling ack notifiers since ack +* notifier callbacks for assigned devices call into PIC recursively. +* Other interrupt may be delivered to PIC while lock is dropped but +* it should be safe since PIC state is already updated at this stage. 
+*/ + spin_unlock(s-pics_state-lock); kvm_notify_acked_irq(s-pics_state-kvm, SELECT_PIC(irq), irq); + spin_lock(s-pics_state-lock); } void kvm_pic_clear_isr_ack(struct kvm *kvm) @@ -176,16 +184,18 @@ int kvm_pic_set_irq(void *opaque, int irq, int level) static inline void pic_intack(struct kvm_kpic_state *s, int irq) { s-isr |= 1 irq; - if (s-auto_eoi) { - if (s-rotate_on_auto_eoi) - s-priority_add = (irq + 1) 7; - pic_clear_isr(s, irq); - } /* * We don't clear a level sensitive interrupt here */ if (!(s-elcr (1 irq))) s-irr = ~(1 irq); + + if (s-auto_eoi) { + if (s-rotate_on_auto_eoi) + s-priority_add = (irq + 1) 7; + pic_clear_isr(s, irq); + } + } int kvm_pic_read_irq(struct kvm *kvm) @@ -294,9 +304,9 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) priority = get_priority(s, s-isr); if (priority != 8) { irq = (priority + s-priority_add) 7; - pic_clear_isr(s, irq); if (cmd == 5) s-priority_add = (irq + 1) 7; + pic_clear_isr(s, irq); pic_update_irq(s-pics_state); } break; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 5b9d1ae..8f0967f 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -471,11 +471,8 @@ static void apic_set_eoi(struct kvm_lapic *apic) trigger_mode = IOAPIC_LEVEL_TRIG; else trigger_mode = IOAPIC_EDGE_TRIG; - if (!(apic_get_reg(apic, APIC_SPIV) APIC_SPIV_DIRECTED_EOI)) { - mutex_lock(apic-vcpu-kvm-irq_lock); + if (!(apic_get_reg(apic, APIC_SPIV) APIC_SPIV_DIRECTED_EOI)) kvm_ioapic_update_eoi(apic-vcpu-kvm, vector, trigger_mode); - mutex_unlock(apic-vcpu-kvm-irq_lock); - } } static void apic_send_ipi(struct kvm_lapic *apic) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d22400f..c7b0b83 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2023,9 +2023,7 @@ static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) sizeof(struct kvm_pic_state)); break; case KVM_IRQCHIP_IOAPIC: - memcpy(chip-chip.ioapic, - ioapic_irqchip(kvm), - sizeof(struct kvm_ioapic_state)); + r = 
kvm_get_ioapic(kvm, chip-chip.ioapic); break; default: r = -EINVAL; @@ -2055,11 +2053,7 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) spin_unlock(pic_irqchip(kvm)-lock); break; case KVM_IRQCHIP_IOAPIC: -
[COMMIT master] KVM: Call pic_clear_isr() on pic reset to reuse logic there
From: Gleb Natapov g...@redhat.com Also move call of ack notifiers after pic state change. Signed-off-by: Gleb Natapov g...@redhat.com Signed-off-by: Avi Kivity a...@redhat.com diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 01f1516..ccc941a 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -225,22 +225,11 @@ int kvm_pic_read_irq(struct kvm *kvm) void kvm_pic_reset(struct kvm_kpic_state *s) { - int irq, irqbase, n; + int irq; struct kvm *kvm = s-pics_state-irq_request_opaque; struct kvm_vcpu *vcpu0 = kvm-bsp_vcpu; + u8 irr = s-irr, isr = s-imr; - if (s == s-pics_state-pics[0]) - irqbase = 0; - else - irqbase = 8; - - for (irq = 0; irq PIC_NUM_PINS/2; irq++) { - if (vcpu0 kvm_apic_accept_pic_intr(vcpu0)) - if (s-irr (1 irq) || s-isr (1 irq)) { - n = irq + irqbase; - kvm_notify_acked_irq(kvm, SELECT_PIC(n), n); - } - } s-last_irr = 0; s-irr = 0; s-imr = 0; @@ -256,6 +245,13 @@ void kvm_pic_reset(struct kvm_kpic_state *s) s-rotate_on_auto_eoi = 0; s-special_fully_nested_mode = 0; s-init4 = 0; + + for (irq = 0; irq PIC_NUM_PINS/2; irq++) { + if (vcpu0 kvm_apic_accept_pic_intr(vcpu0)) + if (irr (1 irq) || isr (1 irq)) { + pic_clear_isr(s, irq); + } + } } static void pic_ioport_write(void *opaque, u32 addr, u32 val) -- To unsubscribe from this list: send the line unsubscribe kvm-commits in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[COMMIT master] KVM: Move irq routing data structure to rcu locking
From: Gleb Natapov g...@redhat.com Signed-off-by: Gleb Natapov g...@redhat.com Signed-off-by: Avi Kivity a...@redhat.com diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index 59cf8da..fb861dd 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -159,7 +159,8 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level) * IOAPIC. So set the bit in both. The guest will ignore * writes to the unused one. */ - irq_rt = kvm-irq_routing; + rcu_read_lock(); + irq_rt = rcu_dereference(kvm-irq_routing); if (irq irq_rt-nr_rt_entries) hlist_for_each_entry(e, n, irq_rt-map[irq], link) { int r = e-set(e, kvm, irq_source_id, level); @@ -168,6 +169,7 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level) ret = r + ((ret 0) ? 0 : ret); } + rcu_read_unlock(); return ret; } @@ -179,7 +181,10 @@ void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) trace_kvm_ack_irq(irqchip, pin); - gsi = kvm-irq_routing-chip[irqchip][pin]; + rcu_read_lock(); + gsi = rcu_dereference(kvm-irq_routing)-chip[irqchip][pin]; + rcu_read_unlock(); + if (gsi != -1) hlist_for_each_entry(kian, n, kvm-arch.irq_ack_notifier_list, link) @@ -279,9 +284,9 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask) void kvm_free_irq_routing(struct kvm *kvm) { - mutex_lock(kvm-irq_lock); + /* Called only during vm destruction. Nobody can use the pointer + at this stage */ kfree(kvm-irq_routing); - mutex_unlock(kvm-irq_lock); } static int setup_routing_entry(struct kvm_irq_routing_table *rt, @@ -387,8 +392,9 @@ int kvm_set_irq_routing(struct kvm *kvm, mutex_lock(kvm-irq_lock); old = kvm-irq_routing; - kvm-irq_routing = new; + rcu_assign_pointer(kvm-irq_routing, new); mutex_unlock(kvm-irq_lock); + synchronize_rcu(); new = old; r = 0; -- To unsubscribe from this list: send the line unsubscribe kvm-commits in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[COMMIT master] KVM: PIT: fix pit_state copy in set_pit2/get_pit2
From: Marcelo Tosatti mtosa...@redhat.com The kvm_pit_state2 structure contains extra space, so the memcpy in kvm_vm_ioctl_set_pit2 corrupts kvm->arch.vpit->pit_state. Fix it by memcpy'ing the channel information and assigning flags manually. Signed-off-by: Marcelo Tosatti mtosa...@redhat.com Signed-off-by: Avi Kivity a...@redhat.com diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2e92aef..d22400f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2095,7 +2095,9 @@ static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) int r = 0; mutex_lock(&kvm->arch.vpit->pit_state.lock); - memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state2)); + memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels, + sizeof(ps->channels)); + ps->flags = kvm->arch.vpit->pit_state.flags; mutex_unlock(&kvm->arch.vpit->pit_state.lock); return r; } @@ -2109,7 +2111,9 @@ static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY; if (!prev_legacy && cur_legacy) start = 1; - memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state2)); + memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels, + sizeof(kvm->arch.vpit->pit_state.channels)); + kvm->arch.vpit->pit_state.flags = ps->flags; kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start); mutex_unlock(&kvm->arch.vpit->pit_state.lock); return r; -- To unsubscribe from this list: send the line "unsubscribe kvm-commits" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: vhost net: performance with ping benchmark
On Tue, 25 Aug 2009 10:04:41 pm Arnd Bergmann wrote: On Tuesday 25 August 2009, Avi Kivity wrote: On 08/25/2009 05:22 AM, Anthony Liguori wrote: I think 2.6.32 is pushing it. 2.6.32 is pushing it, but we need to push it. Agreed. Get real. It's not happening. We need migration completely solved and tested. I want to see all the features supported, including indirect descs and GSO. If this wasn't a new userspace ABI, I'd be all for throwing it in as experimental ASAP. Rusty. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: vhost net: performance with ping benchmark
On Wed, Aug 26, 2009 at 05:04:44PM +0930, Rusty Russell wrote: On Tue, 25 Aug 2009 10:04:41 pm Arnd Bergmann wrote: On Tuesday 25 August 2009, Avi Kivity wrote: On 08/25/2009 05:22 AM, Anthony Liguori wrote: I think 2.6.32 is pushing it. 2.6.32 is pushing it, but we need to push it. Agreed. Get real. It's not happening. We need migration completely solved and tested. I want to see all the features supported, including indirect descs and GSO. I'm not sure why indirect descs are needed for virtio-net. Comments? If this wasn't a new userspace ABI, I'd be all for throwing it in as experimental ASAP. Rusty. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] VMX: Return to userspace on invalid state emulation failure
On 08/25/2009 01:37 AM, Mohammed Gamal wrote: Return to userspace instead of repeatedly trying to emulate instructions that have already failed Signed-off-by: Mohammed Gamalm.gamal...@gmail.com --- arch/x86/kvm/vmx.c |6 +- 1 files changed, 5 insertions(+), 1 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6b57eed..c559bb7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3337,6 +3337,8 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (err != EMULATE_DONE) { kvm_report_emulation_failure(vcpu, emulation failure); + vcpu-run-exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu-run-internal.suberror = KVM_INTERNAL_ERROR_EMULATION; break; } @@ -3607,7 +3609,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx-entry_time = ktime_get(); /* Handle invalid guest state instead of entering VMX */ - if (vmx-emulation_required emulate_invalid_guest_state) { + if (vmx-emulation_required emulate_invalid_guest_state + !(vcpu-run-exit_reason == KVM_EXIT_INTERNAL_ERROR + vcpu-run-internal.suberror == KVM_INTERNAL_ERROR_EMULATION)) { handle_invalid_guest_state(vcpu); return; } Still suffers from the same problem. You don't always update vcpu-run-exit_reason, so you can't test it. Best to return a value from handle_invalid_guest_state() (the standard return codes for exit handlers are 1 for return-to-guest, 0 for return-to-host, and -errno to return with an error). -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] fix apic id reading in x2apic mode
On 08/25/2009 04:39 PM, Gleb Natapov wrote: Format of apic id register is different in x2apic mode. Return correct apic id when apic is in x2apic mode. Applied, thanks. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] VMX: Return to userspace on invalid state emulation failure
On Wed, Aug 26, 2009 at 12:02 PM, Avi Kivitya...@redhat.com wrote: On 08/25/2009 01:37 AM, Mohammed Gamal wrote: Return to userspace instead of repeatedly trying to emulate instructions that have already failed Signed-off-by: Mohammed Gamalm.gamal...@gmail.com --- arch/x86/kvm/vmx.c | 6 +- 1 files changed, 5 insertions(+), 1 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6b57eed..c559bb7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3337,6 +3337,8 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (err != EMULATE_DONE) { kvm_report_emulation_failure(vcpu, emulation failure); + vcpu-run-exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu-run-internal.suberror = KVM_INTERNAL_ERROR_EMULATION; break; } @@ -3607,7 +3609,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx-entry_time = ktime_get(); /* Handle invalid guest state instead of entering VMX */ - if (vmx-emulation_required emulate_invalid_guest_state) { + if (vmx-emulation_required emulate_invalid_guest_state + !(vcpu-run-exit_reason == KVM_EXIT_INTERNAL_ERROR + vcpu-run-internal.suberror == KVM_INTERNAL_ERROR_EMULATION)) { handle_invalid_guest_state(vcpu); return; } Still suffers from the same problem. You don't always update vcpu-run-exit_reason, so you can't test it. Best to return a value from handle_invalid_guest_state() (the standard return codes for exit handlers are 1 for return-to-guest, 0 for return-to-host, and -errno to return with an error). I was thinking of the same idea since I was also concerned about vcpu-run-exit_reason not being updated. But how can we interpret the return values of handle_invalid_guest_state() inside vmx_vcpu_run() since it doesn't have a return value. Or would it be better to move handle_invalid_guest_state() to the standard vmx exit handlers? 
-- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] VMX: Return to userspace on invalid state emulation failure
On 08/26/2009 01:07 PM, Mohammed Gamal wrote: On Wed, Aug 26, 2009 at 12:02 PM, Avi Kivitya...@redhat.com wrote: On 08/25/2009 01:37 AM, Mohammed Gamal wrote: Return to userspace instead of repeatedly trying to emulate instructions that have already failed Signed-off-by: Mohammed Gamalm.gamal...@gmail.com --- arch/x86/kvm/vmx.c |6 +- 1 files changed, 5 insertions(+), 1 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6b57eed..c559bb7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3337,6 +3337,8 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (err != EMULATE_DONE) { kvm_report_emulation_failure(vcpu, emulation failure); + vcpu-run-exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu-run-internal.suberror = KVM_INTERNAL_ERROR_EMULATION; break; } @@ -3607,7 +3609,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx-entry_time = ktime_get(); /* Handle invalid guest state instead of entering VMX */ - if (vmx-emulation_requiredemulate_invalid_guest_state) { + if (vmx-emulation_requiredemulate_invalid_guest_state +!(vcpu-run-exit_reason == KVM_EXIT_INTERNAL_ERROR + vcpu-run-internal.suberror == KVM_INTERNAL_ERROR_EMULATION)) { handle_invalid_guest_state(vcpu); return; } Still suffers from the same problem. You don't always update vcpu-run-exit_reason, so you can't test it. Best to return a value from handle_invalid_guest_state() (the standard return codes for exit handlers are 1 for return-to-guest, 0 for return-to-host, and -errno to return with an error). I was thinking of the same idea since I was also concerned about vcpu-run-exit_reason not being updated. But how can we interpret the return values of handle_invalid_guest_state() inside vmx_vcpu_run() since it doesn't have a return value. Or would it be better to move handle_invalid_guest_state() to the standard vmx exit handlers? We can move the call to vmx_handle_exit(). We have a check for emulate_invalid_guest_state there anyway. 
I don't think it should be a standard exit handler since there is no exit_reason for it. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: AlacrityVM benchmark numbers updated
On 08/26/2009 04:01 AM, Gregory Haskins wrote: We are pleased to announce the availability of the latest networking benchmark numbers for AlacrityVM. We've made several tweaks to the original v0.1 release to improve performance. The most notable is a switch from get_user_pages to switch_mm+copy_[to/from]_user thanks to a review suggestion from Michael Tsirkin (as well as his patch to implement it). This change alone accounted for freeing up an additional 1.2Gbps, which is over 25% improvement from v0.1. The previous numbers were 4560Gbps before the change, and 5708Gbps after (for 1500mtu over 10GE). This moves us ever closer to the goal of native performance under virtualization. Interesting, it's good to see that copy_*_user() works so well. Note that there's a possible optimization that goes in the opposite direction - keep using get_user_pages(), but use the dma engine API to perform the actual copy. I expect that it will only be a win when using tso to transfer full pages. Large pages may also help. Copyless tx also wants get_user_pages(). It makes sense to check if switch_mm() + get_user_pages_fast() gives better performance than get_user_pages(). -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: R/W HG memory mappings with kvm?
On 08/24/2009 07:55 AM, Avi Kivity wrote: On 08/24/2009 12:59 AM, Stephen Donnelly wrote: On Thu, Aug 20, 2009 at 12:14 AM, Avi Kivitya...@redhat.com wrote: On 08/13/2009 07:07 AM, Stephen Donnelly wrote: npages = get_user_pages_fast(addr, 1, 1, page); returns -EFAULT, presumably because (vma-vm_flags(VM_IO | VM_PFNMAP)). It takes then unlikely branch, and checks the vma, but I don't understand what it is doing here: pfn = ((addr - vma-vm_start) PAGE_SHIFT) + vma-vm_pgoff; It's calculating the pfn according to pfnmap rules. From what I understand this will only work when remapping 'main memory', e.g. where the pgoff is equal to the physical page offset? VMAs that remap IO memory will usually set pgoff to 0 for the start of the mapping. If so, how do they calculate the pfn when mapping pages? kvm needs to be able to do the same thing. Maybe the simplest thing is to call vma-vm_ops-fault here. Marcelo/Chris? Context is improving gfn_to_pfn() on the mmio path. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 03/47] KVM: MMU: fix bogus alloc_mmu_pages assignment
From: Marcelo Tosatti mtosa...@redhat.com Remove the bogus n_free_mmu_pages assignment from alloc_mmu_pages. It breaks accounting of mmu pages, since n_free_mmu_pages is modified but the real number of pages remains the same. Signed-off-by: Marcelo Tosatti mtosa...@redhat.com Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/mmu.c | 8 -------- 1 files changed, 0 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 28be35c..6f38178 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2786,14 +2786,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) ASSERT(vcpu); - spin_lock(&vcpu->kvm->mmu_lock); - if (vcpu->kvm->arch.n_requested_mmu_pages) - vcpu->kvm->arch.n_free_mmu_pages = - vcpu->kvm->arch.n_requested_mmu_pages; - else - vcpu->kvm->arch.n_free_mmu_pages = - vcpu->kvm->arch.n_alloc_mmu_pages; - spin_unlock(&vcpu->kvm->mmu_lock); /* * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. * Therefore we need to allocate shadow page tables in the first -- 1.6.4.1 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 04/47] KVM: x86: Disallow hypercalls for guest callers in rings > 0
From: Jan Kiszka jan.kis...@siemens.com So far unprivileged guest callers running in ring 3 can issue, e.g., MMU hypercalls. Normally, such callers cannot provide any hand-crafted MMU command structure as it has to be passed by its physical address, but they can still crash the guest kernel by passing random addresses. To close the hole, this patch considers hypercalls valid only if issued from guest ring 0. This may still be relaxed on a per-hypercall base in the future once required. Cc: sta...@kernel.org Signed-off-by: Jan Kiszka jan.kis...@siemens.com Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/x86.c |6 ++ include/linux/kvm_para.h |1 + 2 files changed, 7 insertions(+), 0 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fa525d5..92b5edd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3213,6 +3213,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) a3 = 0x; } + if (kvm_x86_ops-get_cpl(vcpu) != 0) { + ret = -KVM_EPERM; + goto out; + } + switch (nr) { case KVM_HC_VAPIC_POLL_IRQ: ret = 0; @@ -3224,6 +3229,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) ret = -KVM_ENOSYS; break; } +out: kvm_register_write(vcpu, VCPU_REGS_RAX, ret); ++vcpu-stat.hypercalls; return r; diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h index 3ddce03..d731092 100644 --- a/include/linux/kvm_para.h +++ b/include/linux/kvm_para.h @@ -13,6 +13,7 @@ #define KVM_ENOSYS 1000 #define KVM_EFAULT EFAULT #define KVM_E2BIG E2BIG +#define KVM_EPERM EPERM #define KVM_HC_VAPIC_POLL_IRQ 1 #define KVM_HC_MMU_OP 2 -- 1.6.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 18/47] KVM: SVM: cache nested intercepts
From: Joerg Roedel joerg.roe...@amd.com When the nested intercepts are cached we don't need to call get_user_pages and/or map the nested vmcb on every nested #vmexit to check who will handle the intercept. Further this patch aligns the emulated svm behavior better to real hardware. Signed-off-by: Joerg Roedel joerg.roe...@amd.com Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/svm.c | 30 +++--- 1 files changed, 23 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index fbadaa7..4426c63 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -80,6 +80,15 @@ struct nested_state { /* gpa pointers to the real vectors */ u64 vmcb_msrpm; + + /* cache for intercepts of the guest */ + u16 intercept_cr_read; + u16 intercept_cr_write; + u16 intercept_dr_read; + u16 intercept_dr_write; + u32 intercept_exceptions; + u64 intercept; + }; struct vcpu_svm { @@ -1452,7 +1461,6 @@ static int nested_svm_exit_handled_real(struct vcpu_svm *svm, void *arg2, void *opaque) { - struct vmcb *nested_vmcb = (struct vmcb *)arg1; bool kvm_overrides = *(bool *)opaque; u32 exit_code = svm-vmcb-control.exit_code; @@ -1479,38 +1487,38 @@ static int nested_svm_exit_handled_real(struct vcpu_svm *svm, switch (exit_code) { case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: { u32 cr_bits = 1 (exit_code - SVM_EXIT_READ_CR0); - if (nested_vmcb-control.intercept_cr_read cr_bits) + if (svm-nested.intercept_cr_read cr_bits) return 1; break; } case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: { u32 cr_bits = 1 (exit_code - SVM_EXIT_WRITE_CR0); - if (nested_vmcb-control.intercept_cr_write cr_bits) + if (svm-nested.intercept_cr_write cr_bits) return 1; break; } case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: { u32 dr_bits = 1 (exit_code - SVM_EXIT_READ_DR0); - if (nested_vmcb-control.intercept_dr_read dr_bits) + if (svm-nested.intercept_dr_read dr_bits) return 1; break; } case SVM_EXIT_WRITE_DR0 ... 
SVM_EXIT_WRITE_DR7: { u32 dr_bits = 1 (exit_code - SVM_EXIT_WRITE_DR0); - if (nested_vmcb-control.intercept_dr_write dr_bits) + if (svm-nested.intercept_dr_write dr_bits) return 1; break; } case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: { u32 excp_bits = 1 (exit_code - SVM_EXIT_EXCP_BASE); - if (nested_vmcb-control.intercept_exceptions excp_bits) + if (svm-nested.intercept_exceptions excp_bits) return 1; break; } default: { u64 exit_bits = 1ULL (exit_code - SVM_EXIT_INTR); nsvm_printk(exit code: 0x%x\n, exit_code); - if (nested_vmcb-control.intercept exit_bits) + if (svm-nested.intercept exit_bits) return 1; } } @@ -1801,6 +1809,14 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, svm-nested.vmcb_msrpm = nested_vmcb-control.msrpm_base_pa; + /* cache intercepts */ + svm-nested.intercept_cr_read= nested_vmcb-control.intercept_cr_read; + svm-nested.intercept_cr_write = nested_vmcb-control.intercept_cr_write; + svm-nested.intercept_dr_read= nested_vmcb-control.intercept_dr_read; + svm-nested.intercept_dr_write = nested_vmcb-control.intercept_dr_write; + svm-nested.intercept_exceptions = nested_vmcb-control.intercept_exceptions; + svm-nested.intercept= nested_vmcb-control.intercept; + force_new_asid(svm-vcpu); svm-vmcb-control.exit_int_info = nested_vmcb-control.exit_int_info; svm-vmcb-control.exit_int_info_err = nested_vmcb-control.exit_int_info_err; -- 1.6.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 22/47] KVM: SVM: get rid of nested_svm_vmexit_real
From: Joerg Roedel joerg.roe...@amd.com This patch is the starting point of removing nested_svm_do from the nested svm code. The nested_svm_do function basically maps two guest physical pages to host virtual addresses and calls a passed function on it. This function pointer code flow is hard to read and not the best technical solution here. As a side effect this patch indroduces the nested_svm_[un]map helper functions. Signed-off-by: Joerg Roedel joerg.roe...@amd.com Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/svm.c | 52 1 files changed, 40 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 67fad66..5e55a1b 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1390,6 +1390,39 @@ static inline int nested_svm_intr(struct vcpu_svm *svm) return 0; } +static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx) +{ + struct page *page; + + down_read(current-mm-mmap_sem); + page = gfn_to_page(svm-vcpu.kvm, gpa PAGE_SHIFT); + up_read(current-mm-mmap_sem); + + if (is_error_page(page)) + goto error; + + return kmap_atomic(page, idx); + +error: + kvm_release_page_clean(page); + kvm_inject_gp(svm-vcpu, 0); + + return NULL; +} + +static void nested_svm_unmap(void *addr, enum km_type idx) +{ + struct page *page; + + if (!addr) + return; + + page = kmap_atomic_to_page(addr); + + kunmap_atomic(addr, idx); + kvm_release_page_dirty(page); +} + static struct page *nested_svm_get_page(struct vcpu_svm *svm, u64 gpa) { struct page *page; @@ -1597,13 +1630,16 @@ static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *fr dst-lbr_ctl = from-lbr_ctl; } -static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1, - void *arg2, void *opaque) +static int nested_svm_vmexit(struct vcpu_svm *svm) { - struct vmcb *nested_vmcb = (struct vmcb *)arg1; + struct vmcb *nested_vmcb; struct vmcb *hsave = svm-nested.hsave; struct vmcb *vmcb = svm-vmcb; + nested_vmcb = nested_svm_map(svm, 
svm-nested.vmcb, KM_USER0); + if (!nested_vmcb) + return 1; + /* Give the current vmcb to the guest */ disable_gif(svm); @@ -1678,15 +1714,7 @@ static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1, /* Exit nested SVM mode */ svm-nested.vmcb = 0; - return 0; -} - -static int nested_svm_vmexit(struct vcpu_svm *svm) -{ - nsvm_printk(VMexit\n); - if (nested_svm_do(svm, svm-nested.vmcb, 0, - NULL, nested_svm_vmexit_real)) - return 1; + nested_svm_unmap(nested_vmcb, KM_USER0); kvm_mmu_reset_context(svm-vcpu); kvm_mmu_load(svm-vcpu); -- 1.6.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/2] eventfd: new EFD_STATE flag
On Tue, Aug 25, 2009 at 02:57:01PM -0700, Davide Libenzi wrote: On Tue, 25 Aug 2009, Michael S. Tsirkin wrote: Yes, we don't want that. The best thing is to try to restate the problem in a way that is generic, and then either solve or best use existing solution. Right? I thought I had that, but apparently not. The reason I'm Cc-ing you is not to try and spam you until you give up and accept the patch, it's hoping that you see the pattern behind our usage, and help generalize it. If I understand it correctly, you believe this is not possible and so any solution will have to be in KVM? Or maybe I didn't state the problem clearly enough and should restate it? Please do. - Davide Problem looks like this: There are multiple processes (devices) where each has a condition (interrupt line) which it has logic to determine is either true or false. A single other process (hypervisor) is interested in a condition (interrupt level) which is a logical OR of all interrupt lines. On changes, an interrupt level value needs to be read and copied to guest virtual cpu. We also want ability to replace some or all processes above by a kernel components, with condition changes done potentially from hardware interrupt context. How we wanted to solve it with EFD_STATE: Share a separate eventfd between each device and the hypervisor. device sets state to either 0 or 1. hypervisor polls all eventfds, reads interrupt line on changes, calculates the interrupt level and updates guest. Alternative solution: shared memory where each device writes interrupt line value. This makes setup more complex (need to share around much more than just an fd), and makes access from interrupt impossible unless we lock the memory (and locking userspace memory introduces yet another set of issues). -- MST -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 21/47] KVM: SVM: simplify nested_svm_check_exception
From: Joerg Roedel joerg.roe...@amd.com Makes the code of this function more readable by removing on indentation level for the core logic. Signed-off-by: Joerg Roedel joerg.roe...@amd.com Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/svm.c | 19 --- 1 files changed, 8 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 3bb6d4b..67fad66 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1359,18 +1359,15 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm) static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code) { - if (is_nested(svm)) { - svm-vmcb-control.exit_code = SVM_EXIT_EXCP_BASE + nr; - svm-vmcb-control.exit_code_hi = 0; - svm-vmcb-control.exit_info_1 = error_code; - svm-vmcb-control.exit_info_2 = svm-vcpu.arch.cr2; - if (nested_svm_exit_handled(svm, false)) { - nsvm_printk(VMexit - EXCP 0x%x\n, nr); - return 1; - } - } + if (!is_nested(svm)) + return 0; - return 0; + svm-vmcb-control.exit_code = SVM_EXIT_EXCP_BASE + nr; + svm-vmcb-control.exit_code_hi = 0; + svm-vmcb-control.exit_info_1 = error_code; + svm-vmcb-control.exit_info_2 = svm-vcpu.arch.cr2; + + return nested_svm_exit_handled(svm, false); } static inline int nested_svm_intr(struct vcpu_svm *svm) -- 1.6.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 31/47] KVM: SVM: check for nested VINTR flag in svm_interrupt_allowed
From: Joerg Roedel joerg.roe...@amd.com Not checking for this flag breaks any nested hypervisor that does not set VINTR. So fix it with this patch. Signed-off-by: Joerg Roedel joerg.roe...@amd.com Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/svm.c |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ff04a4b..825035e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2466,7 +2466,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) return (vmcb-save.rflags X86_EFLAGS_IF) !(vmcb-control.int_state SVM_INTERRUPT_SHADOW_MASK) gif_set(svm) - !is_nested(svm); + !(is_nested(svm) (svm-vcpu.arch.hflags HF_VINTR_MASK)); } static void enable_irq_window(struct kvm_vcpu *vcpu) -- 1.6.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 26/47] KVM: SVM: remove nested_svm_do and helper functions
From: Joerg Roedel joerg.roe...@amd.com This function is not longer required. So remove it. Signed-off-by: Joerg Roedel joerg.roe...@amd.com Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/svm.c | 60 1 files changed, 0 insertions(+), 60 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1a915f3..42b8b67 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1421,66 +1421,6 @@ static void nested_svm_unmap(void *addr, enum km_type idx) kvm_release_page_dirty(page); } -static struct page *nested_svm_get_page(struct vcpu_svm *svm, u64 gpa) -{ - struct page *page; - - down_read(current-mm-mmap_sem); - page = gfn_to_page(svm-vcpu.kvm, gpa PAGE_SHIFT); - up_read(current-mm-mmap_sem); - - if (is_error_page(page)) { - printk(KERN_INFO %s: could not find page at 0x%llx\n, - __func__, gpa); - kvm_release_page_clean(page); - kvm_inject_gp(svm-vcpu, 0); - return NULL; - } - return page; -} - -static int nested_svm_do(struct vcpu_svm *svm, -u64 arg1_gpa, u64 arg2_gpa, void *opaque, -int (*handler)(struct vcpu_svm *svm, - void *arg1, - void *arg2, - void *opaque)) -{ - struct page *arg1_page; - struct page *arg2_page = NULL; - void *arg1; - void *arg2 = NULL; - int retval; - - arg1_page = nested_svm_get_page(svm, arg1_gpa); - if(arg1_page == NULL) - return 1; - - if (arg2_gpa) { - arg2_page = nested_svm_get_page(svm, arg2_gpa); - if(arg2_page == NULL) { - kvm_release_page_clean(arg1_page); - return 1; - } - } - - arg1 = kmap_atomic(arg1_page, KM_USER0); - if (arg2_gpa) - arg2 = kmap_atomic(arg2_page, KM_USER1); - - retval = handler(svm, arg1, arg2, opaque); - - kunmap_atomic(arg1, KM_USER0); - if (arg2_gpa) - kunmap_atomic(arg2, KM_USER1); - - kvm_release_page_dirty(arg1_page); - if (arg2_gpa) - kvm_release_page_dirty(arg2_page); - - return retval; -} - static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm) { u32 param = svm-vmcb-control.exit_info_1 1; -- 1.6.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the 
body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 47/47] KVM: Document KVM_CAP_IRQCHIP
Signed-off-by: Avi Kivity a...@redhat.com --- Documentation/kvm/api.txt | 76 + 1 files changed, 76 insertions(+), 0 deletions(-) diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt index 1b1c22d..5a4bc8c 100644 --- a/Documentation/kvm/api.txt +++ b/Documentation/kvm/api.txt @@ -517,6 +517,82 @@ struct kvm_fpu { __u32 pad2; }; +4.23 KVM_CREATE_IRQCHIP + +Capability: KVM_CAP_IRQCHIP +Architectures: x86, ia64 +Type: vm ioctl +Parameters: none +Returns: 0 on success, -1 on error + +Creates an interrupt controller model in the kernel. On x86, creates a virtual +ioapic, a virtual PIC (two PICs, nested), and sets up future vcpus to have a +local APIC. IRQ routing for GSIs 0-15 is set to both PIC and IOAPIC; GSI 16-23 +only go to the IOAPIC. On ia64, a IOSAPIC is created. + +4.24 KVM_IRQ_LINE + +Capability: KVM_CAP_IRQCHIP +Architectures: x86, ia64 +Type: vm ioctl +Parameters: struct kvm_irq_level +Returns: 0 on success, -1 on error + +Sets the level of a GSI input to the interrupt controller model in the kernel. +Requires that an interrupt controller model has been previously created with +KVM_CREATE_IRQCHIP. Note that edge-triggered interrupts require the level +to be set to 1 and then back to 0. + +struct kvm_irq_level { + union { + __u32 irq; /* GSI */ + __s32 status; /* not used for KVM_IRQ_LEVEL */ + }; + __u32 level; /* 0 or 1 */ +}; + +4.25 KVM_GET_IRQCHIP + +Capability: KVM_CAP_IRQCHIP +Architectures: x86, ia64 +Type: vm ioctl +Parameters: struct kvm_irqchip (in/out) +Returns: 0 on success, -1 on error + +Reads the state of a kernel interrupt controller created with +KVM_CREATE_IRQCHIP into a buffer provided by the caller. 
+ +struct kvm_irqchip { + __u32 chip_id; /* 0 = PIC1, 1 = PIC2, 2 = IOAPIC */ + __u32 pad; +union { + char dummy[512]; /* reserving space */ + struct kvm_pic_state pic; + struct kvm_ioapic_state ioapic; + } chip; +}; + +4.26 KVM_SET_IRQCHIP + +Capability: KVM_CAP_IRQCHIP +Architectures: x86, ia64 +Type: vm ioctl +Parameters: struct kvm_irqchip (in) +Returns: 0 on success, -1 on error + +Sets the state of a kernel interrupt controller created with +KVM_CREATE_IRQCHIP from a buffer provided by the caller. + +struct kvm_irqchip { + __u32 chip_id; /* 0 = PIC1, 1 = PIC2, 2 = IOAPIC */ + __u32 pad; +union { + char dummy[512]; /* reserving space */ + struct kvm_pic_state pic; + struct kvm_ioapic_state ioapic; + } chip; +}; + 5. The kvm_run structure Application code obtains a pointer to the kvm_run structure by -- 1.6.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 43/47] KVM: x86 emulator: Add adc and sbb missing decoder flags
From: Mohammed Gamal m.gamal...@gmail.com Add missing decoder flags for adc and sbb instructions (opcodes 0x14-0x15, 0x1c-0x1d) Signed-off-by: Mohammed Gamal m.gamal...@gmail.com Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/emulate.c |4 ++-- 1 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 2eb807a..1be5cd6 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -100,11 +100,11 @@ static u32 opcode_table[256] = { /* 0x10 - 0x17 */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, + ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, /* 0x18 - 0x1F */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, + ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, /* 0x20 - 0x27 */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, -- 1.6.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 45/47] KVM: VMX: Fix EPT with WP bit change during paging
From: Sheng Yang sh...@linux.intel.com QNX update WP bit when paging enabled, which is not covered yet. This one fix QNX boot with EPT. Cc: sta...@kernel.org Signed-off-by: Sheng Yang sh...@linux.intel.com Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/vmx.c |6 +++--- 1 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2b7e7bd..1ee811c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1642,7 +1642,6 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, CPU_BASED_CR3_STORE_EXITING)); vcpu-arch.cr0 = cr0; vmx_set_cr4(vcpu, vcpu-arch.cr4); - *hw_cr0 = ~X86_CR0_WP; } else if (!is_paging(vcpu)) { /* From nonpaging to paging */ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, @@ -1651,9 +1650,10 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, CPU_BASED_CR3_STORE_EXITING)); vcpu-arch.cr0 = cr0; vmx_set_cr4(vcpu, vcpu-arch.cr4); - if (!(vcpu-arch.cr0 X86_CR0_WP)) - *hw_cr0 = ~X86_CR0_WP; } + + if (!(cr0 X86_CR0_WP)) + *hw_cr0 = ~X86_CR0_WP; } static void ept_update_paging_mode_cr4(unsigned long *hw_cr4, -- 1.6.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 38/47] KVM: Rename x86_emulate.c to emulate.c
We're in arch/x86, what could we possibly be emulating? Signed-off-by: Avi Kivity a...@redhat.com --- .../asm/{kvm_x86_emulate.h = kvm_emulate.h} |0 arch/x86/include/asm/kvm_host.h|2 +- arch/x86/kvm/Makefile |2 +- arch/x86/kvm/{x86_emulate.c = emulate.c} |4 ++-- arch/x86/kvm/x86.c |2 +- 5 files changed, 5 insertions(+), 5 deletions(-) rename arch/x86/include/asm/{kvm_x86_emulate.h = kvm_emulate.h} (100%) rename arch/x86/kvm/{x86_emulate.c = emulate.c} (99%) diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_emulate.h similarity index 100% rename from arch/x86/include/asm/kvm_x86_emulate.h rename to arch/x86/include/asm/kvm_emulate.h diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b17d845..33901be 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -138,7 +138,7 @@ enum { VCPU_SREG_LDTR, }; -#include asm/kvm_x86_emulate.h +#include asm/kvm_emulate.h #define KVM_NR_MEM_OBJS 40 diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index afaaa76..0e7fe78 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -9,7 +9,7 @@ kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ coalesced_mmio.o irq_comm.o eventfd.o) kvm-$(CONFIG_IOMMU_API)+= $(addprefix ../../../virt/kvm/, iommu.o) -kvm-y += x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \ +kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ i8254.o timer.o kvm-intel-y+= vmx.o kvm-amd-y += svm.o diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/emulate.c similarity index 99% rename from arch/x86/kvm/x86_emulate.c rename to arch/x86/kvm/emulate.c index c6663d4..2eb807a 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1,5 +1,5 @@ /** - * x86_emulate.c + * emulate.c * * Generic x86 (32-bit and 64-bit) instruction decoder and emulator. * @@ -30,7 +30,7 @@ #define DPRINTF(x...) 
do {} while (0) #endif #include linux/module.h -#include asm/kvm_x86_emulate.h +#include asm/kvm_emulate.h #include mmu.h /* for is_long_mode() */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1aa7e6d..c0e9427 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2759,7 +2759,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, kvm_clear_exception_queue(vcpu); vcpu-arch.mmio_fault_cr2 = cr2; /* -* TODO: fix x86_emulate.c to use guest_read/write_register +* TODO: fix emulate.c to use guest_read/write_register * instead of direct -regs accesses, can save hundred cycles * on Intel for instructions that don't read/change RSP, for * for example. -- 1.6.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 39/47] Documentation: Update KVM list email address
From: Amit Shah amit.s...@redhat.com The KVM list moved to vger.kernel.org last year Signed-off-by: Amit Shah amit.s...@redhat.com Signed-off-by: Avi Kivity a...@redhat.com --- Documentation/ioctl/ioctl-number.txt |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index 7bb0d93..3223e12 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt @@ -191,7 +191,7 @@ CodeSeq#Include FileComments 0xAD 00 Netfilter devicein development: mailto:ru...@rustcorp.com.au 0xAE all linux/kvm.h Kernel-based Virtual Machine - mailto:kvm-de...@lists.sourceforge.net + mailto:kvm@vger.kernel.org 0xB0 all RATIO devices in development: mailto:v...@ratio.de 0xB1 00-1F PPPoX mailto:mostr...@styx.uwaterloo.ca -- 1.6.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 28/47] KVM: SVM: move special nested exit handling to separate function
From: Joerg Roedel joerg.roe...@amd.com This patch moves the handling for special nested vmexits like #pf to a separate function. This makes the kvm_override parameter obsolete and makes the code more readable. Signed-off-by: Joerg Roedel joerg.roe...@amd.com Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/svm.c | 80 --- 1 files changed, 50 insertions(+), 30 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 2edf2dd..e9e3931 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -47,6 +47,10 @@ MODULE_LICENSE(GPL); #define SVM_FEATURE_LBRV (1 1) #define SVM_FEATURE_SVML (1 2) +#define NESTED_EXIT_HOST 0 /* Exit handled on host level */ +#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */ +#define NESTED_EXIT_CONTINUE 2 /* Further checks needed */ + #define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) /* Turn on to get debugging output*/ @@ -126,7 +130,7 @@ module_param(nested, int, S_IRUGO); static void svm_flush_tlb(struct kvm_vcpu *vcpu); static void svm_complete_interrupts(struct vcpu_svm *svm); -static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override); +static int nested_svm_exit_handled(struct vcpu_svm *svm); static int nested_svm_vmexit(struct vcpu_svm *svm); static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code); @@ -1365,7 +1369,7 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, svm-vmcb-control.exit_info_1 = error_code; svm-vmcb-control.exit_info_2 = svm-vcpu.arch.cr2; - return nested_svm_exit_handled(svm, false); + return nested_svm_exit_handled(svm); } static inline int nested_svm_intr(struct vcpu_svm *svm) @@ -1379,7 +1383,7 @@ static inline int nested_svm_intr(struct vcpu_svm *svm) svm-vmcb-control.exit_code = SVM_EXIT_INTR; - if (nested_svm_exit_handled(svm, false)) { + if (nested_svm_exit_handled(svm)) { nsvm_printk(VMexit - INTR\n); return 1; } @@ -1468,31 +1472,39 @@ out: return ret; } -static int 
nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override) +static int nested_svm_exit_special(struct vcpu_svm *svm) { u32 exit_code = svm-vmcb-control.exit_code; - bool vmexit = false; - if (kvm_override) { - switch (exit_code) { - case SVM_EXIT_INTR: - case SVM_EXIT_NMI: - return 0; + switch (exit_code) { + case SVM_EXIT_INTR: + case SVM_EXIT_NMI: + return NESTED_EXIT_HOST; /* For now we are always handling NPFs when using them */ - case SVM_EXIT_NPF: - if (npt_enabled) - return 0; - break; - /* When we're shadowing, trap PFs */ - case SVM_EXIT_EXCP_BASE + PF_VECTOR: - if (!npt_enabled) - return 0; - break; - default: - break; - } + case SVM_EXIT_NPF: + if (npt_enabled) + return NESTED_EXIT_HOST; + break; + /* When we're shadowing, trap PFs */ + case SVM_EXIT_EXCP_BASE + PF_VECTOR: + if (!npt_enabled) + return NESTED_EXIT_HOST; + break; + default: + break; } + return NESTED_EXIT_CONTINUE; +} + +/* + * If this function returns true, this #vmexit was already handled + */ +static int nested_svm_exit_handled(struct vcpu_svm *svm) +{ + u32 exit_code = svm-vmcb-control.exit_code; + int vmexit = NESTED_EXIT_HOST; + switch (exit_code) { case SVM_EXIT_MSR: vmexit = nested_svm_exit_handled_msr(svm); @@ -1500,42 +1512,42 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override) case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: { u32 cr_bits = 1 (exit_code - SVM_EXIT_READ_CR0); if (svm-nested.intercept_cr_read cr_bits) - vmexit = true; + vmexit = NESTED_EXIT_DONE; break; } case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: { u32 cr_bits = 1 (exit_code - SVM_EXIT_WRITE_CR0); if (svm-nested.intercept_cr_write cr_bits) - vmexit = true; + vmexit = NESTED_EXIT_DONE; break; } case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: { u32 dr_bits = 1 (exit_code - SVM_EXIT_READ_DR0); if (svm-nested.intercept_dr_read dr_bits) -
[PATCH 24/47] KVM: SVM: clean up nested vmload/vmsave paths
From: Joerg Roedel joerg.roe...@amd.com This patch removes the usage of nested_svm_do from the vmload and vmsave emulation code paths. Signed-off-by: Joerg Roedel joerg.roe...@amd.com Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/svm.c | 36 +--- 1 files changed, 17 insertions(+), 19 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 518d578..419e3fa 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -128,8 +128,6 @@ static void svm_complete_interrupts(struct vcpu_svm *svm); static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override); static int nested_svm_vmexit(struct vcpu_svm *svm); -static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb, -void *arg2, void *opaque); static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code); @@ -1868,7 +1866,7 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, return 0; } -static int nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) +static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) { to_vmcb-save.fs = from_vmcb-save.fs; to_vmcb-save.gs = from_vmcb-save.gs; @@ -1882,44 +1880,44 @@ static int nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) to_vmcb-save.sysenter_cs = from_vmcb-save.sysenter_cs; to_vmcb-save.sysenter_esp = from_vmcb-save.sysenter_esp; to_vmcb-save.sysenter_eip = from_vmcb-save.sysenter_eip; - - return 1; -} - -static int nested_svm_vmload(struct vcpu_svm *svm, void *nested_vmcb, -void *arg2, void *opaque) -{ - return nested_svm_vmloadsave((struct vmcb *)nested_vmcb, svm-vmcb); -} - -static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb, -void *arg2, void *opaque) -{ - return nested_svm_vmloadsave(svm-vmcb, (struct vmcb *)nested_vmcb); } static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { + struct vmcb *nested_vmcb; + if (nested_svm_check_permissions(svm)) return 1; 
svm-next_rip = kvm_rip_read(svm-vcpu) + 3; skip_emulated_instruction(svm-vcpu); - nested_svm_do(svm, svm-vmcb-save.rax, 0, NULL, nested_svm_vmload); + nested_vmcb = nested_svm_map(svm, svm-vmcb-save.rax, KM_USER0); + if (!nested_vmcb) + return 1; + + nested_svm_vmloadsave(nested_vmcb, svm-vmcb); + nested_svm_unmap(nested_vmcb, KM_USER0); return 1; } static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { + struct vmcb *nested_vmcb; + if (nested_svm_check_permissions(svm)) return 1; svm-next_rip = kvm_rip_read(svm-vcpu) + 3; skip_emulated_instruction(svm-vcpu); - nested_svm_do(svm, svm-vmcb-save.rax, 0, NULL, nested_svm_vmsave); + nested_vmcb = nested_svm_map(svm, svm-vmcb-save.rax, KM_USER0); + if (!nested_vmcb) + return 1; + + nested_svm_vmloadsave(svm-vmcb, nested_vmcb); + nested_svm_unmap(nested_vmcb, KM_USER0); return 1; } -- 1.6.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 29/47] KVM: SVM: remove unnecessary is_nested check from svm_cpu_run
From: Joerg Roedel joerg.roe...@amd.com This check is not necessary. We have to sync the vcpu-arch.cr2 always back to the VMCB. This patch remove the is_nested check. Signed-off-by: Joerg Roedel joerg.roe...@amd.com Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/svm.c |3 +-- 1 files changed, 1 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e9e3931..f275d77 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2605,8 +2605,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) fs_selector = kvm_read_fs(); gs_selector = kvm_read_gs(); ldt_selector = kvm_read_ldt(); - if (!is_nested(svm)) - svm-vmcb-save.cr2 = vcpu-arch.cr2; + svm-vmcb-save.cr2 = vcpu-arch.cr2; /* required for live migration with NPT */ if (npt_enabled) svm-vmcb-save.cr3 = vcpu-arch.cr3; -- 1.6.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 35/47] KVM: Move #endif KVM_CAP_IRQ_ROUTING to correct place
The symbol only controls irq routing, not MSI-X. Signed-off-by: Avi Kivity a...@redhat.com --- virt/kvm/kvm_main.c |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4470251..1df4c04 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2236,6 +2236,7 @@ static long kvm_vm_ioctl(struct file *filp, vfree(entries); break; } +#endif /* KVM_CAP_IRQ_ROUTING */ #ifdef __KVM_HAVE_MSIX case KVM_ASSIGN_SET_MSIX_NR: { struct kvm_assigned_msix_nr entry_nr; @@ -2258,7 +2259,6 @@ static long kvm_vm_ioctl(struct file *filp, break; } #endif -#endif /* KVM_CAP_IRQ_ROUTING */ case KVM_IRQFD: { struct kvm_irqfd data; -- 1.6.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 16/47] KVM: SVM: complete interrupts after handling nested exits
From: Joerg Roedel joerg.roe...@amd.com The interrupt completion code must run after nested exits are handled because not injected interrupts or exceptions may be handled by the l1 guest first. Signed-off-by: Joerg Roedel joerg.roe...@amd.com Acked-by: Alexander Graf ag...@suse.de Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/svm.c |5 +++-- 1 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index df795bc..825b825 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -111,6 +111,7 @@ static int nested = 0; module_param(nested, int, S_IRUGO); static void svm_flush_tlb(struct kvm_vcpu *vcpu); +static void svm_complete_interrupts(struct vcpu_svm *svm); static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override); static int nested_svm_vmexit(struct vcpu_svm *svm); @@ -2324,6 +2325,8 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) } } + svm_complete_interrupts(svm); + if (npt_enabled) { int mmu_reload = 0; if ((vcpu-arch.cr0 ^ svm-vmcb-save.cr0) X86_CR0_PG) { @@ -2690,8 +2693,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu-arch.regs_avail = ~(1 VCPU_EXREG_PDPTR); vcpu-arch.regs_dirty = ~(1 VCPU_EXREG_PDPTR); } - - svm_complete_interrupts(svm); } #undef R -- 1.6.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 27/47] KVM: SVM: handle errors in vmrun emulation path appropriately
From: Joerg Roedel joerg.roe...@amd.com If nested svm fails to load the msrpm the vmrun succeeds with the old msrpm which is not correct. This patch changes the logic to roll back to host mode in case the msrpm cannot be loaded. Signed-off-by: Joerg Roedel joerg.roe...@amd.com Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/svm.c | 14 +- 1 files changed, 13 insertions(+), 1 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 42b8b67..2edf2dd 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1877,6 +1877,7 @@ static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { nsvm_printk(VMrun\n); + if (nested_svm_check_permissions(svm)) return 1; @@ -1887,7 +1888,18 @@ static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return 1; if (!nested_svm_vmrun_msrpm(svm)) - return 1; + goto failed; + + return 1; + +failed: + + svm-vmcb-control.exit_code= SVM_EXIT_ERR; + svm-vmcb-control.exit_code_hi = 0; + svm-vmcb-control.exit_info_1 = 0; + svm-vmcb-control.exit_info_2 = 0; + + nested_svm_vmexit(svm); return 1; } -- 1.6.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 19/47] KVM: SVM: consolidate nested_svm_exit_handled
From: Joerg Roedel joerg.roe...@amd.com When caching guest intercepts there is no need anymore for the nested_svm_exit_handled_real function. So move its code into nested_svm_exit_handled. Signed-off-by: Joerg Roedel joerg.roe...@amd.com Acked-by: Alexander Graf ag...@suse.de Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/svm.c | 109 +++ 1 files changed, 49 insertions(+), 60 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 4426c63..bdd73fd 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1456,15 +1456,58 @@ static int nested_svm_do(struct vcpu_svm *svm, return retval; } -static int nested_svm_exit_handled_real(struct vcpu_svm *svm, - void *arg1, - void *arg2, - void *opaque) +static int nested_svm_exit_handled_msr(struct vcpu_svm *svm, + void *arg1, void *arg2, + void *opaque) +{ + struct vmcb *nested_vmcb = (struct vmcb *)arg1; + u8 *msrpm = (u8 *)arg2; + u32 t0, t1; + u32 msr = svm-vcpu.arch.regs[VCPU_REGS_RCX]; + u32 param = svm-vmcb-control.exit_info_1 1; + + if (!(nested_vmcb-control.intercept (1ULL INTERCEPT_MSR_PROT))) + return 0; + + switch (msr) { + case 0 ... 0x1fff: + t0 = (msr * 2) % 8; + t1 = msr / 8; + break; + case 0xc000 ... 0xc0001fff: + t0 = (8192 + msr - 0xc000) * 2; + t1 = (t0 / 8); + t0 %= 8; + break; + case 0xc001 ... 
0xc0011fff: + t0 = (16384 + msr - 0xc001) * 2; + t1 = (t0 / 8); + t0 %= 8; + break; + default: + return 1; + break; + } + if (msrpm[t1] ((1 param) t0)) + return 1; + + return 0; +} + +static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override) { - bool kvm_overrides = *(bool *)opaque; u32 exit_code = svm-vmcb-control.exit_code; - if (kvm_overrides) { + switch (svm-vmcb-control.exit_code) { + case SVM_EXIT_MSR: + return nested_svm_do(svm, svm-nested.vmcb, +svm-nested.vmcb_msrpm, NULL, +nested_svm_exit_handled_msr); + default: + break; + } + + if (kvm_override) { switch (exit_code) { case SVM_EXIT_INTR: case SVM_EXIT_NMI: @@ -1526,60 +1569,6 @@ static int nested_svm_exit_handled_real(struct vcpu_svm *svm, return 0; } -static int nested_svm_exit_handled_msr(struct vcpu_svm *svm, - void *arg1, void *arg2, - void *opaque) -{ - struct vmcb *nested_vmcb = (struct vmcb *)arg1; - u8 *msrpm = (u8 *)arg2; -u32 t0, t1; - u32 msr = svm-vcpu.arch.regs[VCPU_REGS_RCX]; - u32 param = svm-vmcb-control.exit_info_1 1; - - if (!(nested_vmcb-control.intercept (1ULL INTERCEPT_MSR_PROT))) - return 0; - - switch(msr) { - case 0 ... 0x1fff: - t0 = (msr * 2) % 8; - t1 = msr / 8; - break; - case 0xc000 ... 0xc0001fff: - t0 = (8192 + msr - 0xc000) * 2; - t1 = (t0 / 8); - t0 %= 8; - break; - case 0xc001 ... 
0xc0011fff: - t0 = (16384 + msr - 0xc001) * 2; - t1 = (t0 / 8); - t0 %= 8; - break; - default: - return 1; - break; - } - if (msrpm[t1] ((1 param) t0)) - return 1; - - return 0; -} - -static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override) -{ - bool k = kvm_override; - - switch (svm-vmcb-control.exit_code) { - case SVM_EXIT_MSR: - return nested_svm_do(svm, svm-nested.vmcb, -svm-nested.vmcb_msrpm, NULL, -nested_svm_exit_handled_msr); - default: break; - } - - return nested_svm_do(svm, svm-nested.vmcb, 0, k, -nested_svm_exit_handled_real); -} - static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb) { struct vmcb_control_area *dst = dst_vmcb-control; -- 1.6.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 20/47] KVM: SVM: do nested vmexit in nested_svm_exit_handled
From: Joerg Roedel joerg.roe...@amd.com If this function returns true a nested vmexit is required. Move that vmexit into the nested_svm_exit_handled function. This also simplifies the handling of nested #pf intercepts in this function. Signed-off-by: Joerg Roedel joerg.roe...@amd.com Acked-by: Alexander Graf ag...@suse.de Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/svm.c | 42 +++--- 1 files changed, 19 insertions(+), 23 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index bdd73fd..3bb6d4b 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1366,8 +1366,6 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, svm-vmcb-control.exit_info_2 = svm-vcpu.arch.cr2; if (nested_svm_exit_handled(svm, false)) { nsvm_printk(VMexit - EXCP 0x%x\n, nr); - - nested_svm_vmexit(svm); return 1; } } @@ -1388,7 +1386,6 @@ static inline int nested_svm_intr(struct vcpu_svm *svm) if (nested_svm_exit_handled(svm, false)) { nsvm_printk(VMexit - INTR\n); - nested_svm_vmexit(svm); return 1; } } @@ -1497,15 +1494,7 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm, static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override) { u32 exit_code = svm-vmcb-control.exit_code; - - switch (svm-vmcb-control.exit_code) { - case SVM_EXIT_MSR: - return nested_svm_do(svm, svm-nested.vmcb, -svm-nested.vmcb_msrpm, NULL, -nested_svm_exit_handled_msr); - default: - break; - } + bool vmexit = false; if (kvm_override) { switch (exit_code) { @@ -1528,45 +1517,55 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override) } switch (exit_code) { + case SVM_EXIT_MSR: + if (nested_svm_do(svm, svm-nested.vmcb, svm-nested.vmcb_msrpm, + NULL, nested_svm_exit_handled_msr)) + vmexit = true; + break; case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: { u32 cr_bits = 1 (exit_code - SVM_EXIT_READ_CR0); if (svm-nested.intercept_cr_read cr_bits) - return 1; + vmexit = true; break; } case SVM_EXIT_WRITE_CR0 ... 
SVM_EXIT_WRITE_CR8: { u32 cr_bits = 1 (exit_code - SVM_EXIT_WRITE_CR0); if (svm-nested.intercept_cr_write cr_bits) - return 1; + vmexit = true; break; } case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: { u32 dr_bits = 1 (exit_code - SVM_EXIT_READ_DR0); if (svm-nested.intercept_dr_read dr_bits) - return 1; + vmexit = true; break; } case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: { u32 dr_bits = 1 (exit_code - SVM_EXIT_WRITE_DR0); if (svm-nested.intercept_dr_write dr_bits) - return 1; + vmexit = true; break; } case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: { u32 excp_bits = 1 (exit_code - SVM_EXIT_EXCP_BASE); if (svm-nested.intercept_exceptions excp_bits) - return 1; + vmexit = true; break; } default: { u64 exit_bits = 1ULL (exit_code - SVM_EXIT_INTR); nsvm_printk(exit code: 0x%x\n, exit_code); if (svm-nested.intercept exit_bits) - return 1; + vmexit = true; } } - return 0; + if (vmexit) { + nsvm_printk(#VMEXIT reason=%04x\n, exit_code); + nested_svm_vmexit(svm); + } + + return vmexit; } static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb) @@ -2327,11 +2326,8 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) nsvm_printk(nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n, exit_code, svm-vmcb-control.exit_info_1, svm-vmcb-control.exit_info_2, svm-vmcb-save.rip); - if (nested_svm_exit_handled(svm, true)) { - nested_svm_vmexit(svm); - nsvm_printk(- #VMEXIT\n); + if (nested_svm_exit_handled(svm, true)) return 1; - } }
[PATCH 33/47] KVM: Update cr8 intercept when APIC TPR is changed by userspace
From: Gleb Natapov g...@redhat.com Since on vcpu entry we do it only if apic is enabled we should do it when TPR is changed while apic is disabled. This happens when windows resets HW without setting TPR to zero. Signed-off-by: Gleb Natapov g...@redhat.com Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/x86.c |2 ++ 1 files changed, 2 insertions(+), 0 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 132c510..31bf984 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -77,6 +77,7 @@ static u64 __read_mostly efer_reserved_bits = 0xfffeULL; #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU +static void update_cr8_intercept(struct kvm_vcpu *vcpu); static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries); @@ -1629,6 +1630,7 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, vcpu_load(vcpu); memcpy(vcpu-arch.apic-regs, s-regs, sizeof *s); kvm_apic_post_state_restore(vcpu); + update_cr8_intercept(vcpu); vcpu_put(vcpu); return 0; -- 1.6.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 25/47] KVM: SVM: clean up nested vmrun path
From: Joerg Roedel joerg.roe...@amd.com This patch removes the usage of nested_svm_do from the vmrun emulation path. Signed-off-by: Joerg Roedel joerg.roe...@amd.com Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/svm.c | 34 ++ 1 files changed, 22 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 419e3fa..1a915f3 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1727,25 +1727,35 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) return 0; } -static int nested_svm_vmrun_msrpm(struct vcpu_svm *svm, void *arg1, - void *arg2, void *opaque) +static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) { + u32 *nested_msrpm; int i; - u32 *nested_msrpm = (u32*)arg1; + + nested_msrpm = nested_svm_map(svm, svm-nested.vmcb_msrpm, KM_USER0); + if (!nested_msrpm) + return false; + for (i=0; i PAGE_SIZE * (1 MSRPM_ALLOC_ORDER) / 4; i++) svm-nested.msrpm[i] = svm-msrpm[i] | nested_msrpm[i]; + svm-vmcb-control.msrpm_base_pa = __pa(svm-nested.msrpm); - return 0; + nested_svm_unmap(nested_msrpm, KM_USER0); + + return true; } -static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, - void *arg2, void *opaque) +static bool nested_svm_vmrun(struct vcpu_svm *svm) { - struct vmcb *nested_vmcb = (struct vmcb *)arg1; + struct vmcb *nested_vmcb; struct vmcb *hsave = svm-nested.hsave; struct vmcb *vmcb = svm-vmcb; + nested_vmcb = nested_svm_map(svm, svm-vmcb-save.rax, KM_USER0); + if (!nested_vmcb) + return false; + /* nested_vmcb is our indicator if nested SVM is activated */ svm-nested.vmcb = svm-vmcb-save.rax; @@ -1861,9 +1871,11 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, svm-vmcb-control.event_inj = nested_vmcb-control.event_inj; svm-vmcb-control.event_inj_err = nested_vmcb-control.event_inj_err; + nested_svm_unmap(nested_vmcb, KM_USER0); + enable_gif(svm); - return 0; + return true; } static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb) @@ -1931,12 +1943,10 @@ static int 
vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) svm-next_rip = kvm_rip_read(svm-vcpu) + 3; skip_emulated_instruction(svm-vcpu); - if (nested_svm_do(svm, svm-vmcb-save.rax, 0, - NULL, nested_svm_vmrun)) + if (!nested_svm_vmrun(svm)) return 1; - if (nested_svm_do(svm, svm-nested.vmcb_msrpm, 0, - NULL, nested_svm_vmrun_msrpm)) + if (!nested_svm_vmrun_msrpm(svm)) return 1; return 1; -- 1.6.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 36/47] KVM: VMX: Adjust rflags if in real mode emulation
We set rflags.vm86 when virtualizing real mode to do through vm8086 mode; so we need to take it out again when reading rflags. Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/vmx.c |7 ++- 1 files changed, 6 insertions(+), 1 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 31c3a87..2b7e7bd 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -781,7 +781,12 @@ static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) { - return vmcs_readl(GUEST_RFLAGS); + unsigned long rflags; + + rflags = vmcs_readl(GUEST_RFLAGS); + if (to_vmx(vcpu)-rmode.vm86_active) + rflags = ~(unsigned long)(X86_EFLAGS_IOPL | X86_EFLAGS_VM); + return rflags; } static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) -- 1.6.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 34/47] KVM: SVM: Drop tlb flush workaround in npt
It is no longer possible to reproduce the problem any more, so presumably it has been fixed. Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/svm.c | 13 ++--- 1 files changed, 2 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index be0f6ef..7853dd3 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1187,17 +1187,8 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) error_code = svm-vmcb-control.exit_info_1; trace_kvm_page_fault(fault_address, error_code); - /* -* FIXME: Tis shouldn't be necessary here, but there is a flush -* missing in the MMU code. Until we find this bug, flush the -* complete TLB here on an NPF -*/ - if (npt_enabled) - svm_flush_tlb(svm-vcpu); - else { - if (kvm_event_needs_reinjection(svm-vcpu)) - kvm_mmu_unprotect_page_virt(svm-vcpu, fault_address); - } + if (!npt_enabled kvm_event_needs_reinjection(svm-vcpu)) + kvm_mmu_unprotect_page_virt(svm-vcpu, fault_address); return kvm_mmu_page_fault(svm-vcpu, fault_address, error_code); } -- 1.6.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 40/47] KVM: export kvm_para.h
From: Michael S. Tsirkin m...@redhat.com kvm_para.h contains userspace interface and so should be exported. Signed-off-by: Michael S. Tsirkin m...@redhat.com Signed-off-by: Avi Kivity a...@redhat.com --- include/asm-generic/Kbuild.asm |5 + include/linux/Kbuild |4 2 files changed, 9 insertions(+), 0 deletions(-) diff --git a/include/asm-generic/Kbuild.asm b/include/asm-generic/Kbuild.asm index 290910e..96d7c98 100644 --- a/include/asm-generic/Kbuild.asm +++ b/include/asm-generic/Kbuild.asm @@ -3,6 +3,11 @@ ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/asm/kvm.h \ header-y += kvm.h endif +ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/asm/kvm_para.h \ + $(srctree)/include/asm-$(SRCARCH)/kvm_para.h),) +header-y += kvm_para.h +endif + ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/asm/a.out.h \ $(srctree)/include/asm-$(SRCARCH)/a.out.h),) unifdef-y += a.out.h diff --git a/include/linux/Kbuild b/include/linux/Kbuild index 334a359..cff4a10 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -268,6 +268,10 @@ ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/asm/kvm.h \ $(srctree)/include/asm-$(SRCARCH)/kvm.h),) unifdef-y += kvm.h endif +ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/asm/kvm_para.h \ + $(srctree)/include/asm-$(SRCARCH)/kvm_para.h),) +unifdef-y += kvm_para.h +endif unifdef-y += llc.h unifdef-y += loop.h unifdef-y += lp.h -- 1.6.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 30/47] KVM: SVM: move nested_svm_intr main logic out of if-clause
From: Joerg Roedel joerg.roe...@amd.com This patch removes one indentation level from nested_svm_intr and makes the logic more readable. Signed-off-by: Joerg Roedel joerg.roe...@amd.com Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/svm.c | 21 +++-- 1 files changed, 11 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f275d77..ff04a4b 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1374,19 +1374,20 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, static inline int nested_svm_intr(struct vcpu_svm *svm) { - if (is_nested(svm)) { - if (!(svm-vcpu.arch.hflags HF_VINTR_MASK)) - return 0; + if (!is_nested(svm)) + return 0; - if (!(svm-vcpu.arch.hflags HF_HIF_MASK)) - return 0; + if (!(svm-vcpu.arch.hflags HF_VINTR_MASK)) + return 0; - svm-vmcb-control.exit_code = SVM_EXIT_INTR; + if (!(svm-vcpu.arch.hflags HF_HIF_MASK)) + return 0; - if (nested_svm_exit_handled(svm)) { - nsvm_printk(VMexit - INTR\n); - return 1; - } + svm-vmcb-control.exit_code = SVM_EXIT_INTR; + + if (nested_svm_exit_handled(svm)) { + nsvm_printk(VMexit - INTR\n); + return 1; } return 0; -- 1.6.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 37/47] KVM: When switching to a vm8086 task, load segments as 16-bit
From: Anthony Liguori aligu...@us.ibm.com According to 16.2.5 in the SDM, eflags.vm in the tss is consulted before loading and new segments. If eflags.vm == 1, then the segments are treated as 16-bit segments. The LDTR and TR are not normally available in vm86 mode so if they happen to somehow get loaded, they need to be treated as 32-bit segments. This fixes an invalid vmentry failure in a custom OS that was happening after a task switch into vm8086 mode. Since the segments were being mistakenly treated as 32-bit, we loaded garbage state. Signed-off-by: Anthony Liguori aligu...@us.ibm.com Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/x86.c |9 - 1 files changed, 8 insertions(+), 1 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 31bf984..1aa7e6d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4101,12 +4101,19 @@ static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int se return 0; } +static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg) +{ + return (seg != VCPU_SREG_LDTR) + (seg != VCPU_SREG_TR) + (kvm_x86_ops-get_rflags(vcpu) X86_EFLAGS_VM); +} + int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int type_bits, int seg) { struct kvm_segment kvm_seg; - if (!(vcpu-arch.cr0 X86_CR0_PE)) + if (is_vm86_segment(vcpu, seg) || !(vcpu-arch.cr0 X86_CR0_PE)) return kvm_load_realmode_segment(vcpu, selector, seg); if (load_segment_descriptor_to_kvm_desct(vcpu, selector, kvm_seg)) return 1; -- 1.6.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 41/47] KVM: Add __KERNEL__ guards to exported headers
Signed-off-by: Avi Kivity a...@redhat.com --- arch/ia64/include/asm/kvm_para.h |4 arch/s390/include/asm/kvm_para.h |4 2 files changed, 8 insertions(+), 0 deletions(-) diff --git a/arch/ia64/include/asm/kvm_para.h b/arch/ia64/include/asm/kvm_para.h index 0d6d8ca..1588aee 100644 --- a/arch/ia64/include/asm/kvm_para.h +++ b/arch/ia64/include/asm/kvm_para.h @@ -19,9 +19,13 @@ * */ +#ifdef __KERNEL__ + static inline unsigned int kvm_arch_para_features(void) { return 0; } #endif + +#endif diff --git a/arch/s390/include/asm/kvm_para.h b/arch/s390/include/asm/kvm_para.h index 2c50379..6964db2 100644 --- a/arch/s390/include/asm/kvm_para.h +++ b/arch/s390/include/asm/kvm_para.h @@ -13,6 +13,8 @@ #ifndef __S390_KVM_PARA_H #define __S390_KVM_PARA_H +#ifdef __KERNEL__ + /* * Hypercalls for KVM on s390. The calling convention is similar to the * s390 ABI, so we use R2-R6 for parameters 1-5. In addition we use R1 @@ -147,4 +149,6 @@ static inline unsigned int kvm_arch_para_features(void) return 0; } +#endif + #endif /* __S390_KVM_PARA_H */ -- 1.6.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 32/47] KVM: SVM: enable nested svm by default
From: Joerg Roedel joerg.roe...@amd.com Nested SVM is (in my experience) stable enough to be enabled by default. So omit the requirement to pass a module parameter. Signed-off-by: Joerg Roedel joerg.roe...@amd.com Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/svm.c |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 825035e..be0f6ef 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -124,7 +124,7 @@ static int npt = 1; module_param(npt, int, S_IRUGO); -static int nested = 0; +static int nested = 1; module_param(nested, int, S_IRUGO); static void svm_flush_tlb(struct kvm_vcpu *vcpu); -- 1.6.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 44/47] KVM: Use kvm_{read,write}_guest_virt() to read and write segment descriptors
From: Mikhail Ershov mike.ers...@gmail.com Segment descriptors tables can be placed on two non-contiguous pages. This patch makes reading segment descriptors by linear address. Signed-off-by: Mikhail Ershov mike.ers...@gmail.com Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/x86.c | 10 ++ 1 files changed, 2 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c0e9427..59a8ba4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4021,7 +4021,6 @@ static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu, static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, struct desc_struct *seg_desc) { - gpa_t gpa; struct descriptor_table dtable; u16 index = selector 3; @@ -4031,16 +4030,13 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, kvm_queue_exception_e(vcpu, GP_VECTOR, selector 0xfffc); return 1; } - gpa = vcpu-arch.mmu.gva_to_gpa(vcpu, dtable.base); - gpa += index * 8; - return kvm_read_guest(vcpu-kvm, gpa, seg_desc, 8); + return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); } /* allowed just for 8 bytes segments */ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, struct desc_struct *seg_desc) { - gpa_t gpa; struct descriptor_table dtable; u16 index = selector 3; @@ -4048,9 +4044,7 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, if (dtable.limit index * 8 + 7) return 1; - gpa = vcpu-arch.mmu.gva_to_gpa(vcpu, dtable.base); - gpa += index * 8; - return kvm_write_guest(vcpu-kvm, gpa, seg_desc, 8); + return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); } static u32 get_tss_base_addr(struct kvm_vcpu *vcpu, -- 1.6.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 42/47] KVM: Add missing #include
Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/include/asm/kvm_para.h |2 ++ 1 files changed, 2 insertions(+), 0 deletions(-) diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index b8a3305..c584076 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_KVM_PARA_H #define _ASM_X86_KVM_PARA_H +#include linux/types.h + /* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It * should be used to determine that a VM is running under KVM. */ -- 1.6.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 01/47] KVM: remove superfluous NULL pointer check in kvm_inject_pit_timer_irqs()
From: Bartlomiej Zolnierkiewicz bzoln...@gmail.com This takes care of the following entries from Dan's list: arch/x86/kvm/i8254.c +714 kvm_inject_pit_timer_irqs(6) warning: variable derefenced in initializer 'vcpu' arch/x86/kvm/i8254.c +714 kvm_inject_pit_timer_irqs(6) warning: variable derefenced before check 'vcpu' Reported-by: Dan Carpenter erro...@gmail.com Cc: cor...@lwn.net Cc: e...@redhat.com Cc: Julia Lawall ju...@diku.dk Signed-off-by: Bartlomiej Zolnierkiewicz bzoln...@gmail.com Acked-by: Sheng Yang sh...@linux.intel.com Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/i8254.c |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 472653c..82ad523 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -713,7 +713,7 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) struct kvm *kvm = vcpu-kvm; struct kvm_kpit_state *ps; - if (vcpu pit) { + if (pit) { int inject = 0; ps = pit-pit_state; -- 1.6.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 15/47] KVM: SVM: copy only necessary parts of the control area on vmrun/vmexit
From: Joerg Roedel joerg.roe...@amd.com The vmcb control area contains more then 800 bytes of reserved fields which are unnecessarily copied. Fix this by introducing a copy function which only copies the relevant part and saves time. Signed-off-by: Joerg Roedel joerg.roe...@amd.com Acked-by: Alexander Graf ag...@suse.de Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/svm.c | 36 ++-- 1 files changed, 34 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f11f880..df795bc 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1567,6 +1567,38 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override) nested_svm_exit_handled_real); } +static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb) +{ + struct vmcb_control_area *dst = dst_vmcb-control; + struct vmcb_control_area *from = from_vmcb-control; + + dst-intercept_cr_read= from-intercept_cr_read; + dst-intercept_cr_write = from-intercept_cr_write; + dst-intercept_dr_read= from-intercept_dr_read; + dst-intercept_dr_write = from-intercept_dr_write; + dst-intercept_exceptions = from-intercept_exceptions; + dst-intercept= from-intercept; + dst-iopm_base_pa = from-iopm_base_pa; + dst-msrpm_base_pa= from-msrpm_base_pa; + dst-tsc_offset = from-tsc_offset; + dst-asid = from-asid; + dst-tlb_ctl = from-tlb_ctl; + dst-int_ctl = from-int_ctl; + dst-int_vector = from-int_vector; + dst-int_state= from-int_state; + dst-exit_code= from-exit_code; + dst-exit_code_hi = from-exit_code_hi; + dst-exit_info_1 = from-exit_info_1; + dst-exit_info_2 = from-exit_info_2; + dst-exit_int_info= from-exit_int_info; + dst-exit_int_info_err= from-exit_int_info_err; + dst-nested_ctl = from-nested_ctl; + dst-event_inj= from-event_inj; + dst-event_inj_err= from-event_inj_err; + dst-nested_cr3 = from-nested_cr3; + dst-lbr_ctl = from-lbr_ctl; +} + static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1, void *arg2, void *opaque) { @@ 
-1612,7 +1644,7 @@ static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1, nested_vmcb-control.int_ctl = ~V_INTR_MASKING_MASK; /* Restore the original control entries */ - svm-vmcb-control = hsave-control; + copy_vmcb_control_area(vmcb, hsave); /* Kill any pending exceptions */ if (svm-vcpu.arch.exception.pending == true) @@ -1710,7 +1742,7 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, else hsave-save.cr3= svm-vcpu.arch.cr3; - hsave-control = vmcb-control; + copy_vmcb_control_area(hsave, vmcb); if (svm-vmcb-save.rflags X86_EFLAGS_IF) svm-vcpu.arch.hflags |= HF_HIF_MASK; -- 1.6.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 14/47] KVM: SVM: optimize nested vmrun
From: Joerg Roedel joerg.roe...@amd.com Only copy the necessary parts of the vmcb save area on vmrun and save precious time. Signed-off-by: Joerg Roedel joerg.roe...@amd.com Acked-by: Alexander Graf ag...@suse.de Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/svm.c | 28 +--- 1 files changed, 21 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 2f5f223..f11f880 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1681,6 +1681,7 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, { struct vmcb *nested_vmcb = (struct vmcb *)arg1; struct vmcb *hsave = svm-hsave; + struct vmcb *vmcb = svm-vmcb; /* nested_vmcb is our indicator if nested SVM is activated */ svm-nested_vmcb = svm-vmcb-save.rax; @@ -1691,12 +1692,25 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, /* Save the old vmcb, so we don't need to pick what we save, but can restore everything when a VMEXIT occurs */ - memcpy(hsave, svm-vmcb, sizeof(struct vmcb)); - /* We need to remember the original CR3 in the SPT case */ - if (!npt_enabled) - hsave-save.cr3 = svm-vcpu.arch.cr3; - hsave-save.cr4 = svm-vcpu.arch.cr4; - hsave-save.rip = svm-next_rip; + hsave-save.es = vmcb-save.es; + hsave-save.cs = vmcb-save.cs; + hsave-save.ss = vmcb-save.ss; + hsave-save.ds = vmcb-save.ds; + hsave-save.gdtr = vmcb-save.gdtr; + hsave-save.idtr = vmcb-save.idtr; + hsave-save.efer = svm-vcpu.arch.shadow_efer; + hsave-save.cr0= svm-vcpu.arch.cr0; + hsave-save.cr4= svm-vcpu.arch.cr4; + hsave-save.rflags = vmcb-save.rflags; + hsave-save.rip= svm-next_rip; + hsave-save.rsp= vmcb-save.rsp; + hsave-save.rax= vmcb-save.rax; + if (npt_enabled) + hsave-save.cr3= vmcb-save.cr3; + else + hsave-save.cr3= svm-vcpu.arch.cr3; + + hsave-control = vmcb-control; if (svm-vmcb-save.rflags X86_EFLAGS_IF) svm-vcpu.arch.hflags |= HF_HIF_MASK; @@ -1721,7 +1735,7 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, kvm_set_cr3(svm-vcpu, nested_vmcb-save.cr3); 
kvm_mmu_reset_context(svm-vcpu); } - svm-vmcb-save.cr2 = nested_vmcb-save.cr2; + svm-vmcb-save.cr2 = svm-vcpu.arch.cr2 = nested_vmcb-save.cr2; kvm_register_write(svm-vcpu, VCPU_REGS_RAX, nested_vmcb-save.rax); kvm_register_write(svm-vcpu, VCPU_REGS_RSP, nested_vmcb-save.rsp); kvm_register_write(svm-vcpu, VCPU_REGS_RIP, nested_vmcb-save.rip); -- 1.6.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 13/47] KVM: SVM: optimize nested #vmexit
From: Joerg Roedel joerg.roe...@amd.com It is more efficient to copy only the relevant parts of the vmcb back to the nested vmcb when we emulate an vmexit. Signed-off-by: Joerg Roedel joerg.roe...@amd.com Acked-by: Alexander Graf ag...@suse.de Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/svm.c | 68 +-- 1 files changed, 33 insertions(+), 35 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 9f72772..2f5f223 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1572,53 +1572,52 @@ static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1, { struct vmcb *nested_vmcb = (struct vmcb *)arg1; struct vmcb *hsave = svm-hsave; - u64 nested_save[] = { nested_vmcb-save.cr0, - nested_vmcb-save.cr3, - nested_vmcb-save.cr4, - nested_vmcb-save.efer, - nested_vmcb-control.intercept_cr_read, - nested_vmcb-control.intercept_cr_write, - nested_vmcb-control.intercept_dr_read, - nested_vmcb-control.intercept_dr_write, - nested_vmcb-control.intercept_exceptions, - nested_vmcb-control.intercept, - nested_vmcb-control.msrpm_base_pa, - nested_vmcb-control.iopm_base_pa, - nested_vmcb-control.tsc_offset }; + struct vmcb *vmcb = svm-vmcb; /* Give the current vmcb to the guest */ - memcpy(nested_vmcb, svm-vmcb, sizeof(struct vmcb)); - nested_vmcb-save.cr0 = nested_save[0]; - if (!npt_enabled) - nested_vmcb-save.cr3 = nested_save[1]; - nested_vmcb-save.cr4 = nested_save[2]; - nested_vmcb-save.efer = nested_save[3]; - nested_vmcb-control.intercept_cr_read = nested_save[4]; - nested_vmcb-control.intercept_cr_write = nested_save[5]; - nested_vmcb-control.intercept_dr_read = nested_save[6]; - nested_vmcb-control.intercept_dr_write = nested_save[7]; - nested_vmcb-control.intercept_exceptions = nested_save[8]; - nested_vmcb-control.intercept = nested_save[9]; - nested_vmcb-control.msrpm_base_pa = nested_save[10]; - nested_vmcb-control.iopm_base_pa = nested_save[11]; - nested_vmcb-control.tsc_offset = nested_save[12]; + disable_gif(svm); + + 
nested_vmcb-save.es = vmcb-save.es; + nested_vmcb-save.cs = vmcb-save.cs; + nested_vmcb-save.ss = vmcb-save.ss; + nested_vmcb-save.ds = vmcb-save.ds; + nested_vmcb-save.gdtr = vmcb-save.gdtr; + nested_vmcb-save.idtr = vmcb-save.idtr; + if (npt_enabled) + nested_vmcb-save.cr3= vmcb-save.cr3; + nested_vmcb-save.cr2= vmcb-save.cr2; + nested_vmcb-save.rflags = vmcb-save.rflags; + nested_vmcb-save.rip= vmcb-save.rip; + nested_vmcb-save.rsp= vmcb-save.rsp; + nested_vmcb-save.rax= vmcb-save.rax; + nested_vmcb-save.dr7= vmcb-save.dr7; + nested_vmcb-save.dr6= vmcb-save.dr6; + nested_vmcb-save.cpl= vmcb-save.cpl; + + nested_vmcb-control.int_ctl = vmcb-control.int_ctl; + nested_vmcb-control.int_vector= vmcb-control.int_vector; + nested_vmcb-control.int_state = vmcb-control.int_state; + nested_vmcb-control.exit_code = vmcb-control.exit_code; + nested_vmcb-control.exit_code_hi = vmcb-control.exit_code_hi; + nested_vmcb-control.exit_info_1 = vmcb-control.exit_info_1; + nested_vmcb-control.exit_info_2 = vmcb-control.exit_info_2; + nested_vmcb-control.exit_int_info = vmcb-control.exit_int_info; + nested_vmcb-control.exit_int_info_err = vmcb-control.exit_int_info_err; + nested_vmcb-control.tlb_ctl = 0; + nested_vmcb-control.event_inj = 0; + nested_vmcb-control.event_inj_err = 0; /* We always set V_INTR_MASKING and remember the old value in hflags */ if (!(svm-vcpu.arch.hflags HF_VINTR_MASK)) nested_vmcb-control.int_ctl = ~V_INTR_MASKING_MASK; - if ((nested_vmcb-control.int_ctl V_IRQ_MASK) - (nested_vmcb-control.int_vector)) { - nsvm_printk(WARNING: IRQ 0x%x still enabled on #VMEXIT\n, - nested_vmcb-control.int_vector); - } - /* Restore the original control entries */ svm-vmcb-control = hsave-control; /* Kill any pending exceptions */ if (svm-vcpu.arch.exception.pending == true) nsvm_printk(WARNING: Pending Exception\n); + kvm_clear_exception_queue(svm-vcpu); kvm_clear_interrupt_queue(svm-vcpu); @@ -1646,7 +1645,6 @@ static int nested_svm_vmexit_real(struct vcpu_svm *svm, void
[PATCH 08/47] KVM: Call kvm_vcpu_kick() inside pic spinlock
From: Gleb Natapov g...@redhat.com d5ecfdd25 moved it out because back than it was impossible to call it inside spinlock. This restriction no longer exists. Signed-off-by: Gleb Natapov g...@redhat.com Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/i8259.c | 10 +- arch/x86/kvm/irq.h |1 - 2 files changed, 1 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index daf4606..d27320c 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -43,11 +43,9 @@ static void pic_unlock(struct kvm_pic *s) { struct kvm *kvm = s-kvm; unsigned acks = s-pending_acks; - bool wakeup = s-wakeup_needed; struct kvm_vcpu *vcpu; s-pending_acks = 0; - s-wakeup_needed = false; spin_unlock(s-lock); @@ -56,12 +54,6 @@ static void pic_unlock(struct kvm_pic *s) __ffs(acks)); acks = acks - 1; } - - if (wakeup) { - vcpu = s-kvm-bsp_vcpu; - if (vcpu) - kvm_vcpu_kick(vcpu); - } } static void pic_clear_isr(struct kvm_kpic_state *s, int irq) @@ -527,7 +519,7 @@ static void pic_irq_request(void *opaque, int level) s-output = level; if (vcpu level (s-pics[0].isr_ack (1 irq))) { s-pics[0].isr_ack = ~(1 irq); - s-wakeup_needed = true; + kvm_vcpu_kick(vcpu); } } diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 9f59318..7d6058a 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -63,7 +63,6 @@ struct kvm_kpic_state { struct kvm_pic { spinlock_t lock; - bool wakeup_needed; unsigned pending_acks; struct kvm *kvm; struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ -- 1.6.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 23/47] KVM: SVM: clean up nested_svm_exit_handled_msr
From: Joerg Roedel joerg.roe...@amd.com This patch changes nested svm to call nested_svm_exit_handled_msr directly and not through nested_svm_do. Signed-off-by: Joerg Roedel joerg.roe...@amd.com Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/svm.c | 37 ++--- 1 files changed, 22 insertions(+), 15 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 5e55a1b..518d578 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1483,15 +1483,20 @@ static int nested_svm_do(struct vcpu_svm *svm, return retval; } -static int nested_svm_exit_handled_msr(struct vcpu_svm *svm, - void *arg1, void *arg2, - void *opaque) +static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm) { - struct vmcb *nested_vmcb = (struct vmcb *)arg1; - u8 *msrpm = (u8 *)arg2; - u32 t0, t1; - u32 msr = svm-vcpu.arch.regs[VCPU_REGS_RCX]; u32 param = svm-vmcb-control.exit_info_1 1; + u32 msr = svm-vcpu.arch.regs[VCPU_REGS_RCX]; + struct vmcb *nested_vmcb; + bool ret = false; + u32 t0, t1; + u8 *msrpm; + + nested_vmcb = nested_svm_map(svm, svm-nested.vmcb, KM_USER0); + msrpm = nested_svm_map(svm, svm-nested.vmcb_msrpm, KM_USER1); + + if (!nested_vmcb || !msrpm) + goto out; if (!(nested_vmcb-control.intercept (1ULL INTERCEPT_MSR_PROT))) return 0; @@ -1512,13 +1517,17 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm, t0 %= 8; break; default: - return 1; - break; + ret = true; + goto out; } - if (msrpm[t1] ((1 param) t0)) - return 1; - return 0; + ret = msrpm[t1] ((1 param) t0); + +out: + nested_svm_unmap(nested_vmcb, KM_USER0); + nested_svm_unmap(msrpm, KM_USER1); + + return ret; } static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override) @@ -1548,9 +1557,7 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override) switch (exit_code) { case SVM_EXIT_MSR: - if (nested_svm_do(svm, svm-nested.vmcb, svm-nested.vmcb_msrpm, - NULL, nested_svm_exit_handled_msr)) - vmexit = true; + vmexit = nested_svm_exit_handled_msr(svm); 
break; case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: { u32 cr_bits = 1 (exit_code - SVM_EXIT_READ_CR0); -- 1.6.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 17/47] KVM: SVM: move nested svm state into separate struct
From: Joerg Roedel joerg.roe...@amd.com This makes it more clear for which purpose these members in the vcpu_svm exist. Signed-off-by: Joerg Roedel joerg.roe...@amd.com Acked-by: Alexander Graf ag...@suse.de Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/svm.c | 62 +++ 1 files changed, 33 insertions(+), 29 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 825b825..fbadaa7 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -70,6 +70,18 @@ static const u32 host_save_user_msrs[] = { struct kvm_vcpu; +struct nested_state { + struct vmcb *hsave; + u64 hsave_msr; + u64 vmcb; + + /* These are the merged vectors */ + u32 *msrpm; + + /* gpa pointers to the real vectors */ + u64 vmcb_msrpm; +}; + struct vcpu_svm { struct kvm_vcpu vcpu; struct vmcb *vmcb; @@ -85,16 +97,8 @@ struct vcpu_svm { u64 host_gs_base; u32 *msrpm; - struct vmcb *hsave; - u64 hsave_msr; - - u64 nested_vmcb; - /* These are the merged vectors */ - u32 *nested_msrpm; - - /* gpa pointers to the real vectors */ - u64 nested_vmcb_msrpm; + struct nested_state nested; }; /* enable NPT for AMD64 and X86 with PAE */ @@ -127,7 +131,7 @@ static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) static inline bool is_nested(struct vcpu_svm *svm) { - return svm-nested_vmcb; + return svm-nested.vmcb; } static inline void enable_gif(struct vcpu_svm *svm) @@ -636,7 +640,7 @@ static void init_vmcb(struct vcpu_svm *svm) } force_new_asid(svm-vcpu); - svm-nested_vmcb = 0; + svm-nested.vmcb = 0; svm-vcpu.arch.hflags = 0; enable_gif(svm); @@ -699,9 +703,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) hsave_page = alloc_page(GFP_KERNEL); if (!hsave_page) goto uninit; - svm-hsave = page_address(hsave_page); + svm-nested.hsave = page_address(hsave_page); - svm-nested_msrpm = page_address(nested_msrpm_pages); + svm-nested.msrpm = page_address(nested_msrpm_pages); svm-vmcb = page_address(page); clear_page(svm-vmcb); @@ -731,8 +735,8 @@ static void 
svm_free_vcpu(struct kvm_vcpu *vcpu) __free_page(pfn_to_page(svm-vmcb_pa PAGE_SHIFT)); __free_pages(virt_to_page(svm-msrpm), MSRPM_ALLOC_ORDER); - __free_page(virt_to_page(svm-hsave)); - __free_pages(virt_to_page(svm-nested_msrpm), MSRPM_ALLOC_ORDER); + __free_page(virt_to_page(svm-nested.hsave)); + __free_pages(virt_to_page(svm-nested.msrpm), MSRPM_ALLOC_ORDER); kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, svm); } @@ -1558,13 +1562,13 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override) switch (svm-vmcb-control.exit_code) { case SVM_EXIT_MSR: - return nested_svm_do(svm, svm-nested_vmcb, -svm-nested_vmcb_msrpm, NULL, + return nested_svm_do(svm, svm-nested.vmcb, +svm-nested.vmcb_msrpm, NULL, nested_svm_exit_handled_msr); default: break; } - return nested_svm_do(svm, svm-nested_vmcb, 0, k, + return nested_svm_do(svm, svm-nested.vmcb, 0, k, nested_svm_exit_handled_real); } @@ -1604,7 +1608,7 @@ static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1, void *arg2, void *opaque) { struct vmcb *nested_vmcb = (struct vmcb *)arg1; - struct vmcb *hsave = svm-hsave; + struct vmcb *hsave = svm-nested.hsave; struct vmcb *vmcb = svm-vmcb; /* Give the current vmcb to the guest */ @@ -1679,7 +1683,7 @@ static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1, svm-vmcb-control.exit_int_info = 0; /* Exit nested SVM mode */ - svm-nested_vmcb = 0; + svm-nested.vmcb = 0; return 0; } @@ -1687,7 +1691,7 @@ static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1, static int nested_svm_vmexit(struct vcpu_svm *svm) { nsvm_printk(VMexit\n); - if (nested_svm_do(svm, svm-nested_vmcb, 0, + if (nested_svm_do(svm, svm-nested.vmcb, 0, NULL, nested_svm_vmexit_real)) return 1; @@ -1703,8 +1707,8 @@ static int nested_svm_vmrun_msrpm(struct vcpu_svm *svm, void *arg1, int i; u32 *nested_msrpm = (u32*)arg1; for (i=0; i PAGE_SIZE * (1 MSRPM_ALLOC_ORDER) / 4; i++) - svm-nested_msrpm[i] = svm-msrpm[i] | nested_msrpm[i]; - 
svm-vmcb-control.msrpm_base_pa = __pa(svm-nested_msrpm); + svm-nested.msrpm[i] = svm-msrpm[i] | nested_msrpm[i]; + svm-vmcb-control.msrpm_base_pa =
[PATCH 12/47] KVM: SVM: add helper functions for global interrupt flag
From: Joerg Roedel joerg.roe...@amd.com This patch makes the code easier to read when it comes to setting, clearing and checking the status of the virtualized global interrupt flag for the VCPU. Signed-off-by: Joerg Roedel joerg.roe...@amd.com Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/svm.c | 33 + 1 files changed, 25 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 10e718d..9f72772 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -129,6 +129,21 @@ static inline bool is_nested(struct vcpu_svm *svm) return svm-nested_vmcb; } +static inline void enable_gif(struct vcpu_svm *svm) +{ + svm-vcpu.arch.hflags |= HF_GIF_MASK; +} + +static inline void disable_gif(struct vcpu_svm *svm) +{ + svm-vcpu.arch.hflags = ~HF_GIF_MASK; +} + +static inline bool gif_set(struct vcpu_svm *svm) +{ + return !!(svm-vcpu.arch.hflags HF_GIF_MASK); +} + static unsigned long iopm_base; struct kvm_ldttss_desc { @@ -621,7 +636,9 @@ static void init_vmcb(struct vcpu_svm *svm) force_new_asid(svm-vcpu); svm-nested_vmcb = 0; - svm-vcpu.arch.hflags = HF_GIF_MASK; + svm-vcpu.arch.hflags = 0; + + enable_gif(svm); } static int svm_vcpu_reset(struct kvm_vcpu *vcpu) @@ -1629,7 +1646,7 @@ static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1, svm-vmcb-save.cpl = 0; svm-vmcb-control.exit_int_info = 0; - svm-vcpu.arch.hflags = ~HF_GIF_MASK; + disable_gif(svm); /* Exit nested SVM mode */ svm-nested_vmcb = 0; @@ -1761,7 +1778,7 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1, svm-vmcb-control.event_inj = nested_vmcb-control.event_inj; svm-vmcb-control.event_inj_err = nested_vmcb-control.event_inj_err; - svm-vcpu.arch.hflags |= HF_GIF_MASK; + enable_gif(svm); return 0; } @@ -1850,7 +1867,7 @@ static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) svm-next_rip = kvm_rip_read(svm-vcpu) + 3; skip_emulated_instruction(svm-vcpu); - svm-vcpu.arch.hflags |= HF_GIF_MASK; + enable_gif(svm); return 1; } @@ 
-1863,7 +1880,7 @@ static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) svm-next_rip = kvm_rip_read(svm-vcpu) + 3; skip_emulated_instruction(svm-vcpu); - svm-vcpu.arch.hflags = ~HF_GIF_MASK; + disable_gif(svm); /* After a CLGI no interrupts should come */ svm_clear_vintr(svm); @@ -2352,7 +2369,7 @@ static void svm_set_irq(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - BUG_ON(!(svm-vcpu.arch.hflags HF_GIF_MASK)); + BUG_ON(!(gif_set(svm))); svm-vmcb-control.event_inj = vcpu-arch.interrupt.nr | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; @@ -2383,7 +2400,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) struct vmcb *vmcb = svm-vmcb; return (vmcb-save.rflags X86_EFLAGS_IF) !(vmcb-control.int_state SVM_INTERRUPT_SHADOW_MASK) - (svm-vcpu.arch.hflags HF_GIF_MASK) + gif_set(svm) !is_nested(svm); } @@ -2398,7 +2415,7 @@ static void enable_irq_window(struct kvm_vcpu *vcpu) * GIF becomes 1, because that's a separate STGI/VMRUN intercept. * The next time we get that intercept, this function will be * called again though and we'll get the vintr intercept. */ - if (svm-vcpu.arch.hflags HF_GIF_MASK) { + if (gif_set(svm)) { svm_set_vintr(svm); svm_inject_irq(svm, 0x0); } -- 1.6.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 02/47] KVM: MMU: make __kvm_mmu_free_some_pages handle empty list
From: Izik Eidus iei...@redhat.com First check if the list is empty before attempting to look at list entries. Signed-off-by: Izik Eidus iei...@redhat.com Signed-off-by: Marcelo Tosatti mtosa...@redhat.com Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/mmu.c |3 ++- 1 files changed, 2 insertions(+), 1 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 1249c12..28be35c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2705,7 +2705,8 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { - while (vcpu-kvm-arch.n_free_mmu_pages KVM_REFILL_PAGES) { + while (vcpu-kvm-arch.n_free_mmu_pages KVM_REFILL_PAGES + !list_empty(vcpu-kvm-arch.active_mmu_pages)) { struct kvm_mmu_page *sp; sp = container_of(vcpu-kvm-arch.active_mmu_pages.prev, -- 1.6.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 10/47] KVM: Replace pic_lock()/pic_unlock() with direct call to spinlock functions
From: Gleb Natapov g...@redhat.com They are not doing anything else now. Signed-off-by: Gleb Natapov g...@redhat.com Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/i8259.c | 36 1 files changed, 12 insertions(+), 24 deletions(-) diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 3aacd33..01f1516 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -32,18 +32,6 @@ #include linux/kvm_host.h #include trace.h -static void pic_lock(struct kvm_pic *s) - __acquires(s-lock) -{ - spin_lock(s-lock); -} - -static void pic_unlock(struct kvm_pic *s) - __releases(s-lock) -{ - spin_unlock(s-lock); -} - static void pic_clear_isr(struct kvm_kpic_state *s, int irq) { s-isr = ~(1 irq); @@ -56,10 +44,10 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq) void kvm_pic_clear_isr_ack(struct kvm *kvm) { struct kvm_pic *s = pic_irqchip(kvm); - pic_lock(s); + spin_lock(s-lock); s-pics[0].isr_ack = 0xff; s-pics[1].isr_ack = 0xff; - pic_unlock(s); + spin_unlock(s-lock); } /* @@ -160,9 +148,9 @@ static void pic_update_irq(struct kvm_pic *s) void kvm_pic_update_irq(struct kvm_pic *s) { - pic_lock(s); + spin_lock(s-lock); pic_update_irq(s); - pic_unlock(s); + spin_unlock(s-lock); } int kvm_pic_set_irq(void *opaque, int irq, int level) @@ -170,14 +158,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level) struct kvm_pic *s = opaque; int ret = -1; - pic_lock(s); + spin_lock(s-lock); if (irq = 0 irq PIC_NUM_PINS) { ret = pic_set_irq1(s-pics[irq 3], irq 7, level); pic_update_irq(s); trace_kvm_pic_set_irq(irq 3, irq 7, s-pics[irq 3].elcr, s-pics[irq 3].imr, ret == 0); } - pic_unlock(s); + spin_unlock(s-lock); return ret; } @@ -205,7 +193,7 @@ int kvm_pic_read_irq(struct kvm *kvm) int irq, irq2, intno; struct kvm_pic *s = pic_irqchip(kvm); - pic_lock(s); + spin_lock(s-lock); irq = pic_get_irq(s-pics[0]); if (irq = 0) { pic_intack(s-pics[0], irq); @@ -230,7 +218,7 @@ int kvm_pic_read_irq(struct kvm *kvm) intno = s-pics[0].irq_base + irq; } 
pic_update_irq(s); - pic_unlock(s); + spin_unlock(s-lock); return intno; } @@ -448,7 +436,7 @@ static int picdev_write(struct kvm_io_device *this, printk(KERN_ERR PIC: non byte write\n); return 0; } - pic_lock(s); + spin_lock(s-lock); switch (addr) { case 0x20: case 0x21: @@ -461,7 +449,7 @@ static int picdev_write(struct kvm_io_device *this, elcr_ioport_write(s-pics[addr 1], addr, data); break; } - pic_unlock(s); + spin_unlock(s-lock); return 0; } @@ -478,7 +466,7 @@ static int picdev_read(struct kvm_io_device *this, printk(KERN_ERR PIC: non byte read\n); return 0; } - pic_lock(s); + spin_lock(s-lock); switch (addr) { case 0x20: case 0x21: @@ -492,7 +480,7 @@ static int picdev_read(struct kvm_io_device *this, break; } *(unsigned char *)val = data; - pic_unlock(s); + spin_unlock(s-lock); return 0; } -- 1.6.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 06/47] KVM: ignore reads to perfctr msrs
From: Amit Shah amit.s...@redhat.com We ignore writes to the perfctr msrs. Ignore reads as well. Kaspersky antivirus crashes Windows guests if it can't read these MSRs. Signed-off-by: Amit Shah amit.s...@redhat.com Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/x86.c |3 +++ 1 files changed, 3 insertions(+), 0 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 92b5edd..132c510 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1048,9 +1048,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_K8_SYSCFG: case MSR_K7_HWCR: case MSR_VM_HSAVE_PA: + case MSR_P6_PERFCTR0: + case MSR_P6_PERFCTR1: case MSR_P6_EVNTSEL0: case MSR_P6_EVNTSEL1: case MSR_K7_EVNTSEL0: + case MSR_K7_PERFCTR0: case MSR_K8_INT_PENDING_MSG: case MSR_AMD64_NB_CFG: case MSR_FAM10H_MMIO_CONF_BASE: -- 1.6.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 05/47] KVM: VMX: Optimize vmx_get_cpl()
Instead of calling vmx_get_segment() (which reads a whole bunch of vmcs fields), read only the cs selector which contains the cpl. Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/vmx.c |5 + 1 files changed, 1 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 32e6d20..0ba706e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1773,16 +1773,13 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, static int vmx_get_cpl(struct kvm_vcpu *vcpu) { - struct kvm_segment kvm_seg; - if (!(vcpu-arch.cr0 X86_CR0_PE)) /* if real mode */ return 0; if (vmx_get_rflags(vcpu) X86_EFLAGS_VM) /* if virtual 8086 */ return 3; - vmx_get_segment(vcpu, kvm_seg, VCPU_SREG_CS); - return kvm_seg.selector 3; + return vmcs_read16(GUEST_CS_SELECTOR) 3; } static u32 vmx_segment_access_rights(struct kvm_segment *var) -- 1.6.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/2] eventfd: new EFD_STATE flag
On 08/26/2009 01:29 PM, Michael S. Tsirkin wrote: How we wanted to solve it with EFD_STATE: Share a separate eventfd between each device and the hypervisor. device sets state to either 0 or 1. hypervisor polls all eventfds, reads interrupt line on changes, calculates the interrupt level and updates guest. Alternative solution: shared memory where each device writes interrupt line value. This makes setup more complex (need to share around much more than just an fd), and makes access from interrupt impossible unless we lock the memory (and locking userspace memory introduces yet another set of issues). For completeness: If the device is implemented in the same process as the hypervisor, an eventfd isn't really needed, as there is an ioctl which performs the same operation. An important class of device implementations is real devices that are assigned to a guest. We would like to forward the interrupt directly from the host interrupt handler to qemu. Currently, we have a kvm-specific interrupt handler that forwards the interrupt using kvm-specific interfaces. We would like to use a generic interrupt handler implemented by uio, so we want a generic interrupt transfer mechanism. uio already supports edge-triggered interrupts using an eventfd-like mechanism. So it makes sense to extend uio to support real eventfds, and to make it also support level-triggered interrupts. We can work around the lack of state eventfd by having userspace wait on whatever mechanism uio uses to make the interrupt state visible, and then use the ioctl mentioned above to inform the hypervisor of this state. But it's faster and nicer to give both components an eventfd and let them communicate directly. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 07/47] KVM: fix EFER read buffer overflow
From: Roel Kluin roel.kl...@gmail.com Check whether index is within bounds before grabbing the element. Signed-off-by: Roel Kluin roel.kl...@gmail.com Cc: Avi Kivity a...@redhat.com Signed-off-by: Andrew Morton a...@linux-foundation.org Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/kvm/vmx.c |7 +-- 1 files changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0ba706e..31c3a87 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -571,12 +571,15 @@ static void reload_tss(void) static void load_transition_efer(struct vcpu_vmx *vmx) { int efer_offset = vmx-msr_offset_efer; - u64 host_efer = vmx-host_msrs[efer_offset].data; - u64 guest_efer = vmx-guest_msrs[efer_offset].data; + u64 host_efer; + u64 guest_efer; u64 ignore_bits; if (efer_offset 0) return; + host_efer = vmx-host_msrs[efer_offset].data; + guest_efer = vmx-guest_msrs[efer_offset].data; + /* * NX is emulated; LMA and LME handled by hardware; SCE meaningless * outside long mode -- 1.6.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 00/47] KVM updates for 2.6.32 merge window (4/4)
Fourth and final batch of the 2.6.32 KVM patch queue. Amit Shah (2): KVM: ignore reads to perfctr msrs Documentation: Update KVM list email address Anthony Liguori (1): KVM: When switching to a vm8086 task, load segments as 16-bit Avi Kivity (10): KVM: VMX: Optimize vmx_get_cpl() x86: Export kmap_atomic_to_page() KVM: SVM: Drop tlb flush workaround in npt KVM: Move #endif KVM_CAP_IRQ_ROUTING to correct place KVM: VMX: Adjust rflags if in real mode emulation KVM: Rename x86_emulate.c to emulate.c KVM: Add __KERNEL__ guards to exported headers KVM: Add missing #include KVM: Protect update_cr8_intercept() when running without an apic KVM: Document KVM_CAP_IRQCHIP Bartlomiej Zolnierkiewicz (1): KVM: remove superfluous NULL pointer check in kvm_inject_pit_timer_irqs() Gleb Natapov (4): KVM: Call kvm_vcpu_kick() inside pic spinlock KVM: Call ack notifiers from PIC when guest OS acks an IRQ. KVM: Replace pic_lock()/pic_unlock() with direct call to spinlock functions KVM: Update cr8 intercept when APIC TPR is changed by userspace Izik Eidus (1): KVM: MMU: make __kvm_mmu_free_some_pages handle empty list Jan Kiszka (1): KVM: x86: Disallow hypercalls for guest callers in rings 0 Joerg Roedel (21): KVM: SVM: add helper functions for global interrupt flag KVM: SVM: optimize nested #vmexit KVM: SVM: optimize nested vmrun KVM: SVM: copy only necessary parts of the control area on vmrun/vmexit KVM: SVM: complete interrupts after handling nested exits KVM: SVM: move nested svm state into seperate struct KVM: SVM: cache nested intercepts KVM: SVM: consolidate nested_svm_exit_handled KVM: SVM: do nested vmexit in nested_svm_exit_handled KVM: SVM: simplify nested_svm_check_exception KVM: SVM: get rid of nested_svm_vmexit_real KVM: SVM: clean up nested_svm_exit_handled_msr KVM: SVM: clean up nestec vmload/vmsave paths KVM: SVM: clean up nested vmrun path KVM: SVM: remove nested_svm_do and helper functions KVM: SVM: handle errors in vmrun emulation path appropriatly KVM: SVM: move 
special nested exit handling to separate function KVM: SVM: remove unnecessary is_nested check from svm_cpu_run KVM: SVM: move nested_svm_intr main logic out of if-clause KVM: SVM: check for nested VINTR flag in svm_interrupt_allowed KVM: SVM: enable nested svm by default Marcelo Tosatti (1): KVM: MMU: fix bogus alloc_mmu_pages assignment Michael S. Tsirkin (1): KVM: export kvm_para.h Mikhail Ershov (1): KVM: Use kvm_{read,write}_guest_virt() to read and write segment descriptors Mohammed Gamal (1): KVM: x86 emulator: Add adc and sbb missing decoder flags Roel Kluin (1): KVM: fix EFER read buffer overflow Sheng Yang (1): KVM: VMX: Fix EPT with WP bit change during paging Documentation/ioctl/ioctl-number.txt |2 +- Documentation/kvm/api.txt | 76 +++ arch/ia64/include/asm/kvm_para.h |4 + arch/s390/include/asm/kvm_para.h |4 + .../asm/{kvm_x86_emulate.h = kvm_emulate.h} |0 arch/x86/include/asm/kvm_host.h|2 +- arch/x86/include/asm/kvm_para.h|2 + arch/x86/kvm/Makefile |2 +- arch/x86/kvm/{x86_emulate.c = emulate.c} |8 +- arch/x86/kvm/i8254.c |2 +- arch/x86/kvm/i8259.c | 64 +-- arch/x86/kvm/irq.h |1 - arch/x86/kvm/mmu.c | 11 +- arch/x86/kvm/svm.c | 650 +++- arch/x86/kvm/vmx.c | 25 +- arch/x86/kvm/x86.c | 35 +- arch/x86/mm/highmem_32.c |1 + include/asm-generic/Kbuild.asm |5 + include/linux/Kbuild |4 + include/linux/kvm_para.h |1 + virt/kvm/kvm_main.c|2 +- 21 files changed, 525 insertions(+), 376 deletions(-) rename arch/x86/include/asm/{kvm_x86_emulate.h = kvm_emulate.h} (100%) rename arch/x86/kvm/{x86_emulate.c = emulate.c} (99%) -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 11/47] x86: Export kmap_atomic_to_page()
Needed by KVM. Signed-off-by: Avi Kivity a...@redhat.com --- arch/x86/mm/highmem_32.c |1 + 1 files changed, 1 insertions(+), 0 deletions(-) diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 2112ed5..572f47c 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -104,6 +104,7 @@ EXPORT_SYMBOL(kunmap); EXPORT_SYMBOL(kmap_atomic); EXPORT_SYMBOL(kunmap_atomic); EXPORT_SYMBOL(kmap_atomic_prot); +EXPORT_SYMBOL(kmap_atomic_to_page); void __init set_highmem_pages_init(void) { -- 1.6.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v4 0/9] make interrupt injection lockless (almost)
On 08/24/2009 11:54 AM, Gleb Natapov wrote: kvm-irq_lock protects too much stuff, but still fail to protect everything it was design to protect (see ack notifiers call in pic). I want to make IRQ injection fast path as lockless as possible. This patch series removes kvm-irq_lock from irq injection path effectively making interrupt injection to lapic lockless (several kvm_irq_delivery_to_apic() may run in parallel), but access to lapic was never fully locked in the first place. VCPU could access lapic in parallel with interrupt injection. Patches 2-3 changes irq routing data structure to much more efficient one. Applied all, thanks. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Extending virtio_console to support multiple ports
On (Tue) Aug 25 2009 [11:47:20], Amit Shah wrote: Hello all, Here is a new iteration of the patch series that implements a transport for guest and host communications. The code has been updated to reuse the virtio-console device instead of creating a new virtio-serial device. And the problem now is that hvc calls the put_chars function with spinlocks held and we now allocate pages in send_buf(), called from put_chars. A few solutions: - Keep things as they are, virtio_console.c remains as it is and virtio_serial.c gets added - Have separate write paths for console devices in virtio_console.c, which would beat the purpose of merging the two drivers and then they'd be better off standalone - Convert hvc's usage of spinlocks to mutexes. I've no idea how this will play out; I'm no expert here. But I did try doing this and so far it all looks OK. No lockups, lockdep warnings, nothing. I have full debugging enabled. But this doesn't mean it's right. Comments? Amit -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] virtio-blk: set QUEUE_ORDERED_DRAIN by default
On Tue, 25 Aug 2009 11:46:08 pm Christoph Hellwig wrote: On Tue, Aug 25, 2009 at 11:41:37PM +0930, Rusty Russell wrote: On Fri, 21 Aug 2009 06:26:16 am Christoph Hellwig wrote: Currently virtio-blk doesn't set any QUEUE_ORDERED_ flag by default, which means it does not allow filesystems to use barriers. But the typical use case for virtio-blk is to use a backed that uses synchronous I/O Really? Does qemu open with O_SYNC? I'm definitely no block expert, but this seems strange... Rusty. Qemu can open it various ways, but the only one that is fully safe is O_SYNC (cache=writethrough). (Rusty goes away and reads the qemu man page). By default, if no explicit caching is specified for a qcow2 disk image, cache=writeback will be used. Are you claiming qcow2 is unusual? I can believe snapshot is less common, though I use it all the time. You'd normally have to add a feature for something like this. I don't think this is different. Sorry, Rusty. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Page allocation failures in guest
On Wed, 26 Aug 2009 02:25:01 pm Pierre Ossman wrote: On Wed, 26 Aug 2009 11:47:17 +0930 Rusty Russell ru...@rustcorp.com.au wrote: On Fri, 14 Aug 2009 05:55:48 am Pierre Ossman wrote: On Wed, 12 Aug 2009 15:01:52 +0930 Rusty Russell ru...@rustcorp.com.au wrote: Subject: virtio: net refill on out-of-memory ... Patch applied. Now we wait. :) Any results? It's been up for 12 days, so I'd say it works. But there is nothing in dmesg, which suggests I haven't triggered the condition yet. No, that's totally expected. I wouldn't expect a GFP_ATOMIC order 0 alloc failure to be noted, and the patch doesn't add any printks. Dave, can you push this to Linus ASAP? Thanks, Rusty. Subject: virtio: net refill on out-of-memory If we run out of memory, use keventd to fill the buffer. There's a report of this happening: Page allocation failures in guest, Message-ID: 20090713115158.0a489...@mjolnir.ossman.eu Signed-off-by: Rusty Russell ru...@rustcorp.com.au --- drivers/net/virtio_net.c | 61 +++ 1 file changed, 46 insertions(+), 15 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -71,6 +71,9 @@ struct virtnet_info struct sk_buff_head recv; struct sk_buff_head send; + /* Work struct for refilling if we run low on memory. */ + struct delayed_work refill; + /* Chain pages by the private ptr. 
*/ struct page *pages; }; @@ -274,19 +277,22 @@ drop: dev_kfree_skb(skb); } -static void try_fill_recv_maxbufs(struct virtnet_info *vi) +static bool try_fill_recv_maxbufs(struct virtnet_info *vi, gfp_t gfp) { struct sk_buff *skb; struct scatterlist sg[2+MAX_SKB_FRAGS]; int num, err, i; + bool oom = false; sg_init_table(sg, 2+MAX_SKB_FRAGS); for (;;) { struct virtio_net_hdr *hdr; skb = netdev_alloc_skb(vi-dev, MAX_PACKET_LEN + NET_IP_ALIGN); - if (unlikely(!skb)) + if (unlikely(!skb)) { + oom = true; break; + } skb_reserve(skb, NET_IP_ALIGN); skb_put(skb, MAX_PACKET_LEN); @@ -297,7 +303,7 @@ static void try_fill_recv_maxbufs(struct if (vi-big_packets) { for (i = 0; i MAX_SKB_FRAGS; i++) { skb_frag_t *f = skb_shinfo(skb)-frags[i]; - f-page = get_a_page(vi, GFP_ATOMIC); + f-page = get_a_page(vi, gfp); if (!f-page) break; @@ -326,31 +332,35 @@ static void try_fill_recv_maxbufs(struct if (unlikely(vi-num vi-max)) vi-max = vi-num; vi-rvq-vq_ops-kick(vi-rvq); + return !oom; } -static void try_fill_recv(struct virtnet_info *vi) +/* Returns false if we couldn't fill entirely (OOM). 
*/ +static bool try_fill_recv(struct virtnet_info *vi, gfp_t gfp) { struct sk_buff *skb; struct scatterlist sg[1]; int err; + bool oom = false; - if (!vi-mergeable_rx_bufs) { - try_fill_recv_maxbufs(vi); - return; - } + if (!vi-mergeable_rx_bufs) + return try_fill_recv_maxbufs(vi, gfp); for (;;) { skb_frag_t *f; skb = netdev_alloc_skb(vi-dev, GOOD_COPY_LEN + NET_IP_ALIGN); - if (unlikely(!skb)) + if (unlikely(!skb)) { + oom = true; break; + } skb_reserve(skb, NET_IP_ALIGN); f = skb_shinfo(skb)-frags[0]; - f-page = get_a_page(vi, GFP_ATOMIC); + f-page = get_a_page(vi, gfp); if (!f-page) { + oom = true; kfree_skb(skb); break; } @@ -374,6 +384,7 @@ static void try_fill_recv(struct virtnet if (unlikely(vi-num vi-max)) vi-max = vi-num; vi-rvq-vq_ops-kick(vi-rvq); + return !oom; } static void skb_recv_done(struct virtqueue *rvq) @@ -386,6 +397,23 @@ static void skb_recv_done(struct virtque } } +static void refill_work(struct work_struct *work) +{ + struct virtnet_info *vi; + bool still_empty; + + vi = container_of(work, struct virtnet_info, refill.work); + napi_disable(vi-napi); + try_fill_recv(vi, GFP_KERNEL); + still_empty = (vi-num == 0); + napi_enable(vi-napi); + + /* In theory, this can happen: if we don't get any buffers in +* we will *never* try to fill again. */ + if (still_empty) + schedule_delayed_work(vi-refill, HZ/2); +} + static int virtnet_poll(struct napi_struct
Re: [PATCH] virtio-blk: set QUEUE_ORDERED_DRAIN by default
On 08/26/2009 03:06 PM, Rusty Russell wrote: On Tue, 25 Aug 2009 11:46:08 pm Christoph Hellwig wrote: On Tue, Aug 25, 2009 at 11:41:37PM +0930, Rusty Russell wrote: On Fri, 21 Aug 2009 06:26:16 am Christoph Hellwig wrote: Currently virtio-blk doesn't set any QUEUE_ORDERED_ flag by default, which means it does not allow filesystems to use barriers. But the typical use case for virtio-blk is to use a backed that uses synchronous I/O Really? Does qemu open with O_SYNC? I'm definitely no block expert, but this seems strange... Rusty. Qemu can open it various ways, but the only one that is fully safe is O_SYNC (cache=writethrough). (Rusty goes away and reads the qemu man page). By default, if no explicit caching is specified for a qcow2 disk image, cache=writeback will be used. It's now switched to writethrough. In any case, cache=writeback means lie to the guest, we don't care about integrity. Are you claiming qcow2 is unusual? I can believe snapshot is less common, though I use it all the time. You'd normally have to add a feature for something like this. I don't think this is different. Why do we need to add a feature for this? -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH][RFC] Use return value from kvm_set_irq() to re-inject PIT interrupts.
On Mon, Aug 24, 2009 at 09:19:05PM +0300, Gleb Natapov wrote: Current code very fragile and relies on hacks to work. Lets take calling of ack notifiers on pic reset as an example. Why is it needed? To signal the ack notifiers users that, in case of reset with pending IRR, the given interrupt has been acked (its an artificial ack event). But IRR was not acked. The reason it is done is that otherwise the current logic will prevent further interrupt injection. Or will keep the host irq disabled, for the assigned device case (in case you drop the hackish ack notification from pic_reset). I don't think it exists there because of PIT reinjection only, it seems a generic problem for users of ack notifiers (a reset notifier as you mentioned would also do it, and be cleaner). Is there a need to differentiate between actual interrupt ack and reset with pending IRR? At the time this code was written, there was no indication that differentation would be necessary. This is two different things. Ack notifiers should be called when guest acks interrupt. Calling it on reset is wrong (see below). We can add reset notifiers, but we just build yet another infrastructure to support current reinjection scheme. Its not specific to PIT reinjection. Anything that relies on ack notification to perform some action (either reinjection or host irq line enablement or some other use) suffers from the same thing. You might argue that a separate reset notification is more appropriate. It is obviously wrong thing to do from assigned devices POV. Thats not entirely clear to me. So what happens if a guest with PIC assigned device resets with a pending IRR? The host interrupt line will be kept disabled, even though the guest is able to process further interrupts? The host interrupt line will be enabled (assigned device ack notifier does this) without clearing interrupt condition in assigned device (guest hasn't acked irq so how can we be sure it ran device's irq handler?). Host will hang. 
Why ioapic calls mask notifiers but pic doesn't? Because it is not implemented. I see that. Why? Why it was important to implement for ioapic but not for pic? 4780c65904f0fc4e312ee2da9383eacbe04e61ea Do we know what doesn't work now? What you mean? Besides diffstat for the patch shows: 2 files changed, 16 insertions(+), 59 deletions(-) 43 lines less for the same functionality. Looks like clear win to me. Ack notifiers are asynchronous notifications. Using the return value from kvm_set_irq implies that timer emulation is based on a tick generating device on the host side. No notification is needed in the first place. You know immediately if injection fails or not. I don't see why using return value from kvm_set_irq implies that timer emulation is based on a tick generating device on the host side? What can you do with ack notifiers that can't be done without? If you don't have a host timer emulating the guest PIT, to periodically bang on kvm_set_irq, how do you know when to attempt reinjection? You keep calling kvm_set_irq on every guest entry to figure out when reinjection is possible? If we have timer to inject then yes. It is relatively cheap. Most of the time pending count will be zero. Won't work with non-tick-based emulation on the host. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH][RFC] Use return value from kvm_set_irq() to re-inject PIT interrupts.
On Mon, Aug 24, 2009 at 10:01:50PM +0300, Gleb Natapov wrote: On Mon, Aug 24, 2009 at 09:19:05PM +0300, Gleb Natapov wrote: It is obviously wrong thing to do from assigned devices POV. Thats not entirely clear to me. So what happens if a guest with PIC assigned device resets with a pending IRR? The host interrupt line will be kept disabled, even though the guest is able to process further interrupts? The host interrupt line will be enabled (assigned device ack notifier does this) without clearing interrupt condition in assigned device (guest hasn't acked irq so how can we be sure it ran device's irq handler?). Host will hang. Actually, on the second thought, it will not hang. Next time host interrupt handler runs it will disable interrupt once again. Right. And if you don't signal ack notification on reset with pending IRR the host line will be kept disabled. As said on other email, its not specific to PIT reinjection logic. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH][RFC] Use return value from kvm_set_irq() to re-inject PIT interrupts.
On Wed, Aug 26, 2009 at 09:43:48AM -0300, Marcelo Tosatti wrote: On Mon, Aug 24, 2009 at 09:19:05PM +0300, Gleb Natapov wrote: Current code very fragile and relies on hacks to work. Lets take calling of ack notifiers on pic reset as an example. Why is it needed? To signal the ack notifiers users that, in case of reset with pending IRR, the given interrupt has been acked (its an artificial ack event). But IRR was not acked. The reason it is done is that otherwise the current logic will prevent further interrupt injection. Or will keep the host irq disabled, for the assigned device case (in case you drop the hackish ack notification from pic_reset). I don't think it exists there because of PIT reinjection only, it seems a generic problem for users of ack notifiers (a reset notifier as you mentioned would also do it, and be cleaner). Yes, I agree pic reset should be propagated to assigned devices somehow. Is there a need to differentiate between actual interrupt ack and reset with pending IRR? At the time this code was written, there was no indication that differentation would be necessary. This is two different things. Ack notifiers should be called when guest acks interrupt. Calling it on reset is wrong (see below). We can add reset notifiers, but we just build yet another infrastructure to support current reinjection scheme. Its not specific to PIT reinjection. Anything that relies on ack notification to perform some action (either reinjection or host irq line enablement or some other use) suffers from the same thing. You might argue that a separate reset notification is more appropriate. It is obviously wrong thing to do from assigned devices POV. Thats not entirely clear to me. So what happens if a guest with PIC assigned device resets with a pending IRR? The host interrupt line will be kept disabled, even though the guest is able to process further interrupts? 
The host interrupt line will be enabled (assigned device ack notifier does this) without clearing interrupt condition in assigned device (guest hasn't acked irq so how can we be sure it ran device's irq handler?). Host will hang. Why ioapic calls mask notifiers but pic doesn't? Because it is not implemented. I see that. Why? Why it was important to implement for ioapic but not for pic? 4780c65904f0fc4e312ee2da9383eacbe04e61ea This commit and previous one adds infrastructure to fix a bug that is there only because how we choose to do pit reinjection. Do it differently and you can revert both of them. Do we know what doesn't work now? What you mean? I mean that pit doesn't call mask notifier so similar bug to 4780c65 hides somewhere out there. How can we test it? Besides diffstat for the patch shows: 2 files changed, 16 insertions(+), 59 deletions(-) 43 lines less for the same functionality. Looks like clear win to me. Ack notifiers are asynchronous notifications. Using the return value from kvm_set_irq implies that timer emulation is based on a tick generating device on the host side. No notification is needed in the first place. You know immediately if injection fails or not. I don't see why using return value from kvm_set_irq implies that timer emulation is based on a tick generating device on the host side? What can you do with ack notifiers that can't be done without? If you don't have a host timer emulating the guest PIT, to periodically bang on kvm_set_irq, how do you know when to attempt reinjection? You keep calling kvm_set_irq on every guest entry to figure out when reinjection is possible? If we have timer to inject then yes. It is relatively cheap. Most of the time pending count will be zero. Won't work with non-tick-based emulation on the host. Why? This is the most important point, can you elaborate? -- Gleb. 
-- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[KVM-AUTOTEST PATCH] kvm: specify nic_model explicitly for rtl8139
Instead of relying on the default nic_model, specify it explicitly. Different qemu branches use different defaults, and the default may change. Signed-off-by: Avi Kivity a...@redhat.com --- client/tests/kvm/kvm_tests.cfg.sample |1 + 1 files changed, 1 insertions(+), 0 deletions(-) diff --git a/client/tests/kvm/kvm_tests.cfg.sample b/client/tests/kvm/kvm_tests.cfg.sample index a83ef9b..aa6162f 100644 --- a/client/tests/kvm/kvm_tests.cfg.sample +++ b/client/tests/kvm/kvm_tests.cfg.sample @@ -145,6 +145,7 @@ variants: # NICs variants: - @rtl8139: +nic_model = rtl8139 - virtio: rtl8139.install rtl8139.setup no install setup nic_model = virtio -- 1.6.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH][RFC] Use return value from kvm_set_irq() to re-inject PIT interrupts.
On Wed, Aug 26, 2009 at 04:19:17PM +0300, Gleb Natapov wrote: On Wed, Aug 26, 2009 at 09:43:48AM -0300, Marcelo Tosatti wrote: On Mon, Aug 24, 2009 at 09:19:05PM +0300, Gleb Natapov wrote: Current code very fragile and relies on hacks to work. Lets take calling of ack notifiers on pic reset as an example. Why is it needed? To signal the ack notifiers users that, in case of reset with pending IRR, the given interrupt has been acked (its an artificial ack event). But IRR was not acked. The reason it is done is that otherwise the current logic will prevent further interrupt injection. Or will keep the host irq disabled, for the assigned device case (in case you drop the hackish ack notification from pic_reset). I don't think it exists there because of PIT reinjection only, it seems a generic problem for users of ack notifiers (a reset notifier as you mentioned would also do it, and be cleaner). Yes, I agree pic reset should be propagated to assigned devices somehow. Is there a need to differentiate between actual interrupt ack and reset with pending IRR? At the time this code was written, there was no indication that differentation would be necessary. This is two different things. Ack notifiers should be called when guest acks interrupt. Calling it on reset is wrong (see below). We can add reset notifiers, but we just build yet another infrastructure to support current reinjection scheme. Its not specific to PIT reinjection. Anything that relies on ack notification to perform some action (either reinjection or host irq line enablement or some other use) suffers from the same thing. You might argue that a separate reset notification is more appropriate. It is obviously wrong thing to do from assigned devices POV. Thats not entirely clear to me. So what happens if a guest with PIC assigned device resets with a pending IRR? The host interrupt line will be kept disabled, even though the guest is able to process further interrupts? 
The host interrupt line will be enabled (assigned device ack notifier does this) without clearing interrupt condition in assigned device (guest hasn't acked irq so how can we be sure it ran device's irq handler?). Host will hang. Why ioapic calls mask notifiers but pic doesn't? Because it is not implemented. I see that. Why? Why it was important to implement for ioapic but not for pic? 4780c65904f0fc4e312ee2da9383eacbe04e61ea This commit and previous one adds infrastructure to fix a bug that is there only because how we choose to do pit reinjection. Do it differently and you can revert both of them. Do we know what doesn't work now? What you mean? I mean that pit doesn't call mask notifier so similar bug to 4780c65 hides somewhere out there. How can we test it? Besides diffstat for the patch shows: 2 files changed, 16 insertions(+), 59 deletions(-) 43 lines less for the same functionality. Looks like clear win to me. Ack notifiers are asynchronous notifications. Using the return value from kvm_set_irq implies that timer emulation is based on a tick generating device on the host side. No notification is needed in the first place. You know immediately if injection fails or not. I don't see why using return value from kvm_set_irq implies that timer emulation is based on a tick generating device on the host side? What can you do with ack notifiers that can't be done without? If you don't have a host timer emulating the guest PIT, to periodically bang on kvm_set_irq, how do you know when to attempt reinjection? You keep calling kvm_set_irq on every guest entry to figure out when reinjection is possible? If we have timer to inject then yes. It is relatively cheap. Most of the time pending count will be zero. Won't work with non-tick-based emulation on the host. Why? This is the most important point, can you elaborate? From http://www.mail-archive.com/kvm@vger.kernel.org/msg18644.html. 
An injectable timer interrupt is defined by: - time(now) >= time(next_expiration) - Previous timer interrupt has been acked (thus we can inject). The thing is, sure you can drop ack notifiers and check IRR on every guest entry, but why bother if you can receive an asynchronous notification? Would you prefer to replace + if (!ktimer->can_inject) With kvm_set_irq() ? Not relatively cheap. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH][RFC] Use return value from kvm_set_irq() to re-inject PIT interrupts.
On 08/26/2009 04:38 PM, Marcelo Tosatti wrote: An injectable timer interrupt is defined by: - time(now) >= time(next_expiration) - Previous timer interrupt has been acked (thus we can inject). The thing is, sure you can drop ack notifiers and check IRR on every guest entry, but why bother if you can receive an asynchronous notification? Would you prefer to replace + if (!ktimer->can_inject) With kvm_set_irq() ? Not relatively cheap. Well, we expect it to be a rare condition that we have pending timer interrupts, so if it leads to significant code simplification, it can be worthwhile. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH][RFC] Use return value from kvm_set_irq() to re-inject PIT interrupts.
On Wed, Aug 26, 2009 at 04:19:17PM +0300, Gleb Natapov wrote: On Wed, Aug 26, 2009 at 09:43:48AM -0300, Marcelo Tosatti wrote: On Mon, Aug 24, 2009 at 09:19:05PM +0300, Gleb Natapov wrote: Current code very fragile and relies on hacks to work. Lets take calling of ack notifiers on pic reset as an example. Why is it needed? To signal the ack notifiers users that, in case of reset with pending IRR, the given interrupt has been acked (its an artificial ack event). But IRR was not acked. The reason it is done is that otherwise the current logic will prevent further interrupt injection. Or will keep the host irq disabled, for the assigned device case (in case you drop the hackish ack notification from pic_reset). I don't think it exists there because of PIT reinjection only, it seems a generic problem for users of ack notifiers (a reset notifier as you mentioned would also do it, and be cleaner). Yes, I agree pic reset should be propagated to assigned devices somehow. Is there a need to differentiate between actual interrupt ack and reset with pending IRR? At the time this code was written, there was no indication that differentation would be necessary. This is two different things. Ack notifiers should be called when guest acks interrupt. Calling it on reset is wrong (see below). We can add reset notifiers, but we just build yet another infrastructure to support current reinjection scheme. Its not specific to PIT reinjection. Anything that relies on ack notification to perform some action (either reinjection or host irq line enablement or some other use) suffers from the same thing. You might argue that a separate reset notification is more appropriate. It is obviously wrong thing to do from assigned devices POV. Thats not entirely clear to me. So what happens if a guest with PIC assigned device resets with a pending IRR? The host interrupt line will be kept disabled, even though the guest is able to process further interrupts? 
The host interrupt line will be enabled (assigned device ack notifier does this) without clearing interrupt condition in assigned device (guest hasn't acked irq so how can we be sure it ran device's irq handler?). Host will hang. Why ioapic calls mask notifiers but pic doesn't? Because it is not implemented. I see that. Why? Why it was important to implement for ioapic but not for pic? 4780c65904f0fc4e312ee2da9383eacbe04e61ea This commit and previous one adds infrastructure to fix a bug that is there only because how we choose to do pit reinjection. Do it differently and you can revert both of them. Do we know what doesn't work now? What you mean? I mean that pit doesn't call mask notifier so similar bug to 4780c65 hides somewhere out there. How can we test it? Program periodic PIT, mask irq0, wait a while, unmask irq0 ? -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH][RFC] Use return value from kvm_set_irq() to re-inject PIT interrupts.
On Wed, Aug 26, 2009 at 10:38:42AM -0300, Marcelo Tosatti wrote: If you don't have a host timer emulating the guest PIT, to periodically bang on kvm_set_irq, how do you know when to attempt reinjection? You keep calling kvm_set_irq on every guest entry to figure out when reinjection is possible? If we have timer to inject then yes. It is relatively cheap. Most of the time pending count will be zero. Won't work with non-tick-based emulation on the host. Why? This is the most important point, can you elaborate? From http://www.mail-archive.com/kvm@vger.kernel.org/msg18644.html. An injectable timer interrupt is defined by: - time(now) >= time(next_expiration) - Previous timer interrupt has been acked (thus we can inject). The thing is, sure you can drop ack notifiers and check IRR on every guest entry, but why bother if you can receive an asynchronous notification? Would you prefer to replace + if (!ktimer->can_inject) With kvm_set_irq() ? Not relatively cheap. Most of the time time(now) will be less than time(next_expiration) so on most entries kvm_set_irq() will not be called at all. When interrupt has to be injected I prefer to try to inject it ASAP. PIC and APIC effectively have a 2-element interrupt queue (irr/isr) so injection may succeed even though ack was not yet received. -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCHv4 2/2] vhost_net: a kernel-level virtio server
On Wed, Aug 26, 2009 at 03:40:59PM +0200, Arnd Bergmann wrote: On Tuesday 25 August 2009, Michael S. Tsirkin wrote: I'd like to avoid that here, though it's kind of ugly. We'd need VHOST_GET_FEATURES (and ACK) to take a struct like: u32 feature_size; u32 features[]; Hmm, variable length ioctl arguments, I'd rather not go there. The ioctl infrastructure already has a length argument encoded in the ioctl number. We can use that if we need more, e.g. /* now */ #define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64) /* * uncomment if we run out of feature bits: struct vhost_get_features2 { __u64 bits[2]; }; #define VHOST_GET_FEATURES2 _IOR(VHOST_VIRTIO, 0x00, \ struct vhost_get_features2) */ Thinking about this proposal some more, how will the guest determine the size to supply the GET_FEATURES ioctl? Wait, the *guest*? Sorry, the userspace hypervisor. Maybe I misunderstood something in a major way here, but I expected the features to be negotiated between host user space (qemu) and host kernel, as well as between guest and qemu (as they are already), but never between guest and kernel. Yes. I would certainly expect the bits to be distinct from the virtio-net feature bits. E.g. stuff like TAP frame format opposed to TCP socket frame format (length+data) is something we need to negotiate here but that the guest does not care about. My idea is to use virtio format for things I share with virtio (e.g. mergeable buffers). Since we are a kind of transport, I thought that I will use the transport bits, that is bits 28 and up for vhost things. Since we are a bit tight in 32 bit space already, let's just use a 64 bit integer and be done with it? Can't hurt, but don't use a struct unless you think we are going to need more than 64 bits. Arnd -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 05/47] KVM: VMX: Optimize vmx_get_cpl()
Op 26-08-09 12:29, Avi Kivity schreef: Instead of calling vmx_get_segment() (which reads a whole bunch of vmcs fields), read only the cs selector which contains the cpl. Signed-off-by: Avi Kivity a...@redhat.com Can't we also optimise cs_ss_rpl_check()? (Please review, untested.) - 8 -- 8 Instead of calling vmx_get_segment() (which reads a whole bunch of vmcs fields), read only the cs/ss selectors which contains the rpls. Signed-off-by: Roel Kluin roel.kl...@gmail.com --- diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 29f9129..5d8512a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1932,13 +1932,8 @@ static bool ldtr_valid(struct kvm_vcpu *vcpu) static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) { - struct kvm_segment cs, ss; - - vmx_get_segment(vcpu, cs, VCPU_SREG_CS); - vmx_get_segment(vcpu, ss, VCPU_SREG_SS); - - return ((cs.selector SELECTOR_RPL_MASK) == -(ss.selector SELECTOR_RPL_MASK)); + return ((vmcs_read16(GUEST_CS_SELECTOR) SELECTOR_RPL_MASK) == +(vmcs_read16(GUEST_SS_SELECTOR) SELECTOR_RPL_MASK)); } /* -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 05/47] KVM: VMX: Optimize vmx_get_cpl()
On 08/26/2009 05:15 PM, Roel Kluin wrote: Op 26-08-09 12:29, Avi Kivity schreef: Instead of calling vmx_get_segment() (which reads a whole bunch of vmcs fields), read only the cs selector which contains the cpl. Signed-off-by: Avi Kivitya...@redhat.com Can't we also optimise cs_ss_rpl_check()? (Please review, untested.) -8 -- 8 Instead of calling vmx_get_segment() (which reads a whole bunch of vmcs fields), read only the cs/ss selectors which contains the rpls. It's really a slowpath, so I prefer not to touch it. We're likely to start caching guest segment fields soon, so the less code that reads them directly, the better. Signed-off-by: Roel Kluinroel.kl...@gmail.com --- diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 29f9129..5d8512a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1932,13 +1932,8 @@ static bool ldtr_valid(struct kvm_vcpu *vcpu) static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) { - struct kvm_segment cs, ss; - - vmx_get_segment(vcpu,cs, VCPU_SREG_CS); - vmx_get_segment(vcpu,ss, VCPU_SREG_SS); - - return ((cs.selector SELECTOR_RPL_MASK) == -(ss.selector SELECTOR_RPL_MASK)); + return ((vmcs_read16(GUEST_CS_SELECTOR) SELECTOR_RPL_MASK) == +(vmcs_read16(GUEST_SS_SELECTOR) SELECTOR_RPL_MASK)); } /* -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCHv4 2/2] vhost_net: a kernel-level virtio server
On Tuesday 25 August 2009, Michael S. Tsirkin wrote: I'd like to avoid that here, though it's kind of ugly. We'd need VHOST_GET_FEATURES (and ACK) to take a struct like: u32 feature_size; u32 features[]; Hmm, variable length ioctl arguments, I'd rather not go there. The ioctl infrastructure already has a length argument encoded in the ioctl number. We can use that if we need more, e.g. /* now */ #define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64) /* * uncomment if we run out of feature bits: struct vhost_get_features2 { __u64 bits[2]; }; #define VHOST_GET_FEATURES2 _IOR(VHOST_VIRTIO, 0x00, \ struct vhost_get_features2) */ Thinking about this proposal some more, how will the guest determine the size to supply the GET_FEATURES ioctl? Wait, the *guest*? Maybe I misunderstood something in a major way here, but I expected the features to be negotiated between host user space (qemu) and host kernel, as well as between guest and qemu (as they are already), but never between guest and kernel. I would certainly expect the bits to be distinct from the virtio-net feature bits. E.g. stuff like TAP frame format opposed to TCP socket frame format (length+data) is something we need to negotiate here but that the guest does not care about. Since we are a bit tight in 32 bit space already, let's just use a 64 bit integer and be done with it? Can't hurt, but don't use a struct unless you think we are going to need more than 64 bits. Arnd -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Performace data when running Windows VMs
I recently gathered some performance data when running Windows Server 2008 VMs, and I wanted to share it here. There are 12 Windows Server 2008 64-bit VMs (1 vcpu, 2 GB) running which handle the concurrent execution of 6 J2EE type benchmarks. Each benchmark needs an App VM and a Database VM. The benchmark clients inject a fixed rate of requests which yields X% CPU utilization on the host. A different hypervisor was compared; KVM used about 60% more CPU cycles to complete the same amount of work. Both had their hypervisor specific paravirt IO drivers in the VMs. Server is a 2 socket Core/i7, SMT off, with 72 GB memory Host kernel used was kvm.git v2.6.31-rc3-3419-g6df4865 Qemu was kvm-87. I tried a few newer versions of Qemu; none of them worked with the RedHat virtIO Windows drivers. I tried: f3600c589a9ee5ea4c0fec74ed4e06a15b461d52 0.11.0-rc1 0.10.6 kvm-88 All but 0.10.6 had Problem code 10 driver error in the VM. 0.10.6 had a disk read error occurred very early in the booting of the VM. I/O on the host was not what I would call very high: outbound network averaged at 163 Mbit/s inbound was 8 Mbit/s, while disk read ops was 243/sec and write ops was 561/sec Host CPU breakdown was the following: user nice system irq softirq guest idle iowait 5.67 0.00 11.64 0.09 1.05 31.90 46.06 3.59 The amount of kernel time had me concerned.
Here is oprofile: samples %app name symbol name 1163422 52.3744 kvm-intel.ko vmx_vcpu_run 1039964.6816 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 native_set_debugreg 81036 3.6480 kvm.ko kvm_arch_vcpu_ioctl_run 37913 1.7068 qemu-system-x86_64 cpu_physical_memory_rw 34720 1.5630 qemu-system-x86_64 phys_page_find_alloc 23234 1.0459 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 native_write_msr_safe 20964 0.9437 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 native_get_debugreg 17628 0.7936 libc-2.5.so memcpy 16587 0.7467 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 __down_read 15681 0.7059 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 __up_read 15466 0.6962 kvm.ko find_highest_vector 14611 0.6578 qemu-system-x86_64 qemu_get_ram_ptr 11254 0.5066 kvm-intel.ko vmcs_writel 11133 0.5012 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 copy_user_generic_string 10917 0.4915 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 native_read_msr_safe 10760 0.4844 qemu-system-x86_64 virtqueue_get_head 9025 0.4063 kvm-intel.ko vmx_handle_exit 8953 0.4030 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 schedule 8753 0.3940 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 fget_light 8465 0.3811 qemu-system-x86_64 virtqueue_avail_bytes 8185 0.3685 kvm-intel.ko handle_cr 8069 0.3632 kvm.ko kvm_set_irq 7697 0.3465 kvm.ko kvm_lapic_sync_from_vapic 7586 0.3415 qemu-system-x86_64 main_loop_wait 7480 0.3367 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 do_select 7121 0.3206 qemu-system-x86_64 lduw_phys 7003 0.3153 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 audit_syscall_exit 6062 0.2729 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 kfree 5477 0.2466 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 fput 5454 0.2455 kvm.ko kvm_lapic_get_cr8 5096 0.2294 kvm.ko kvm_load_guest_fpu 5057 0.2277 kvm.ko apic_update_ppr 4929 0.2219 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 up_read 
4900 0.2206 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 audit_syscall_entry 4866 0.2191 kvm.ko kvm_apic_has_interrupt 4670 0.2102 kvm-intel.ko skip_emulated_instruction 4644 0.2091 kvm.ko kvm_cpu_has_interrupt 4548 0.2047 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 __switch_to 4328 0.1948 kvm.ko kvm_apic_accept_pic_intr 4303 0.1937 libpthread-2.5.sopthread_mutex_lock 4235 0.1906 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 system_call 4175 0.1879 kvm.ko kvm_put_guest_fpu 4170 0.1877 qemu-system-x86_64 ldl_phys 4098 0.1845 kvm-intel.ko vmx_set_interrupt_shadow 4003 0.1802 qemu-system-x86_64 kvm_run I was wondering why the get/set debugreg was so high. I don't recall seeing this much with Linux VMs. Here is an average of kvm_stat: efer_relo 0 exits 1262814 fpu_reloa 103842 halt_exit 9918 halt_wake 9763 host_stat 103846 hypercall 0 insn_emul 23277 insn_emul
Re: Performance data when running Windows VMs
On 08/26/2009 05:57 PM, Andrew Theurer wrote: I recently gathered some performance data when running Windows Server 2008 VMs, and I wanted to share it here. There are 12 Windows Server2008 64-bit VMs (1 vcpu, 2 GB) running which handle the concurrent execution of 6 J2EE type benchmarks. Each benchmark needs a App VM and a Database VM. The benchmark clients inject a fixed rate of requests which yields X% CPU utilization on the host. A different hypervisor was compared; KVM used about 60% more CPU cycles to complete the same amount of work. Both had their hypervisor specific paravirt IO drivers in the VMs. Server is a 2 socket Core/i7, SMT off, with 72 GB memory Did you use large pages? Host kernel used was kvm.git v2.6.31-rc3-3419-g6df4865 Qemu was kvm-87. I tried a few newer versions of Qemu; none of them worked with the RedHat virtIO Windows drivers. I tried: f3600c589a9ee5ea4c0fec74ed4e06a15b461d52 0.11.0-rc1 0.10.6 kvm-88 All but 0.10.6 had Problem code 10 driver error in the VM. 0.10.6 had a disk read error occurred very early in the booting of the VM. Yan? I/O on the host was not what I would call very high: outbound network averaged at 163 Mbit/s inbound was 8 Mbit/s, while disk read ops was 243/sec and write ops was 561/sec What was the disk bandwidth used? Presumably, direct access to the volume with cache=off? linux-aio should help reduce cpu usage. Host CPU breakdown was the following: user nice system irq softirq guest idle iowait 5.67 0.00 11.64 0.09 1.0531.90 46.06 3.59 The amount of kernel time had me concerned. Here is oprofile: user+system is about 55% of guest time, and it's all overhead. samples %app name symbol name 1163422 52.3744 kvm-intel.ko vmx_vcpu_run 1039964.6816 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 native_set_debugreg 81036 3.6480 kvm.ko kvm_arch_vcpu_ioctl_run 37913 1.7068 qemu-system-x86_64 cpu_physical_memory_rw 34720 1.5630 qemu-system-x86_64 phys_page_find_alloc We should really optimize these two. 
23234 1.0459 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 native_write_msr_safe 20964 0.9437 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 native_get_debugreg 17628 0.7936 libc-2.5.so memcpy 16587 0.7467 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 __down_read 15681 0.7059 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 __up_read 15466 0.6962 kvm.ko find_highest_vector 14611 0.6578 qemu-system-x86_64 qemu_get_ram_ptr 11254 0.5066 kvm-intel.ko vmcs_writel 11133 0.5012 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 copy_user_generic_string 10917 0.4915 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 native_read_msr_safe 10760 0.4844 qemu-system-x86_64 virtqueue_get_head 9025 0.4063 kvm-intel.ko vmx_handle_exit 8953 0.4030 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 schedule 8753 0.3940 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 fget_light 8465 0.3811 qemu-system-x86_64 virtqueue_avail_bytes 8185 0.3685 kvm-intel.ko handle_cr 8069 0.3632 kvm.ko kvm_set_irq 7697 0.3465 kvm.ko kvm_lapic_sync_from_vapic 7586 0.3415 qemu-system-x86_64 main_loop_wait 7480 0.3367 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 do_select 7121 0.3206 qemu-system-x86_64 lduw_phys 7003 0.3153 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 audit_syscall_exit 6062 0.2729 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 kfree 5477 0.2466 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 fput 5454 0.2455 kvm.ko kvm_lapic_get_cr8 5096 0.2294 kvm.ko kvm_load_guest_fpu 5057 0.2277 kvm.ko apic_update_ppr 4929 0.2219 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 up_read 4900 0.2206 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 audit_syscall_entry 4866 0.2191 kvm.ko kvm_apic_has_interrupt 4670 0.2102 kvm-intel.ko skip_emulated_instruction 4644 0.2091 kvm.ko kvm_cpu_has_interrupt 4548 0.2047 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 __switch_to 4328 0.1948 kvm.ko 
kvm_apic_accept_pic_intr 4303 0.1937 libpthread-2.5.sopthread_mutex_lock 4235 0.1906 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 system_call 4175 0.1879 kvm.ko kvm_put_guest_fpu 4170 0.1877 qemu-system-x86_64 ldl_phys 4098 0.1845 kvm-intel.ko vmx_set_interrupt_shadow 4003 0.1802 qemu-system-x86_64 kvm_run
Re: Extending virtio_console to support multiple ports
[cc'ing some people who have made some commits in hvc_console.c] On (Wed) Aug 26 2009 [16:57:18], Amit Shah wrote: On (Tue) Aug 25 2009 [11:47:20], Amit Shah wrote: Hello all, Here is a new iteration of the patch series that implements a transport for guest and host communications. The code has been updated to reuse the virtio-console device instead of creating a new virtio-serial device. And the problem now is that hvc calls the put_chars function with spinlocks held and we now allocate pages in send_buf(), called from put_chars. A few solutions: [snip] - Convert hvc's usage of spinlocks to mutexes. I've no idea how this will play out; I'm no expert here. But I did try doing this and so far it all looks OK. No lockups, lockdep warnings, nothing. I have full debugging enabled. But this doesn't mean it's right. So just to test this further I added the capability to have more than one hvc console spawn from virtio_console, created two consoles and did a 'cat' of a file in each of the virtio-consoles. It's been running for half an hour now without any badness. No spew in debug logs too. I also checked the code in hvc_console.c that takes the spin_locks. Nothing there that runs from (or needs to run from) interrupt context. So the change to mutexes does seem reasonable. Also, the spinlock code was added really long back -- git blame shows Linus' first git commit introduced them in the git history, so it's pure legacy baggage. Also found a bug: hvc_resize() wants to be called with a lock held (hp-lock) but virtio_console just calls it directly. Anyway I'm wondering whether all those locks are needed. 
Amit diff --git a/drivers/char/hvc_console.c b/drivers/char/hvc_console.c index d97779e..51078a3 100644 --- a/drivers/char/hvc_console.c +++ b/drivers/char/hvc_console.c @@ -35,7 +35,7 @@ #include linux/tty.h #include linux/tty_flip.h #include linux/sched.h -#include linux/spinlock.h +#include linux/mutex.h #include linux/delay.h #include linux/freezer.h @@ -81,7 +81,7 @@ static LIST_HEAD(hvc_structs); * Protect the list of hvc_struct instances from inserts and removals during * list traversal. */ -static DEFINE_SPINLOCK(hvc_structs_lock); +static DEFINE_MUTEX(hvc_structs_lock); /* * This value is used to assign a tty-index value to a hvc_struct based @@ -98,23 +98,22 @@ static int last_hvc = -1; static struct hvc_struct *hvc_get_by_index(int index) { struct hvc_struct *hp; - unsigned long flags; - spin_lock(hvc_structs_lock); + mutex_lock(hvc_structs_lock); list_for_each_entry(hp, hvc_structs, next) { - spin_lock_irqsave(hp-lock, flags); + mutex_lock(hp-lock); if (hp-index == index) { kref_get(hp-kref); - spin_unlock_irqrestore(hp-lock, flags); - spin_unlock(hvc_structs_lock); + mutex_unlock(hp-lock); + mutex_unlock(hvc_structs_lock); return hp; } - spin_unlock_irqrestore(hp-lock, flags); + mutex_unlock(hp-lock); } hp = NULL; - spin_unlock(hvc_structs_lock); + mutex_unlock(hvc_structs_lock); return hp; } @@ -228,15 +227,14 @@ console_initcall(hvc_console_init); static void destroy_hvc_struct(struct kref *kref) { struct hvc_struct *hp = container_of(kref, struct hvc_struct, kref); - unsigned long flags; - spin_lock(hvc_structs_lock); + mutex_lock(hvc_structs_lock); - spin_lock_irqsave(hp-lock, flags); + mutex_lock(hp-lock); list_del((hp-next)); - spin_unlock_irqrestore(hp-lock, flags); + mutex_unlock(hp-lock); - spin_unlock(hvc_structs_lock); + mutex_unlock(hvc_structs_lock); kfree(hp); } @@ -302,17 +300,16 @@ static void hvc_unthrottle(struct tty_struct *tty) static int hvc_open(struct tty_struct *tty, struct file * filp) { struct hvc_struct *hp; - unsigned long 
flags; int rc = 0; /* Auto increments kref reference if found. */ if (!(hp = hvc_get_by_index(tty-index))) return -ENODEV; - spin_lock_irqsave(hp-lock, flags); + mutex_lock(hp-lock); /* Check and then increment for fast path open. */ if (hp-count++ 0) { - spin_unlock_irqrestore(hp-lock, flags); + mutex_unlock(hp-lock); hvc_kick(); return 0; } /* else count == 0 */ @@ -321,7 +318,7 @@ static int hvc_open(struct tty_struct *tty, struct file * filp) hp-tty = tty; - spin_unlock_irqrestore(hp-lock, flags); + mutex_unlock(hp-lock); if (hp-ops-notifier_add) rc = hp-ops-notifier_add(hp, hp-data); @@ -333,9 +330,9 @@ static int hvc_open(struct tty_struct *tty, struct file * filp) *
Re: Performance data when running Windows VMs
On Wed, 2009-08-26 at 18:44 +0300, Avi Kivity wrote: On 08/26/2009 05:57 PM, Andrew Theurer wrote: I recently gathered some performance data when running Windows Server 2008 VMs, and I wanted to share it here. There are 12 Windows Server2008 64-bit VMs (1 vcpu, 2 GB) running which handle the concurrent execution of 6 J2EE type benchmarks. Each benchmark needs a App VM and a Database VM. The benchmark clients inject a fixed rate of requests which yields X% CPU utilization on the host. A different hypervisor was compared; KVM used about 60% more CPU cycles to complete the same amount of work. Both had their hypervisor specific paravirt IO drivers in the VMs. Server is a 2 socket Core/i7, SMT off, with 72 GB memory Did you use large pages? Yes. Host kernel used was kvm.git v2.6.31-rc3-3419-g6df4865 Qemu was kvm-87. I tried a few newer versions of Qemu; none of them worked with the RedHat virtIO Windows drivers. I tried: f3600c589a9ee5ea4c0fec74ed4e06a15b461d52 0.11.0-rc1 0.10.6 kvm-88 All but 0.10.6 had Problem code 10 driver error in the VM. 0.10.6 had a disk read error occurred very early in the booting of the VM. Yan? I/O on the host was not what I would call very high: outbound network averaged at 163 Mbit/s inbound was 8 Mbit/s, while disk read ops was 243/sec and write ops was 561/sec What was the disk bandwidth used? Presumably, direct access to the volume with cache=off? 2.4 MB/sec write, 0.6MB/sec read, cache=none The VMs' boot disks are IDE, but apps use their second disk which is virtio. linux-aio should help reduce cpu usage. I assume this is in a newer version of Qemu? Host CPU breakdown was the following: user nice system irq softirq guest idle iowait 5.67 0.00 11.64 0.09 1.0531.90 46.06 3.59 The amount of kernel time had me concerned. Here is oprofile: user+system is about 55% of guest time, and it's all overhead. 
samples %app name symbol name 1163422 52.3744 kvm-intel.ko vmx_vcpu_run 1039964.6816 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 native_set_debugreg 81036 3.6480 kvm.ko kvm_arch_vcpu_ioctl_run 37913 1.7068 qemu-system-x86_64 cpu_physical_memory_rw 34720 1.5630 qemu-system-x86_64 phys_page_find_alloc We should really optimize these two. 23234 1.0459 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 native_write_msr_safe 20964 0.9437 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 native_get_debugreg 17628 0.7936 libc-2.5.so memcpy 16587 0.7467 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 __down_read 15681 0.7059 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 __up_read 15466 0.6962 kvm.ko find_highest_vector 14611 0.6578 qemu-system-x86_64 qemu_get_ram_ptr 11254 0.5066 kvm-intel.ko vmcs_writel 11133 0.5012 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 copy_user_generic_string 10917 0.4915 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 native_read_msr_safe 10760 0.4844 qemu-system-x86_64 virtqueue_get_head 9025 0.4063 kvm-intel.ko vmx_handle_exit 8953 0.4030 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 schedule 8753 0.3940 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 fget_light 8465 0.3811 qemu-system-x86_64 virtqueue_avail_bytes 8185 0.3685 kvm-intel.ko handle_cr 8069 0.3632 kvm.ko kvm_set_irq 7697 0.3465 kvm.ko kvm_lapic_sync_from_vapic 7586 0.3415 qemu-system-x86_64 main_loop_wait 7480 0.3367 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 do_select 7121 0.3206 qemu-system-x86_64 lduw_phys 7003 0.3153 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 audit_syscall_exit 6062 0.2729 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 kfree 5477 0.2466 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 fput 5454 0.2455 kvm.ko kvm_lapic_get_cr8 5096 0.2294 kvm.ko kvm_load_guest_fpu 5057 0.2277 kvm.ko apic_update_ppr 4929 0.2219 
vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 up_read 4900 0.2206 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 audit_syscall_entry 4866 0.2191 kvm.ko kvm_apic_has_interrupt 4670 0.2102 kvm-intel.ko skip_emulated_instruction 4644 0.2091 kvm.ko kvm_cpu_has_interrupt 4548 0.2047 vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 __switch_to 4328 0.1948 kvm.ko
Re: Performance data when running Windows VMs
On 08/26/2009 07:14 PM, Andrew Theurer wrote: On Wed, 2009-08-26 at 18:44 +0300, Avi Kivity wrote: On 08/26/2009 05:57 PM, Andrew Theurer wrote: I recently gathered some performance data when running Windows Server 2008 VMs, and I wanted to share it here. There are 12 Windows Server2008 64-bit VMs (1 vcpu, 2 GB) running which handle the concurrent execution of 6 J2EE type benchmarks. Each benchmark needs a App VM and a Database VM. The benchmark clients inject a fixed rate of requests which yields X% CPU utilization on the host. A different hypervisor was compared; KVM used about 60% more CPU cycles to complete the same amount of work. Both had their hypervisor specific paravirt IO drivers in the VMs. Server is a 2 socket Core/i7, SMT off, with 72 GB memory Did you use large pages? Yes. The stats show 'largepage = 12'. Something's wrong. There's a commit (7736d680) that's supposed to fix largepage support for kvm-87, maybe it's incomplete. I/O on the host was not what I would call very high: outbound network averaged at 163 Mbit/s inbound was 8 Mbit/s, while disk read ops was 243/sec and write ops was 561/sec What was the disk bandwidth used? Presumably, direct access to the volume with cache=off? 2.4 MB/sec write, 0.6MB/sec read, cache=none The VMs' boot disks are IDE, but apps use their second disk which is virtio. Chickenfeed. Do the network stats include interguest traffic? I presume *all* of the traffic was interguest. linux-aio should help reduce cpu usage. I assume this is in a newer version of Qemu? No, posted and awaiting merge. Could it be that Windows uses the debug registers? Maybe we're incorrectly deciding to switch them. I was wondering about that. I was thinking of just backing out the support for debugregs and see what happens. Did the up/down_read seem kind of high? Are we doing a lock of locking? It is. We do. Marcelo made some threats to remove this lock. 
-- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Performance data when running Windows VMs
On Wednesday 26 August 2009 11:14:57 am Andrew Theurer wrote: snip I/O on the host was not what I would call very high: outbound network averaged at 163 Mbit/s inbound was 8 Mbit/s, while disk read ops was 243/sec and write ops was 561/sec What was the disk bandwidth used? Presumably, direct access to the volume with cache=off? 2.4 MB/sec write, 0.6MB/sec read, cache=none The VMs' boot disks are IDE, but apps use their second disk which is virtio. In my testing, I got better performance from IDE than the new virtio block driver for windows. There appears to be some optimization left to do on them. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCHv4 2/2] vhost_net: a kernel-level virtio server
On Tue, Aug 25, 2009 at 04:16:34PM +0300, Michael S. Tsirkin wrote: + /* If they don't want an interrupt, don't send one, unless empty. */ + if ((flags VRING_AVAIL_F_NO_INTERRUPT) vq-inflight) + return; And I wouldn't support notify on empty at all, TBH. If I don't, virtio net in guest uses a timer, which might be expensive. Will need to check what this does. It should definitely be conditional on the guest accepting the NOTIFY_ON_EMPTY feature. lguest does not do it this way though, does it? -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/2] eventfd: new EFD_STATE flag
On Wed, 26 Aug 2009, Michael S. Tsirkin wrote: On Tue, Aug 25, 2009 at 02:57:01PM -0700, Davide Libenzi wrote: On Tue, 25 Aug 2009, Michael S. Tsirkin wrote: Yes, we don't want that. The best thing is to try to restate the problem in a way that is generic, and then either solve or best use existing solution. Right? I thought I had that, but apparently not. The reason I'm Cc-ing you is not to try and spam you until you give up and accept the patch, it's hoping that you see the pattern behind our usage, and help generalize it. If I understand it correctly, you believe this is not possible and so any solution will have to be in KVM? Or maybe I didn't state the problem clearly enough and should restate it? Please do. - Davide Problem looks like this: There are multiple processes (devices) where each has a condition (interrupt line) which it has logic to determine is either true or false. A single other process (hypervisor) is interested in a condition (interrupt level) which is a logical OR of all interrupt lines. On changes, an interrupt level value needs to be read and copied to guest virtual cpu. We also want ability to replace some or all processes above by a kernel components, with condition changes done potentially from hardware interrupt context. How we wanted to solve it with EFD_STATE: Share a separate eventfd between each device and the hypervisor. device sets state to either 0 or 1. hypervisor polls all eventfds, reads interrupt line on changes, calculates the interrupt level and updates guest. Alternative solution: shared memory where each device writes interrupt line value. This makes setup more complex (need to share around much more than just an fd), and makes access from interrupt impossible unless we lock the memory (and locking userspace memory introduces yet another set of issues). OK, if I get it correctly, there is one eventfd signaler (the device), and one eventfd reader (the hypervisor), right? 
Each hypervisor listens for multiple devices detecting state changes, and associating the eventfd line to the IRQ number by some configuration (ala PCI), right? - Davide -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Performance data when running Windows VMs
On Wed, 2009-08-26 at 19:26 +0300, Avi Kivity wrote: On 08/26/2009 07:14 PM, Andrew Theurer wrote: On Wed, 2009-08-26 at 18:44 +0300, Avi Kivity wrote: On 08/26/2009 05:57 PM, Andrew Theurer wrote: I recently gathered some performance data when running Windows Server 2008 VMs, and I wanted to share it here. There are 12 Windows Server2008 64-bit VMs (1 vcpu, 2 GB) running which handle the concurrent execution of 6 J2EE type benchmarks. Each benchmark needs a App VM and a Database VM. The benchmark clients inject a fixed rate of requests which yields X% CPU utilization on the host. A different hypervisor was compared; KVM used about 60% more CPU cycles to complete the same amount of work. Both had their hypervisor specific paravirt IO drivers in the VMs. Server is a 2 socket Core/i7, SMT off, with 72 GB memory Did you use large pages? Yes. The stats show 'largepage = 12'. Something's wrong. There's a commit (7736d680) that's supposed to fix largepage support for kvm-87, maybe it's incomplete. How strange. /proc/meminfo showed that almost all of the pages were used: HugePages_Total: 12556 HugePages_Free: 220 HugePages_Rsvd:0 HugePages_Surp:0 Hugepagesize: 2048 kB I just assumed they were used properly. Maybe not. I/O on the host was not what I would call very high: outbound network averaged at 163 Mbit/s inbound was 8 Mbit/s, while disk read ops was 243/sec and write ops was 561/sec What was the disk bandwidth used? Presumably, direct access to the volume with cache=off? 2.4 MB/sec write, 0.6MB/sec read, cache=none The VMs' boot disks are IDE, but apps use their second disk which is virtio. Chickenfeed. Do the network stats include interguest traffic? I presume *all* of the traffic was interguest. 
Sar network data: IFACE rxpck/s txpck/srxkB/stxkB/s Average: lo 0.00 0.00 0.00 0.00 Average: usb0 0.39 0.19 0.02 0.01 Average: eth0 2968.83 5093.02340.13 6966.64 Average: eth1 2992.92 5124.08342.75 7008.53 Average: eth2 1455.53 2500.63167.45 3421.64 Average: eth3 1500.59 2574.36171.98 3524.82 Average: br0 2.41 0.95 0.32 0.13 Average: br1 1.52 0.00 0.20 0.00 Average: br2 1.52 0.00 0.20 0.00 Average: br3 1.52 0.00 0.20 0.00 Average: br4 0.00 0.00 0.00 0.00 Average: tap3669.38708.07290.89140.81 Average: tap109678.53723.58294.07143.31 Average: tap215673.20711.47291.99141.78 Average: tap321675.26719.33293.01142.37 Average:tap27679.23729.90293.86143.60 Average: tap133680.17734.08294.33143.85 Average: tap2 1002.24 2214.19 3458.54457.95 Average: tap108 1021.85 2246.53 3491.02463.48 Average: tap214 1002.81 2195.22 3411.80457.28 Average: tap320 1017.43 2241.49 3508.20462.54 Average:tap26 1028.52 2237.98 3483.84462.53 Average: tap132 1034.05 2240.89 3493.37463.32 tap0-99 go to eth0, 100-199 to eth1, 200-299 to eth2, 300-399 to eth4. There is some inter-guest traffic between VM pairs (like taps 23, 108119, etc.) but not that significant. linux-aio should help reduce cpu usage. I assume this is in a newer version of Qemu? No, posted and awaiting merge. Could it be that Windows uses the debug registers? Maybe we're incorrectly deciding to switch them. I was wondering about that. I was thinking of just backing out the support for debugregs and see what happens. Did the up/down_read seem kind of high? Are we doing a lock of locking? It is. We do. Marcelo made some threats to remove this lock. Thanks, -Andrew -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Performance data when running Windows VMs
On Wed, 2009-08-26 at 11:27 -0500, Brian Jackson wrote: On Wednesday 26 August 2009 11:14:57 am Andrew Theurer wrote: snip I/O on the host was not what I would call very high: outbound network averaged at 163 Mbit/s inbound was 8 Mbit/s, while disk read ops was 243/sec and write ops was 561/sec What was the disk bandwidth used? Presumably, direct access to the volume with cache=off? 2.4 MB/sec write, 0.6MB/sec read, cache=none The VMs' boot disks are IDE, but apps use their second disk which is virtio. In my testing, I got better performance from IDE than the new virtio block driver for windows. There appears to be some optimization left to do on them. Thanks Brian. I will try IDE on both VM disks to see how it compares. -Andrew -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: AlacrityVM benchmark numbers updated
Avi Kivity wrote: On 08/26/2009 04:01 AM, Gregory Haskins wrote: We are pleased to announce the availability of the latest networking benchmark numbers for AlacrityVM. We've made several tweaks to the original v0.1 release to improve performance. The most notable is a switch from get_user_pages to switch_mm+copy_[to/from]_user thanks to a review suggestion from Michael Tsirkin (as well as his patch to implement it). This change alone accounted for freeing up an additional 1.2Gbps, which is over 25% improvement from v0.1. The previous numbers were 4.560Gbps before the change, and 5.708Gbps after (for 1500mtu over 10GE). This moves us ever closer to the goal of native performance under virtualization. Interesting, it's good to see that copy_*_user() works so well. Note that there's a possible optimization that goes in the opposite direction - keep using get_user_pages(), but use the dma engine API to perform the actual copy. I expect that it will only be a win when using tso to transfer full pages. Large pages may also help. Copyless tx also wants get_user_pages(). It makes sense to check if switch_mm() + get_user_pages_fast() gives better performance than get_user_pages(). Actually, I have already looked at this and it does indeed seem better to use switch_mm+gupf() over gup() by quite a large margin. You could then couple that with your DMA-engine idea to potentially gain even more benefits (though probably not for networking since most NICs have their own DMA engine anyway). Kind Regards, -Greg signature.asc Description: OpenPGP digital signature
Re: [Qemu-devel] Re: Notes on block I/O data integrity
Nikola Ciprich wrote: clustered LVM SHOULD not have problems with it, as we're using just striped volumes, Note that LVM does not implement barriers at all, except for simple cases of a single backing device (I'm not sure if that includes dm-crypt). So your striped volumes may not offer this level of integrity. -- Jamie -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] Re: Notes on block I/O data integrity
Christoph Hellwig wrote: what about LVM? I've read somewhere that it used to just eat barriers used by XFS, making it less safe than simple partitions. Oh, any additional layers open another big can of worms. On Linux until very recently using LVM or software raid means only disabled write caches are safe. I believe that's still true except if there's more than one backing drive, so software RAID still isn't safe. Did that change? But even with barriers, software RAID may have a consistency problem if one stripe is updated and the system fails before the matching parity stripe is updated. I've been told that some hardware RAID implementations implement a kind of journalling to deal with this, but Linux software RAID does not. -- Jamie -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/2] eventfd: new EFD_STATE flag
On 08/26/2009 08:45 PM, Davide Libenzi wrote: OK, if I get it correctly, there is one eventfd signaler (the device), and one eventfd reader (the hypervisor), right? Each hypervisor listens for multiple devices detecting state changes, and associating the eventfd line to the IRQ number by some configuration (ala PCI), right? Yes. The PCI stuff happens in userspace, all the hypervisor sees is this eventfd is IRQ 10. There may be multiple eventfds routed to one IRQ (corresponding to a shared IRQ line). -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/2] eventfd: new EFD_STATE flag
On Wed, 26 Aug 2009, Avi Kivity wrote: On 08/26/2009 08:45 PM, Davide Libenzi wrote: OK, if I get it correctly, there is one eventfd signaler (the device), and one eventfd reader (the hypervisor), right? Each hypervisor listens for multiple devices detecting state changes, and associating the eventfd line to the IRQ number by some configuration (ala PCI), right? Yes. The PCI stuff happens in userspace, all the hypervisor sees is this eventfd is IRQ 10. There may be multiple eventfds routed to one IRQ (corresponding to a shared IRQ line). Ok, so why not using the eventfd counter as state? On the device side: void write_state(int sfd, int state) { u64 cnt; /* Clear the current state, sfd is in non-blocking mode */ read(sfd, cnt, sizeof(cnt)); /* Writes new state */ cnt = 1 + !!state; write(sfd, cnt, sizeof(cnt)); } On the hypervisor side: int read_state(int sfd) { u64 cnt; read(sfd, cnt, sizeof(cnt)); return state - 1; } - Davide -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Performance data when running Windows VMs
On 08/26/2009 08:51 PM, Andrew Theurer wrote: The stats show 'largepage = 12'. Something's wrong. There's a commit (7736d680) that's supposed to fix largepage support for kvm-87, maybe it's incomplete. How strange. /proc/meminfo showed that almost all of the pages were used: HugePages_Total: 12556 HugePages_Free: 220 HugePages_Rsvd:0 HugePages_Surp:0 Hugepagesize: 2048 kB I just assumed they were used properly. Maybe not. My mistake. The kvm_stat numbers you provided were rate (per second), so it just means it's still faulting in pages at a rate of 1 per guest per second. I/O on the host was not what I would call very high: outbound network averaged at 163 Mbit/s inbound was 8 Mbit/s, while disk read ops was 243/sec and write ops was 561/sec What was the disk bandwidth used? Presumably, direct access to the volume with cache=off? 2.4 MB/sec write, 0.6MB/sec read, cache=none The VMs' boot disks are IDE, but apps use their second disk which is virtio. Chickenfeed. Do the network stats include interguest traffic? I presume *all* of the traffic was interguest. 
Sar network data: IFACE rxpck/s txpck/srxkB/stxkB/s Average: lo 0.00 0.00 0.00 0.00 Average: usb0 0.39 0.19 0.02 0.01 Average: eth0 2968.83 5093.02340.13 6966.64 Average: eth1 2992.92 5124.08342.75 7008.53 Average: eth2 1455.53 2500.63167.45 3421.64 Average: eth3 1500.59 2574.36171.98 3524.82 Average: br0 2.41 0.95 0.32 0.13 Average: br1 1.52 0.00 0.20 0.00 Average: br2 1.52 0.00 0.20 0.00 Average: br3 1.52 0.00 0.20 0.00 Average: br4 0.00 0.00 0.00 0.00 Average: tap3669.38708.07290.89140.81 Average: tap109678.53723.58294.07143.31 Average: tap215673.20711.47291.99141.78 Average: tap321675.26719.33293.01142.37 Average:tap27679.23729.90293.86143.60 Average: tap133680.17734.08294.33143.85 Average: tap2 1002.24 2214.19 3458.54457.95 Average: tap108 1021.85 2246.53 3491.02463.48 Average: tap214 1002.81 2195.22 3411.80457.28 Average: tap320 1017.43 2241.49 3508.20462.54 Average:tap26 1028.52 2237.98 3483.84462.53 Average: tap132 1034.05 2240.89 3493.37463.32 tap0-99 go to eth0, 100-199 to eth1, 200-299 to eth2, 300-399 to eth4. There is some inter-guest traffic between VM pairs (like taps 23, 108119, etc.) but not that significant. Oh, so there are external load generators involved. Can you run this on kvm.git master, with CONFIG_TRACEPOINTS=y CONFIG_TRACER_MAX_TRACE=y CONFIG_RING_BUFFER=y CONFIG_FTRACE_NMI_ENTER=y CONFIG_EVENT_TRACING=y CONFIG_TRACING=y CONFIG_GENERIC_TRACER=y CONFIG_TRACING_SUPPORT=y CONFIG_FTRACE=y CONFIG_DYNAMIC_FTRACE=y (some may be overkill) and, while the test is running, do: cd /sys/kernel/debug/tracing echo kvm set_event (wait two seconds) cat trace /tmp/trace and send me /tmp/trace.bz2? should be quite big. -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Page allocation failures in guest
From: Rusty Russell ru...@rustcorp.com.au Date: Wed, 26 Aug 2009 21:48:58 +0930 Dave, can you push this to Linus ASAP? Ok. Subject: virtio: net refill on out-of-memory If we run out of memory, use keventd to fill the buffer. There's a report of this happening: Page allocation failures in guest, Message-ID: 20090713115158.0a489...@mjolnir.ossman.eu Signed-off-by: Rusty Russell ru...@rustcorp.com.au Applied, thanks. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: AlacrityVM benchmark numbers updated
On 08/26/2009 09:42 PM, Gregory Haskins wrote: Actually, I have already looked at this and it does indeed seem better to use switch_mm+gupf() over gup() by quite a large margin. You could then couple that with your DMA-engine idea to potentially gain even more benefits (though probably not for networking since most NICs have their own DMA engine anyway). For tx, we'll just go copyless once we plumb the destructors properly. But for rx on a shared interface it is impossible to avoid the copy. You can only choose if you want it done by the cpu or a local dma engine. -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/2] eventfd: new EFD_STATE flag
On 08/26/2009 10:13 PM, Davide Libenzi wrote: Ok, so why not using the eventfd counter as state? On the device side: void write_state(int sfd, int state) { u64 cnt; /* Clear the current state, sfd is in non-blocking mode */ read(sfd, &cnt, sizeof(cnt)); /* Writes new state */ cnt = 1 + !!state; write(sfd, &cnt, sizeof(cnt)); } On the hypervisor side: int read_state(int sfd) { u64 cnt; read(sfd, &cnt, sizeof(cnt)); return cnt - 1; } Hadn't thought of read+write as set. While the 1+ is a little ugly, it's workable. I see no kernel equivalent to read(), but that's easily done. -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/2] eventfd: new EFD_STATE flag
On Wed, 26 Aug 2009, Avi Kivity wrote: On 08/26/2009 10:13 PM, Davide Libenzi wrote: Ok, so why not using the eventfd counter as state? On the device side: void write_state(int sfd, int state) { u64 cnt; /* Clear the current state, sfd is in non-blocking mode */ read(sfd, &cnt, sizeof(cnt)); /* Writes new state */ cnt = 1 + !!state; write(sfd, &cnt, sizeof(cnt)); } On the hypervisor side: int read_state(int sfd) { u64 cnt; read(sfd, &cnt, sizeof(cnt)); return cnt - 1; } Hadn't thought of read+write as set. While the 1+ is a little ugly, it's workable. Pick what you want, as long as it always writes something != 0 :) I see no kernel equivalent to read(), but that's easily done. Adding an in-kernel read based on ctx, that is no problem at all. - Davide -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html