[COMMIT master] KVM: Convert irq notifiers lists to RCU locking

2009-08-26 Thread Avi Kivity
From: Gleb Natapov g...@redhat.com

Use RCU locking for mask/ack notifiers lists.

Signed-off-by: Gleb Natapov g...@redhat.com
Signed-off-by: Avi Kivity a...@redhat.com
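
[Editor's note: for readers unfamiliar with the idiom, a minimal sketch
of the RCU-protected list pattern the diff below adopts. Illustrative
kernel-style C only -- the names are not the actual KVM structures.
Writers still serialize on a mutex; readers walk the list locklessly.]

	struct notifier {
		struct hlist_node link;
		void (*notify)(struct notifier *n);
	};

	static HLIST_HEAD(notifier_list);
	static DEFINE_MUTEX(list_lock);

	void register_notifier(struct notifier *n)
	{
		mutex_lock(&list_lock);
		hlist_add_head_rcu(&n->link, &notifier_list);
		mutex_unlock(&list_lock);
	}

	void unregister_notifier(struct notifier *n)
	{
		mutex_lock(&list_lock);
		hlist_del_init_rcu(&n->link);
		mutex_unlock(&list_lock);
		synchronize_rcu();	/* wait for in-flight readers */
	}

	void fire_notifiers(void)
	{
		struct notifier *n;
		struct hlist_node *pos;

		rcu_read_lock();
		hlist_for_each_entry_rcu(n, pos, &notifier_list, link)
			n->notify(n);
		rcu_read_unlock();
	}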

diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index f019725..6c94614 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -183,19 +183,19 @@ void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
 
	rcu_read_lock();
	gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
-	rcu_read_unlock();
-
	if (gsi != -1)
-		hlist_for_each_entry(kian, n, &kvm->irq_ack_notifier_list, link)
+		hlist_for_each_entry_rcu(kian, n, &kvm->irq_ack_notifier_list,
+					 link)
			if (kian->gsi == gsi)
				kian->irq_acked(kian);
+	rcu_read_unlock();
 }
 
 void kvm_register_irq_ack_notifier(struct kvm *kvm,
				   struct kvm_irq_ack_notifier *kian)
 {
	mutex_lock(&kvm->irq_lock);
-	hlist_add_head(&kian->link, &kvm->irq_ack_notifier_list);
+	hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list);
	mutex_unlock(&kvm->irq_lock);
 }
 
@@ -203,8 +203,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
				     struct kvm_irq_ack_notifier *kian)
 {
	mutex_lock(&kvm->irq_lock);
-	hlist_del_init(&kian->link);
+	hlist_del_init_rcu(&kian->link);
	mutex_unlock(&kvm->irq_lock);
+	synchronize_rcu();
 }
 
 int kvm_request_irq_source_id(struct kvm *kvm)
@@ -257,7 +258,7 @@ void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
 {
	mutex_lock(&kvm->irq_lock);
	kimn->irq = irq;
-	hlist_add_head(&kimn->link, &kvm->mask_notifier_list);
+	hlist_add_head_rcu(&kimn->link, &kvm->mask_notifier_list);
	mutex_unlock(&kvm->irq_lock);
 }
 
@@ -265,8 +266,9 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
				      struct kvm_irq_mask_notifier *kimn)
 {
	mutex_lock(&kvm->irq_lock);
-	hlist_del(&kimn->link);
+	hlist_del_rcu(&kimn->link);
	mutex_unlock(&kvm->irq_lock);
+	synchronize_rcu();
 }
 
 void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask)
@@ -274,11 +276,11 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask)
	struct kvm_irq_mask_notifier *kimn;
	struct hlist_node *n;
 
-	WARN_ON(!mutex_is_locked(&kvm->irq_lock));
-
-	hlist_for_each_entry(kimn, n, &kvm->mask_notifier_list, link)
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(kimn, n, &kvm->mask_notifier_list, link)
		if (kimn->irq == irq)
			kimn->func(kimn, mask);
+	rcu_read_unlock();
 }
 
 void kvm_free_irq_routing(struct kvm *kvm)


[COMMIT master] KVM: Maintain back mapping from irqchip/pin to gsi

2009-08-26 Thread Avi Kivity
From: Gleb Natapov g...@redhat.com

Maintain back mapping from irqchip/pin to gsi to speed up
interrupt acknowledgment notifications.

[avi: build fix on non-x86/ia64]

Signed-off-by: Gleb Natapov g...@redhat.com
Signed-off-by: Avi Kivity a...@redhat.com
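
[Editor's note: the point of the back mapping, as a hedged sketch
(illustrative C, not the actual KVM code): replace the O(n) scan over
all routing entries on every ACK with an O(1) table indexed by
(irqchip, pin), filled in when the routing is programmed.]

	#define NR_CHIPS 3		/* mirrors KVM_NR_IRQCHIPS below */
	#define NR_PINS  24		/* mirrors KVM_IOAPIC_NUM_PINS */

	static int chip_to_gsi[NR_CHIPS][NR_PINS];

	static void routing_init(void)
	{
		int i, j;

		for (i = 0; i < NR_CHIPS; i++)
			for (j = 0; j < NR_PINS; j++)
				chip_to_gsi[i][j] = -1;	/* -1: pin not routed */
	}

	/* O(1) on the ack path instead of scanning every entry */
	static int pin_to_gsi(int irqchip, int pin)
	{
		return chip_to_gsi[irqchip][pin];
	}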

diff --git a/arch/ia64/include/asm/kvm.h b/arch/ia64/include/asm/kvm.h
index 18a7e49..bc90c75 100644
--- a/arch/ia64/include/asm/kvm.h
+++ b/arch/ia64/include/asm/kvm.h
@@ -60,6 +60,7 @@ struct kvm_ioapic_state {
 #define KVM_IRQCHIP_PIC_MASTER   0
 #define KVM_IRQCHIP_PIC_SLAVE1
 #define KVM_IRQCHIP_IOAPIC   2
+#define KVM_NR_IRQCHIPS  3
 
 #define KVM_CONTEXT_SIZE   8*1024
 
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index 4a5fe91..f02e87a 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -79,6 +79,7 @@ struct kvm_ioapic_state {
 #define KVM_IRQCHIP_PIC_MASTER   0
 #define KVM_IRQCHIP_PIC_SLAVE1
 #define KVM_IRQCHIP_IOAPIC   2
+#define KVM_NR_IRQCHIPS  3
 
 /* for KVM_GET_REGS and KVM_SET_REGS */
 struct kvm_regs {
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 802c080..b8db809 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -132,7 +132,10 @@ struct kvm_kernel_irq_routing_entry {
struct hlist_node link;
 };
 
+#ifdef __KVM_HAVE_IOAPIC
+
 struct kvm_irq_routing_table {
+   int chip[KVM_NR_IRQCHIPS][KVM_IOAPIC_NUM_PINS];
struct kvm_kernel_irq_routing_entry *rt_entries;
u32 nr_rt_entries;
/*
@@ -142,6 +145,12 @@ struct kvm_irq_routing_table {
struct hlist_head map[0];
 };
 
+#else
+
+struct kvm_irq_routing_table {};
+
+#endif
+
 struct kvm {
spinlock_t mmu_lock;
spinlock_t requests_lock;
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 81950f6..59cf8da 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -175,25 +175,16 @@ void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
 {
	struct kvm_irq_ack_notifier *kian;
	struct hlist_node *n;
-	unsigned gsi = pin;
-	int i;
+	int gsi;
 
	trace_kvm_ack_irq(irqchip, pin);
 
-	for (i = 0; i < kvm->irq_routing->nr_rt_entries; i++) {
-		struct kvm_kernel_irq_routing_entry *e;
-		e = &kvm->irq_routing->rt_entries[i];
-		if (e->type == KVM_IRQ_ROUTING_IRQCHIP &&
-		    e->irqchip.irqchip == irqchip &&
-		    e->irqchip.pin == pin) {
-			gsi = e->gsi;
-			break;
-		}
-	}
-
-	hlist_for_each_entry(kian, n, &kvm->arch.irq_ack_notifier_list, link)
-		if (kian->gsi == gsi)
-			kian->irq_acked(kian);
+	gsi = kvm->irq_routing->chip[irqchip][pin];
+	if (gsi != -1)
+		hlist_for_each_entry(kian, n, &kvm->arch.irq_ack_notifier_list,
+				     link)
+			if (kian->gsi == gsi)
+				kian->irq_acked(kian);
 }
 
 void kvm_register_irq_ack_notifier(struct kvm *kvm,
@@ -332,6 +323,9 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
		}
		e->irqchip.irqchip = ue->u.irqchip.irqchip;
		e->irqchip.pin = ue->u.irqchip.pin + delta;
+		if (e->irqchip.pin >= KVM_IOAPIC_NUM_PINS)
+			goto out;
+		rt->chip[ue->u.irqchip.irqchip][e->irqchip.pin] = ue->gsi;
		break;
	case KVM_IRQ_ROUTING_MSI:
		e->set = kvm_set_msi;
@@ -356,7 +350,7 @@ int kvm_set_irq_routing(struct kvm *kvm,
			unsigned flags)
 {
	struct kvm_irq_routing_table *new, *old;
-	u32 i, nr_rt_entries = 0;
+	u32 i, j, nr_rt_entries = 0;
	int r;
 
	for (i = 0; i < nr; ++i) {
@@ -377,6 +371,9 @@ int kvm_set_irq_routing(struct kvm *kvm,
	new->rt_entries = (void *)&new->map[nr_rt_entries];
 
	new->nr_rt_entries = nr_rt_entries;
+	for (i = 0; i < 3; i++)
+		for (j = 0; j < KVM_IOAPIC_NUM_PINS; j++)
+			new->chip[i][j] = -1;
 
	for (i = 0; i < nr; ++i) {
		r = -EINVAL;


[COMMIT master] Revert "KVM: x86 emulator: Report unhandled instructions"

2009-08-26 Thread Avi Kivity
From: Avi Kivity a...@redhat.com

This reverts commit ea67fbbcf346a15b1e8e18cff7c64c248972b961.  Unhandled
instructions can and do occur in normal runs.  This needs to be made optional
so as not to spam the logs.

Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 15593e8..0644d3d 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2194,7 +2194,6 @@ writeback:
 
 done:
	if (rc == X86EMUL_UNHANDLEABLE) {
-		kvm_report_emulation_failure(ctxt->vcpu, "unhandled instruction");
		c->eip = saved_eip;
		return -1;
	}
@@ -2468,7 +2467,7 @@ twobyte_insn:
	goto writeback;
 
 cannot_emulate:
-	kvm_report_emulation_failure(ctxt->vcpu, "unhandled instruction");
+	DPRINTF("Cannot emulate %02x\n", c->b);
	c->eip = saved_eip;
	return -1;
 }


[COMMIT master] KVM: Move IO APIC to its own lock

2009-08-26 Thread Avi Kivity
From: Gleb Natapov g...@redhat.com

This allows removal of irq_lock from the injection path.

Signed-off-by: Gleb Natapov g...@redhat.com
Signed-off-by: Avi Kivity a...@redhat.com
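
[Editor's note: a minimal sketch of the drop-and-relock pattern the
pic_clear_isr() hunk below uses (illustrative types, not the real PIC
code): the callback may recurse into the PIC, so the lock cannot be
held across it, and the PIC state must already be consistent when the
lock is dropped.]

	struct pic_sketch {
		spinlock_t lock;
		unsigned int isr;
	};

	/* caller holds s->lock on entry and on return */
	static void clear_isr_and_notify(struct pic_sketch *s, int irq,
					 void (*notify)(int irq))
	{
		s->isr &= ~(1u << irq);		/* update state first */
		spin_unlock(&s->lock);		/* callback may re-enter */
		notify(irq);
		spin_lock(&s->lock);
	}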

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 0ad09f0..4a98314 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -851,8 +851,7 @@ static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm,
	r = 0;
	switch (chip->chip_id) {
	case KVM_IRQCHIP_IOAPIC:
-		memcpy(&chip->chip.ioapic, ioapic_irqchip(kvm),
-			sizeof(struct kvm_ioapic_state));
+		r = kvm_get_ioapic(kvm, &chip->chip.ioapic);
		break;
	default:
		r = -EINVAL;
@@ -868,9 +867,7 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
	r = 0;
	switch (chip->chip_id) {
	case KVM_IRQCHIP_IOAPIC:
-		memcpy(ioapic_irqchip(kvm),
-			&chip->chip.ioapic,
-			sizeof(struct kvm_ioapic_state));
+		r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
		break;
	default:
		r = -EINVAL;
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index ccc941a..d057c0c 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -38,7 +38,15 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
	s->isr_ack |= (1 << irq);
	if (s != &s->pics_state->pics[0])
		irq += 8;
+	/*
+	 * We are dropping lock while calling ack notifiers since ack
+	 * notifier callbacks for assigned devices call into PIC recursively.
+	 * Other interrupt may be delivered to PIC while lock is dropped but
+	 * it should be safe since PIC state is already updated at this stage.
+	 */
+	spin_unlock(&s->pics_state->lock);
	kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq);
+	spin_lock(&s->pics_state->lock);
 }
 
 void kvm_pic_clear_isr_ack(struct kvm *kvm)
@@ -176,16 +184,18 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
 static inline void pic_intack(struct kvm_kpic_state *s, int irq)
 {
	s->isr |= 1 << irq;
-	if (s->auto_eoi) {
-		if (s->rotate_on_auto_eoi)
-			s->priority_add = (irq + 1) & 7;
-		pic_clear_isr(s, irq);
-	}
	/*
	 * We don't clear a level sensitive interrupt here
	 */
	if (!(s->elcr & (1 << irq)))
		s->irr &= ~(1 << irq);
+
+	if (s->auto_eoi) {
+		if (s->rotate_on_auto_eoi)
+			s->priority_add = (irq + 1) & 7;
+		pic_clear_isr(s, irq);
+	}
+
 }
 
 int kvm_pic_read_irq(struct kvm *kvm)
@@ -294,9 +304,9 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
		priority = get_priority(s, s->isr);
		if (priority != 8) {
			irq = (priority + s->priority_add) & 7;
-			pic_clear_isr(s, irq);
			if (cmd == 5)
				s->priority_add = (irq + 1) & 7;
+			pic_clear_isr(s, irq);
			pic_update_irq(s->pics_state);
		}
		break;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 5b9d1ae..8f0967f 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -471,11 +471,8 @@ static void apic_set_eoi(struct kvm_lapic *apic)
		trigger_mode = IOAPIC_LEVEL_TRIG;
	else
		trigger_mode = IOAPIC_EDGE_TRIG;
-	if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) {
-		mutex_lock(&apic->vcpu->kvm->irq_lock);
+	if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI))
		kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
-		mutex_unlock(&apic->vcpu->kvm->irq_lock);
-	}
 }
 
 static void apic_send_ipi(struct kvm_lapic *apic)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d22400f..c7b0b83 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2023,9 +2023,7 @@ static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
			sizeof(struct kvm_pic_state));
		break;
	case KVM_IRQCHIP_IOAPIC:
-		memcpy(&chip->chip.ioapic,
-			ioapic_irqchip(kvm),
-			sizeof(struct kvm_ioapic_state));
+		r = kvm_get_ioapic(kvm, &chip->chip.ioapic);
		break;
	default:
		r = -EINVAL;
@@ -2055,11 +2053,7 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
		spin_unlock(&pic_irqchip(kvm)->lock);
		break;
	case KVM_IRQCHIP_IOAPIC:
-

[COMMIT master] KVM: Call pic_clear_isr() on pic reset to reuse logic there

2009-08-26 Thread Avi Kivity
From: Gleb Natapov g...@redhat.com

Also move call of ack notifiers after pic state change.

Signed-off-by: Gleb Natapov g...@redhat.com
Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 01f1516..ccc941a 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -225,22 +225,11 @@ int kvm_pic_read_irq(struct kvm *kvm)
 
 void kvm_pic_reset(struct kvm_kpic_state *s)
 {
-	int irq, irqbase, n;
+	int irq;
	struct kvm *kvm = s->pics_state->irq_request_opaque;
	struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu;
+	u8 irr = s->irr, isr = s->imr;
 
-	if (s == &s->pics_state->pics[0])
-		irqbase = 0;
-	else
-		irqbase = 8;
-
-	for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
-		if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
-			if (s->irr & (1 << irq) || s->isr & (1 << irq)) {
-				n = irq + irqbase;
-				kvm_notify_acked_irq(kvm, SELECT_PIC(n), n);
-			}
-	}
	s->last_irr = 0;
	s->irr = 0;
	s->imr = 0;
@@ -256,6 +245,13 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
	s->rotate_on_auto_eoi = 0;
	s->special_fully_nested_mode = 0;
	s->init4 = 0;
+
+	for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
+		if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
+			if (irr & (1 << irq) || isr & (1 << irq)) {
+				pic_clear_isr(s, irq);
+			}
+	}
 }
 
 static void pic_ioport_write(void *opaque, u32 addr, u32 val)


[COMMIT master] KVM: Move irq routing data structure to rcu locking

2009-08-26 Thread Avi Kivity
From: Gleb Natapov g...@redhat.com

Signed-off-by: Gleb Natapov g...@redhat.com
Signed-off-by: Avi Kivity a...@redhat.com
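
[Editor's note: a hedged sketch of the RCU pointer-publish pattern this
commit applies to kvm->irq_routing (illustrative names, not the actual
KVM code): readers dereference the pointer under rcu_read_lock(), the
writer publishes a new table with rcu_assign_pointer() and frees the
old one only after synchronize_rcu().]

	static struct routing_table *table;	/* RCU-protected */
	static DEFINE_MUTEX(update_lock);

	int lookup(unsigned int idx)
	{
		struct routing_table *t;
		int val;

		rcu_read_lock();
		t = rcu_dereference(table);	/* lockless read side */
		val = t->entries[idx];
		rcu_read_unlock();
		return val;
	}

	void replace_table(struct routing_table *new)
	{
		struct routing_table *old;

		mutex_lock(&update_lock);	/* writers still serialize */
		old = table;
		rcu_assign_pointer(table, new);	/* publish with barrier */
		mutex_unlock(&update_lock);
		synchronize_rcu();		/* wait out readers of old */
		kfree(old);
	}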

diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 59cf8da..fb861dd 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -159,7 +159,8 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level)
	 * IOAPIC.  So set the bit in both. The guest will ignore
	 * writes to the unused one.
	 */
-	irq_rt = kvm->irq_routing;
+	rcu_read_lock();
+	irq_rt = rcu_dereference(kvm->irq_routing);
	if (irq < irq_rt->nr_rt_entries)
		hlist_for_each_entry(e, n, &irq_rt->map[irq], link) {
			int r = e->set(e, kvm, irq_source_id, level);
@@ -168,6 +169,7 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level)
 
			ret = r + ((ret < 0) ? 0 : ret);
		}
+	rcu_read_unlock();
	return ret;
 }
 
@@ -179,7 +181,10 @@ void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
 
	trace_kvm_ack_irq(irqchip, pin);
 
-	gsi = kvm->irq_routing->chip[irqchip][pin];
+	rcu_read_lock();
+	gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
+	rcu_read_unlock();
+
	if (gsi != -1)
		hlist_for_each_entry(kian, n, &kvm->arch.irq_ack_notifier_list,
				     link)
@@ -279,9 +284,9 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask)
 
 void kvm_free_irq_routing(struct kvm *kvm)
 {
-	mutex_lock(&kvm->irq_lock);
+	/* Called only during vm destruction. Nobody can use the pointer
+	   at this stage */
	kfree(kvm->irq_routing);
-	mutex_unlock(&kvm->irq_lock);
 }
 
 static int setup_routing_entry(struct kvm_irq_routing_table *rt,
@@ -387,8 +392,9 @@ int kvm_set_irq_routing(struct kvm *kvm,
 
	mutex_lock(&kvm->irq_lock);
	old = kvm->irq_routing;
-	kvm->irq_routing = new;
+	rcu_assign_pointer(kvm->irq_routing, new);
	mutex_unlock(&kvm->irq_lock);
+	synchronize_rcu();
 
	new = old;
	r = 0;


[COMMIT master] KVM: PIT: fix pit_state copy in set_pit2/get_pit2

2009-08-26 Thread Avi Kivity
From: Marcelo Tosatti mtosa...@redhat.com

The kvm_pit_state2 structure contains extra space, so the memcpy
in kvm_vm_ioctl_set_pit2 corrupts kvm->arch.vpit->pit_state.

Fix it by memcpy'ing the channel information and assigning flags
manually.

Signed-off-by: Marcelo Tosatti mtosa...@redhat.com
Signed-off-by: Avi Kivity a...@redhat.com
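
[Editor's note: a sketch of the bug class being fixed, with
hypothetical types -- not the real KVM structures. The ABI struct is
larger than the portion that mirrors kernel state, so a whole-struct
memcpy writes past the shared members.]

	struct channel { unsigned int count; };

	struct abi_pit_state {			/* what userspace passes */
		struct channel channels[3];
		unsigned int flags;
		unsigned int reserved[9];	/* the "extra space" */
	};

	struct kernel_pit_state {		/* what the kernel keeps */
		struct channel channels[3];
		unsigned int flags;
		void *internal;			/* kernel-only, must survive */
	};

	static void set_state(struct kernel_pit_state *ks,
			      const struct abi_pit_state *ps)
	{
		/* Buggy: sizeof(*ps) exceeds the shared prefix, so this
		 * tramples ks->internal and whatever follows it:
		 *	memcpy(ks, ps, sizeof(*ps));
		 * Fixed: copy the shared members, assign flags by hand. */
		memcpy(ks->channels, ps->channels, sizeof(ks->channels));
		ks->flags = ps->flags;
	}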

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2e92aef..d22400f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2095,7 +2095,9 @@ static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
	int r = 0;
 
	mutex_lock(&kvm->arch.vpit->pit_state.lock);
-	memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state2));
+	memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
+		sizeof(ps->channels));
+	ps->flags = kvm->arch.vpit->pit_state.flags;
	mutex_unlock(&kvm->arch.vpit->pit_state.lock);
	return r;
 }
@@ -2109,7 +2111,9 @@ static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
	cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
	if (!prev_legacy && cur_legacy)
		start = 1;
-	memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state2));
+	memcpy(&kvm->arch.vpit->pit_state.channels, ps->channels,
+	       sizeof(kvm->arch.vpit->pit_state.channels));
+	kvm->arch.vpit->pit_state.flags = ps->flags;
	kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count,
			   start);
	mutex_unlock(&kvm->arch.vpit->pit_state.lock);
	return r;


Re: vhost net: performance with ping benchmark

2009-08-26 Thread Rusty Russell
On Tue, 25 Aug 2009 10:04:41 pm Arnd Bergmann wrote:
 On Tuesday 25 August 2009, Avi Kivity wrote:
  On 08/25/2009 05:22 AM, Anthony Liguori wrote:
  
   I think 2.6.32 is pushing it. 
  
  2.6.32 is pushing it, but we need to push it.
 
 Agreed.

Get real.  It's not happening.

We need migration completely solved and tested.  I want to see all the
features supported, including indirect descs and GSO.

If this wasn't a new userspace ABI, I'd be all for throwing it in as
experimental ASAP.

Rusty.


Re: vhost net: performance with ping benchmark

2009-08-26 Thread Michael S. Tsirkin
On Wed, Aug 26, 2009 at 05:04:44PM +0930, Rusty Russell wrote:
 On Tue, 25 Aug 2009 10:04:41 pm Arnd Bergmann wrote:
  On Tuesday 25 August 2009, Avi Kivity wrote:
   On 08/25/2009 05:22 AM, Anthony Liguori wrote:
   
I think 2.6.32 is pushing it. 
   
   2.6.32 is pushing it, but we need to push it.
  
  Agreed.
 
 Get real.  It's not happening.
 
 We need migration completely solved and tested.  I want to see all the
 features supported, including indirect descs and GSO.

I'm not sure why indirect descs are needed for virtio-net. Comments?

 If this wasn't a new userspace ABI, I'd be all for throwing it in as
 experimental ASAP.
 
 Rusty.


Re: [PATCH] VMX: Return to userspace on invalid state emulation failure

2009-08-26 Thread Avi Kivity

On 08/25/2009 01:37 AM, Mohammed Gamal wrote:

Return to userspace instead of repeatedly trying to emulate
instructions that have already failed

Signed-off-by: Mohammed Gamal <m.gamal...@gmail.com>
---
  arch/x86/kvm/vmx.c |6 +-
  1 files changed, 5 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6b57eed..c559bb7 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3337,6 +3337,8 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu)

	if (err != EMULATE_DONE) {
		kvm_report_emulation_failure(vcpu, "emulation failure");
+		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
		break;
	}

@@ -3607,7 +3609,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
	vmx->entry_time = ktime_get();

	/* Handle invalid guest state instead of entering VMX */
-	if (vmx->emulation_required && emulate_invalid_guest_state) {
+	if (vmx->emulation_required && emulate_invalid_guest_state
+	    && !(vcpu->run->exit_reason == KVM_EXIT_INTERNAL_ERROR
+	    && vcpu->run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION)) {
		handle_invalid_guest_state(vcpu);
		return;
	}


Still suffers from the same problem.  You don't always update
vcpu->run->exit_reason, so you can't test it.  Best to return a value
from handle_invalid_guest_state() (the standard return codes for exit
handlers are 1 for return-to-guest, 0 for return-to-host, and -errno to
return with an error).



--
error compiling committee.c: too many arguments to function



Re: [PATCH] fix apic id reading in x2apic mode

2009-08-26 Thread Avi Kivity

On 08/25/2009 04:39 PM, Gleb Natapov wrote:

Format of apic id register is different in x2apic mode.
Return correct apic id when apic is in x2apic mode.



Applied, thanks.

--
error compiling committee.c: too many arguments to function



Re: [PATCH] VMX: Return to userspace on invalid state emulation failure

2009-08-26 Thread Mohammed Gamal
On Wed, Aug 26, 2009 at 12:02 PM, Avi Kivity <a...@redhat.com> wrote:
 On 08/25/2009 01:37 AM, Mohammed Gamal wrote:

 Return to userspace instead of repeatedly trying to emulate
 instructions that have already failed

 Signed-off-by: Mohammed Gamal <m.gamal...@gmail.com>
 ---
  arch/x86/kvm/vmx.c |    6 +-
  1 files changed, 5 insertions(+), 1 deletions(-)

 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
 index 6b57eed..c559bb7 100644
 --- a/arch/x86/kvm/vmx.c
 +++ b/arch/x86/kvm/vmx.c
 @@ -3337,6 +3337,8 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu)

                if (err != EMULATE_DONE) {
                        kvm_report_emulation_failure(vcpu, "emulation failure");
 +                      vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 +                      vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
                        break;
                }

 @@ -3607,7 +3609,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
                vmx->entry_time = ktime_get();

        /* Handle invalid guest state instead of entering VMX */
 -       if (vmx->emulation_required && emulate_invalid_guest_state) {
 +       if (vmx->emulation_required && emulate_invalid_guest_state
 +           && !(vcpu->run->exit_reason == KVM_EXIT_INTERNAL_ERROR
 +           && vcpu->run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION)) {
                handle_invalid_guest_state(vcpu);
                return;
        }


 Still suffers from the same problem.  You don't always update
 vcpu->run->exit_reason, so you can't test it.  Best to return a value from
 handle_invalid_guest_state() (the standard return codes for exit handlers
 are 1 for return-to-guest, 0 for return-to-host, and -errno to return with
 an error).

I was thinking of the same idea since I was also concerned about
vcpu->run->exit_reason not being updated. But how can we interpret the
return values of handle_invalid_guest_state() inside vmx_vcpu_run()
since it doesn't have a return value. Or would it be better to move
handle_invalid_guest_state() to the standard vmx exit handlers?


Re: [PATCH] VMX: Return to userspace on invalid state emulation failure

2009-08-26 Thread Avi Kivity

On 08/26/2009 01:07 PM, Mohammed Gamal wrote:

On Wed, Aug 26, 2009 at 12:02 PM, Avi Kivity <a...@redhat.com> wrote:

 On 08/25/2009 01:37 AM, Mohammed Gamal wrote:

 Return to userspace instead of repeatedly trying to emulate
 instructions that have already failed

 Signed-off-by: Mohammed Gamal <m.gamal...@gmail.com>
 ---
   arch/x86/kvm/vmx.c |6 +-
   1 files changed, 5 insertions(+), 1 deletions(-)

 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
 index 6b57eed..c559bb7 100644
 --- a/arch/x86/kvm/vmx.c
 +++ b/arch/x86/kvm/vmx.c
 @@ -3337,6 +3337,8 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu)

 	if (err != EMULATE_DONE) {
 		kvm_report_emulation_failure(vcpu, "emulation failure");
 +		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 +		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
 		break;
 	}

 @@ -3607,7 +3609,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	vmx->entry_time = ktime_get();

 	/* Handle invalid guest state instead of entering VMX */
 -	if (vmx->emulation_required && emulate_invalid_guest_state) {
 +	if (vmx->emulation_required && emulate_invalid_guest_state
 +	    && !(vcpu->run->exit_reason == KVM_EXIT_INTERNAL_ERROR
 +	    && vcpu->run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION)) {
 		handle_invalid_guest_state(vcpu);
 		return;
 	}

Still suffers from the same problem.  You don't always update
vcpu->run->exit_reason, so you can't test it.  Best to return a value
from handle_invalid_guest_state() (the standard return codes for exit
handlers are 1 for return-to-guest, 0 for return-to-host, and -errno to
return with an error).

I was thinking of the same idea since I was also concerned about
vcpu->run->exit_reason not being updated. But how can we interpret the
return values of handle_invalid_guest_state() inside vmx_vcpu_run()
since it doesn't have a return value. Or would it be better to move
handle_invalid_guest_state() to the standard vmx exit handlers?


We can move the call to vmx_handle_exit().  We have a check for 
emulate_invalid_guest_state there anyway.  I don't think it should be a 
standard exit handler since there is no exit_reason for it.


--
error compiling committee.c: too many arguments to function



Re: AlacrityVM benchmark numbers updated

2009-08-26 Thread Avi Kivity

On 08/26/2009 04:01 AM, Gregory Haskins wrote:

We are pleased to announce the availability of the latest networking
benchmark numbers for AlacrityVM.  We've made several tweaks to the
original v0.1 release to improve performance.  The most notable is a
switch from get_user_pages to switch_mm+copy_[to/from]_user thanks to a
review suggestion from Michael Tsirkin (as well as his patch to
implement it).

This change alone accounted for freeing up an additional 1.2Gbps, which
is over a 25% improvement from v0.1.  The previous numbers were 4560Mbps
before the change, and 5708Mbps after (for 1500mtu over 10GE).  This
moves us ever closer to the goal of native performance under virtualization.


Interesting, it's good to see that copy_*_user() works so well.  Note 
that there's a possible optimization that goes in the opposite direction 
- keep using get_user_pages(), but use the dma engine API to perform the 
actual copy.  I expect that it will only be a win when using tso to 
transfer full pages.  Large pages may also help.


Copyless tx also wants get_user_pages().  It makes sense to check if 
switch_mm() + get_user_pages_fast() gives better performance than 
get_user_pages().


--
error compiling committee.c: too many arguments to function



Re: R/W HG memory mappings with kvm?

2009-08-26 Thread Avi Kivity

On 08/24/2009 07:55 AM, Avi Kivity wrote:

On 08/24/2009 12:59 AM, Stephen Donnelly wrote:

On Thu, Aug 20, 2009 at 12:14 AM, Avi Kivity <a...@redhat.com> wrote:

On 08/13/2009 07:07 AM, Stephen Donnelly wrote:

npages = get_user_pages_fast(addr, 1, 1, &page); returns -EFAULT,
presumably because (vma->vm_flags & (VM_IO | VM_PFNMAP)).

It then takes the unlikely branch, and checks the vma, but I don't
understand what it is doing here: pfn = ((addr - vma->vm_start) >>
PAGE_SHIFT) + vma->vm_pgoff;

It's calculating the pfn according to pfnmap rules.

 From what I understand this will only work when remapping 'main
memory', e.g. where the pgoff is equal to the physical page offset?
VMAs that remap IO memory will usually set pgoff to 0 for the start of
the mapping.


If so, how do they calculate the pfn when mapping pages?  kvm needs to 
be able to do the same thing.


Maybe the simplest thing is to call vma->vm_ops->fault here.
Marcelo/Chris?  Context is improving gfn_to_pfn() on the mmio path.
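
[Editor's note: the pfnmap rule under discussion, as a one-function
sketch (illustrative): for a *linear* VM_PFNMAP mapping, vm_pgoff holds
the first pfn of the region, so the pfn backing an address falls out of
arithmetic. As noted above, drivers that remap IO memory often set
vm_pgoff to 0, which breaks this assumption.]

	static unsigned long pfnmap_addr_to_pfn(struct vm_area_struct *vma,
						unsigned long addr)
	{
		return ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
	}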


--
error compiling committee.c: too many arguments to function



[PATCH 03/47] KVM: MMU: fix bogus alloc_mmu_pages assignment

2009-08-26 Thread Avi Kivity
From: Marcelo Tosatti mtosa...@redhat.com

Remove the bogus n_free_mmu_pages assignment from alloc_mmu_pages.

It breaks accounting of mmu pages, since n_free_mmu_pages is modified
but the real number of pages remains the same.

Signed-off-by: Marcelo Tosatti mtosa...@redhat.com
Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/mmu.c |8 
 1 files changed, 0 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 28be35c..6f38178 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2786,14 +2786,6 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
 
ASSERT(vcpu);
 
-	spin_lock(&vcpu->kvm->mmu_lock);
-	if (vcpu->kvm->arch.n_requested_mmu_pages)
-		vcpu->kvm->arch.n_free_mmu_pages =
-			vcpu->kvm->arch.n_requested_mmu_pages;
-	else
-		vcpu->kvm->arch.n_free_mmu_pages =
-			vcpu->kvm->arch.n_alloc_mmu_pages;
-	spin_unlock(&vcpu->kvm->mmu_lock);
/*
 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
 * Therefore we need to allocate shadow page tables in the first
-- 
1.6.4.1



[PATCH 04/47] KVM: x86: Disallow hypercalls for guest callers in rings > 0

2009-08-26 Thread Avi Kivity
From: Jan Kiszka jan.kis...@siemens.com

So far unprivileged guest callers running in ring 3 can issue, e.g., MMU
hypercalls. Normally, such callers cannot provide any hand-crafted MMU
command structure as it has to be passed by its physical address, but
they can still crash the guest kernel by passing random addresses.

To close the hole, this patch considers hypercalls valid only if issued
from guest ring 0. This may still be relaxed on a per-hypercall base in
the future once required.

Cc: sta...@kernel.org
Signed-off-by: Jan Kiszka jan.kis...@siemens.com
Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/x86.c   |6 ++
 include/linux/kvm_para.h |1 +
 2 files changed, 7 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fa525d5..92b5edd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3213,6 +3213,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
		a3 &= 0xFFFFFFFF;
	}
 
+	if (kvm_x86_ops->get_cpl(vcpu) != 0) {
+		ret = -KVM_EPERM;
+		goto out;
+	}
+
	switch (nr) {
	case KVM_HC_VAPIC_POLL_IRQ:
		ret = 0;
@@ -3224,6 +3229,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
		ret = -KVM_ENOSYS;
		break;
	}
+out:
	kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
	++vcpu->stat.hypercalls;
	return r;
diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h
index 3ddce03..d731092 100644
--- a/include/linux/kvm_para.h
+++ b/include/linux/kvm_para.h
@@ -13,6 +13,7 @@
 #define KVM_ENOSYS 1000
 #define KVM_EFAULT EFAULT
 #define KVM_E2BIG  E2BIG
+#define KVM_EPERM  EPERM
 
 #define KVM_HC_VAPIC_POLL_IRQ  1
 #define KVM_HC_MMU_OP  2
-- 
1.6.4.1



[PATCH 18/47] KVM: SVM: cache nested intercepts

2009-08-26 Thread Avi Kivity
From: Joerg Roedel joerg.roe...@amd.com

When the nested intercepts are cached we don't need to call
get_user_pages and/or map the nested vmcb on every nested #vmexit to
check who will handle the intercept.
Further, this patch aligns the emulated SVM behavior better with real
hardware.

Signed-off-by: Joerg Roedel joerg.roe...@amd.com
Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/svm.c |   30 +++---
 1 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index fbadaa7..4426c63 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -80,6 +80,15 @@ struct nested_state {
 
/* gpa pointers to the real vectors */
u64 vmcb_msrpm;
+
+   /* cache for intercepts of the guest */
+   u16 intercept_cr_read;
+   u16 intercept_cr_write;
+   u16 intercept_dr_read;
+   u16 intercept_dr_write;
+   u32 intercept_exceptions;
+   u64 intercept;
+
 };
 
 struct vcpu_svm {
@@ -1452,7 +1461,6 @@ static int nested_svm_exit_handled_real(struct vcpu_svm *svm,
					void *arg2,
					void *opaque)
 {
-	struct vmcb *nested_vmcb = (struct vmcb *)arg1;
	bool kvm_overrides = *(bool *)opaque;
	u32 exit_code = svm->vmcb->control.exit_code;
 
@@ -1479,38 +1487,38 @@ static int nested_svm_exit_handled_real(struct vcpu_svm *svm,
	switch (exit_code) {
	case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: {
		u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0);
-		if (nested_vmcb->control.intercept_cr_read & cr_bits)
+		if (svm->nested.intercept_cr_read & cr_bits)
			return 1;
		break;
	}
	case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: {
		u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0);
-		if (nested_vmcb->control.intercept_cr_write & cr_bits)
+		if (svm->nested.intercept_cr_write & cr_bits)
			return 1;
		break;
	}
	case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: {
		u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0);
-		if (nested_vmcb->control.intercept_dr_read & dr_bits)
+		if (svm->nested.intercept_dr_read & dr_bits)
			return 1;
		break;
	}
	case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: {
		u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0);
-		if (nested_vmcb->control.intercept_dr_write & dr_bits)
+		if (svm->nested.intercept_dr_write & dr_bits)
			return 1;
		break;
	}
	case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
		u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
-		if (nested_vmcb->control.intercept_exceptions & excp_bits)
+		if (svm->nested.intercept_exceptions & excp_bits)
			return 1;
		break;
	}
	default: {
		u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
		nsvm_printk("exit code: 0x%x\n", exit_code);
-		if (nested_vmcb->control.intercept & exit_bits)
+		if (svm->nested.intercept & exit_bits)
			return 1;
	}
	}
@@ -1801,6 +1809,14 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1,
 
	svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa;
 
+	/* cache intercepts */
+	svm->nested.intercept_cr_read    = nested_vmcb->control.intercept_cr_read;
+	svm->nested.intercept_cr_write   = nested_vmcb->control.intercept_cr_write;
+	svm->nested.intercept_dr_read    = nested_vmcb->control.intercept_dr_read;
+	svm->nested.intercept_dr_write   = nested_vmcb->control.intercept_dr_write;
+	svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
+	svm->nested.intercept            = nested_vmcb->control.intercept;
+
	force_new_asid(&svm->vcpu);
	svm->vmcb->control.exit_int_info = nested_vmcb->control.exit_int_info;
	svm->vmcb->control.exit_int_info_err = nested_vmcb->control.exit_int_info_err;
-- 
1.6.4.1



[PATCH 22/47] KVM: SVM: get rid of nested_svm_vmexit_real

2009-08-26 Thread Avi Kivity
From: Joerg Roedel joerg.roe...@amd.com

This patch is the starting point of removing nested_svm_do from the
nested svm code. The nested_svm_do function basically maps two guest
physical pages to host virtual addresses and calls a passed function
on it. This function pointer code flow is hard to read and not the
best technical solution here.
As a side effect this patch introduces the nested_svm_[un]map helper
functions.

Signed-off-by: Joerg Roedel joerg.roe...@amd.com
Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/svm.c |   52 
 1 files changed, 40 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 67fad66..5e55a1b 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1390,6 +1390,39 @@ static inline int nested_svm_intr(struct vcpu_svm *svm)
return 0;
 }
 
+static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx)
+{
+	struct page *page;
+
+	down_read(&current->mm->mmap_sem);
+	page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
+	up_read(&current->mm->mmap_sem);
+
+	if (is_error_page(page))
+		goto error;
+
+	return kmap_atomic(page, idx);
+
+error:
+	kvm_release_page_clean(page);
+	kvm_inject_gp(&svm->vcpu, 0);
+
+	return NULL;
+}
+
+static void nested_svm_unmap(void *addr, enum km_type idx)
+{
+	struct page *page;
+
+	if (!addr)
+		return;
+
+	page = kmap_atomic_to_page(addr);
+
+	kunmap_atomic(addr, idx);
+	kvm_release_page_dirty(page);
+}
+
 static struct page *nested_svm_get_page(struct vcpu_svm *svm, u64 gpa)
 {
struct page *page;
@@ -1597,13 +1630,16 @@ static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb)
	dst->lbr_ctl  = from->lbr_ctl;
 }
 
-static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1,
-				  void *arg2, void *opaque)
+static int nested_svm_vmexit(struct vcpu_svm *svm)
 {
-	struct vmcb *nested_vmcb = (struct vmcb *)arg1;
+	struct vmcb *nested_vmcb;
	struct vmcb *hsave = svm->nested.hsave;
	struct vmcb *vmcb = svm->vmcb;
 
+	nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0);
+	if (!nested_vmcb)
+		return 1;
+
	/* Give the current vmcb to the guest */
	disable_gif(svm);
 
@@ -1678,15 +1714,7 @@ static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1,
	/* Exit nested SVM mode */
	svm->nested.vmcb = 0;
 
-	return 0;
-}
-
-static int nested_svm_vmexit(struct vcpu_svm *svm)
-{
-	nsvm_printk("VMexit\n");
-	if (nested_svm_do(svm, svm->nested.vmcb, 0,
-			  NULL, nested_svm_vmexit_real))
-		return 1;
+	nested_svm_unmap(nested_vmcb, KM_USER0);
 
	kvm_mmu_reset_context(&svm->vcpu);
	kvm_mmu_load(&svm->vcpu);
-- 
1.6.4.1



Re: [PATCH 0/2] eventfd: new EFD_STATE flag

2009-08-26 Thread Michael S. Tsirkin
On Tue, Aug 25, 2009 at 02:57:01PM -0700, Davide Libenzi wrote:
 On Tue, 25 Aug 2009, Michael S. Tsirkin wrote:
 
  Yes, we don't want that. The best thing is to try to restate the problem
  in a way that is generic, and then either solve or best use existing
  solution. Right?
  
  I thought I had that, but apparently not.  The reason I'm Cc-ing you is
  not to try and spam you until you give up and accept the patch, it's
  hoping that you see the pattern behind our usage, and help generalize
  it.
  
  If I understand it correctly, you believe this is not possible and so
  any solution will have to be in KVM? Or maybe I didn't state the problem
  clearly enough and should restate it?
 
 Please do.
 
 
 
 - Davide


Problem looks like this:

There are multiple processes (devices) where each has a condition
(interrupt line) which it has logic to determine is either true or
false.

A single other process (hypervisor) is interested in a condition
(interrupt level) which is a logical OR of all interrupt lines.
On changes, an interrupt level value needs to be read and copied to
guest virtual cpu.

We also want ability to replace some or all processes above by a kernel
components, with condition changes done potentially from hardware
interrupt context.


How we wanted to solve it with EFD_STATE: Share a separate eventfd
between each device and the hypervisor.  device sets state to either 0
or 1.  hypervisor polls all eventfds, reads interrupt line on changes,
calculates the interrupt level and updates guest.

Alternative solution: shared memory where each device writes interrupt
line value. This makes setup more complex (need to share around much more
than just an fd), and makes access from interrupt impossible unless we
lock the memory (and locking userspace memory introduces yet another set
of issues).
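
[Editor's note: a purely hypothetical sketch of the EFD_STATE usage
described above. EFD_STATE was only a proposal on this thread and never
became a real eventfd flag; the semantics shown (last write wins, read
returns the current state) are the ones being asked for, not an
existing API.]

	/* device side: publish the current interrupt line value (0 or 1) */
	void set_line(int fd, uint64_t level)
	{
		write(fd, &level, sizeof(level));	/* proposed: latest write wins */
	}

	/* hypervisor side, after poll() reports the fd readable */
	int read_level(int fd)
	{
		uint64_t level;

		read(fd, &level, sizeof(level));	/* proposed: returns state */
		return level != 0;
	}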


-- 
MST


[PATCH 21/47] KVM: SVM: simplify nested_svm_check_exception

2009-08-26 Thread Avi Kivity
From: Joerg Roedel joerg.roe...@amd.com

Makes the code of this function more readable by removing one
indentation level for the core logic.

Signed-off-by: Joerg Roedel joerg.roe...@amd.com
Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/svm.c |   19 ---
 1 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 3bb6d4b..67fad66 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1359,18 +1359,15 @@ static int nested_svm_check_permissions(struct vcpu_svm *svm)
 static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
				      bool has_error_code, u32 error_code)
 {
-	if (is_nested(svm)) {
-		svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
-		svm->vmcb->control.exit_code_hi = 0;
-		svm->vmcb->control.exit_info_1 = error_code;
-		svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
-		if (nested_svm_exit_handled(svm, false)) {
-			nsvm_printk("VMexit -> EXCP 0x%x\n", nr);
-			return 1;
-		}
-	}
+	if (!is_nested(svm))
+		return 0;
 
-	return 0;
+	svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
+	svm->vmcb->control.exit_code_hi = 0;
+	svm->vmcb->control.exit_info_1 = error_code;
+	svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
+
+	return nested_svm_exit_handled(svm, false);
 }
 
 static inline int nested_svm_intr(struct vcpu_svm *svm)
-- 
1.6.4.1



[PATCH 31/47] KVM: SVM: check for nested VINTR flag in svm_interrupt_allowed

2009-08-26 Thread Avi Kivity
From: Joerg Roedel joerg.roe...@amd.com

Not checking for this flag breaks any nested hypervisor that does not
set VINTR. So fix it with this patch.

Signed-off-by: Joerg Roedel joerg.roe...@amd.com
Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/svm.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ff04a4b..825035e 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2466,7 +2466,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
	return (vmcb->save.rflags & X86_EFLAGS_IF) &&
		!(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
		gif_set(svm) &&
-		!is_nested(svm);
+		!(is_nested(svm) && (svm->vcpu.arch.hflags & HF_VINTR_MASK));
 }
 
 static void enable_irq_window(struct kvm_vcpu *vcpu)
-- 
1.6.4.1



[PATCH 26/47] KVM: SVM: remove nested_svm_do and helper functions

2009-08-26 Thread Avi Kivity
From: Joerg Roedel joerg.roe...@amd.com

This function is no longer required. So remove it.

Signed-off-by: Joerg Roedel joerg.roe...@amd.com
Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/svm.c |   60 
 1 files changed, 0 insertions(+), 60 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1a915f3..42b8b67 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1421,66 +1421,6 @@ static void nested_svm_unmap(void *addr, enum km_type idx)
	kvm_release_page_dirty(page);
 }
 
-static struct page *nested_svm_get_page(struct vcpu_svm *svm, u64 gpa)
-{
-	struct page *page;
-
-	down_read(&current->mm->mmap_sem);
-	page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
-	up_read(&current->mm->mmap_sem);
-
-	if (is_error_page(page)) {
-		printk(KERN_INFO "%s: could not find page at 0x%llx\n",
-		       __func__, gpa);
-		kvm_release_page_clean(page);
-		kvm_inject_gp(&svm->vcpu, 0);
-		return NULL;
-	}
-	return page;
-}
-
-static int nested_svm_do(struct vcpu_svm *svm,
-			 u64 arg1_gpa, u64 arg2_gpa, void *opaque,
-			 int (*handler)(struct vcpu_svm *svm,
-					void *arg1,
-					void *arg2,
-					void *opaque))
-{
-	struct page *arg1_page;
-	struct page *arg2_page = NULL;
-	void *arg1;
-	void *arg2 = NULL;
-	int retval;
-
-	arg1_page = nested_svm_get_page(svm, arg1_gpa);
-	if(arg1_page == NULL)
-		return 1;
-
-	if (arg2_gpa) {
-		arg2_page = nested_svm_get_page(svm, arg2_gpa);
-		if(arg2_page == NULL) {
-			kvm_release_page_clean(arg1_page);
-			return 1;
-		}
-	}
-
-	arg1 = kmap_atomic(arg1_page, KM_USER0);
-	if (arg2_gpa)
-		arg2 = kmap_atomic(arg2_page, KM_USER1);
-
-	retval = handler(svm, arg1, arg2, opaque);
-
-	kunmap_atomic(arg1, KM_USER0);
-	if (arg2_gpa)
-		kunmap_atomic(arg2, KM_USER1);
-
-	kvm_release_page_dirty(arg1_page);
-	if (arg2_gpa)
-		kvm_release_page_dirty(arg2_page);
-
-	return retval;
-}
-
 static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm)
 {
	u32 param = svm->vmcb->control.exit_info_1 & 1;
-- 
1.6.4.1



[PATCH 47/47] KVM: Document KVM_CAP_IRQCHIP

2009-08-26 Thread Avi Kivity
Signed-off-by: Avi Kivity a...@redhat.com
---
 Documentation/kvm/api.txt |   76 +
 1 files changed, 76 insertions(+), 0 deletions(-)
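
[Editor's note: a minimal userspace sketch of the ioctls documented
below (error handling elided; assumes vmfd was obtained with
KVM_CREATE_VM). KVM_CREATE_IRQCHIP is issued once per VM, then
KVM_IRQ_LINE drives a GSI; edge-triggered interrupts need a 1-then-0
pulse, as the text notes.]

	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	void setup_irqchip(int vmfd)
	{
		ioctl(vmfd, KVM_CREATE_IRQCHIP, 0);	/* once per VM */
	}

	void pulse_gsi(int vmfd, unsigned int gsi)
	{
		struct kvm_irq_level irq = { .irq = gsi, .level = 1 };

		ioctl(vmfd, KVM_IRQ_LINE, &irq);	/* raise the line */
		irq.level = 0;
		ioctl(vmfd, KVM_IRQ_LINE, &irq);	/* complete the edge */
	}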

diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index 1b1c22d..5a4bc8c 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -517,6 +517,82 @@ struct kvm_fpu {
__u32 pad2;
 };
 
+4.23 KVM_CREATE_IRQCHIP
+
+Capability: KVM_CAP_IRQCHIP
+Architectures: x86, ia64
+Type: vm ioctl
+Parameters: none
+Returns: 0 on success, -1 on error
+
+Creates an interrupt controller model in the kernel.  On x86, creates a virtual
+ioapic, a virtual PIC (two PICs, nested), and sets up future vcpus to have a
+local APIC.  IRQ routing for GSIs 0-15 is set to both PIC and IOAPIC; GSI 16-23
+only go to the IOAPIC.  On ia64, an IOSAPIC is created.
+
+4.24 KVM_IRQ_LINE
+
+Capability: KVM_CAP_IRQCHIP
+Architectures: x86, ia64
+Type: vm ioctl
+Parameters: struct kvm_irq_level
+Returns: 0 on success, -1 on error
+
+Sets the level of a GSI input to the interrupt controller model in the kernel.
+Requires that an interrupt controller model has been previously created with
+KVM_CREATE_IRQCHIP.  Note that edge-triggered interrupts require the level
+to be set to 1 and then back to 0.
+
+struct kvm_irq_level {
+   union {
+   __u32 irq; /* GSI */
+   __s32 status;  /* not used for KVM_IRQ_LEVEL */
+   };
+   __u32 level;   /* 0 or 1 */
+};
+
+4.25 KVM_GET_IRQCHIP
+
+Capability: KVM_CAP_IRQCHIP
+Architectures: x86, ia64
+Type: vm ioctl
+Parameters: struct kvm_irqchip (in/out)
+Returns: 0 on success, -1 on error
+
+Reads the state of a kernel interrupt controller created with
+KVM_CREATE_IRQCHIP into a buffer provided by the caller.
+
+struct kvm_irqchip {
+   __u32 chip_id;  /* 0 = PIC1, 1 = PIC2, 2 = IOAPIC */
+   __u32 pad;
+union {
+   char dummy[512];  /* reserving space */
+   struct kvm_pic_state pic;
+   struct kvm_ioapic_state ioapic;
+   } chip;
+};
+
+4.26 KVM_SET_IRQCHIP
+
+Capability: KVM_CAP_IRQCHIP
+Architectures: x86, ia64
+Type: vm ioctl
+Parameters: struct kvm_irqchip (in)
+Returns: 0 on success, -1 on error
+
+Sets the state of a kernel interrupt controller created with
+KVM_CREATE_IRQCHIP from a buffer provided by the caller.
+
+struct kvm_irqchip {
+   __u32 chip_id;  /* 0 = PIC1, 1 = PIC2, 2 = IOAPIC */
+   __u32 pad;
+union {
+   char dummy[512];  /* reserving space */
+   struct kvm_pic_state pic;
+   struct kvm_ioapic_state ioapic;
+   } chip;
+};
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
-- 
1.6.4.1



[PATCH 43/47] KVM: x86 emulator: Add adc and sbb missing decoder flags

2009-08-26 Thread Avi Kivity
From: Mohammed Gamal m.gamal...@gmail.com

Add missing decoder flags for adc and sbb instructions
(opcodes 0x14-0x15, 0x1c-0x1d)

Signed-off-by: Mohammed Gamal m.gamal...@gmail.com
Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/emulate.c |4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 2eb807a..1be5cd6 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -100,11 +100,11 @@ static u32 opcode_table[256] = {
/* 0x10 - 0x17 */
ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-   0, 0, 0, 0,
+   ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0,
/* 0x18 - 0x1F */
ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-   0, 0, 0, 0,
+   ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0,
/* 0x20 - 0x27 */
ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-- 
1.6.4.1



[PATCH 45/47] KVM: VMX: Fix EPT with WP bit change during paging

2009-08-26 Thread Avi Kivity
From: Sheng Yang sh...@linux.intel.com

QNX updates the WP bit when paging is enabled, a case that was not covered
yet. This fixes QNX boot with EPT.

Cc: sta...@kernel.org
Signed-off-by: Sheng Yang sh...@linux.intel.com
Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/vmx.c |6 +++---
 1 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2b7e7bd..1ee811c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1642,7 +1642,6 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
			      CPU_BASED_CR3_STORE_EXITING));
		vcpu->arch.cr0 = cr0;
		vmx_set_cr4(vcpu, vcpu->arch.cr4);
-		*hw_cr0 &= ~X86_CR0_WP;
	} else if (!is_paging(vcpu)) {
		/* From nonpaging to paging */
		vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
@@ -1651,9 +1650,10 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
			      CPU_BASED_CR3_STORE_EXITING));
		vcpu->arch.cr0 = cr0;
		vmx_set_cr4(vcpu, vcpu->arch.cr4);
-		if (!(vcpu->arch.cr0 & X86_CR0_WP))
-			*hw_cr0 &= ~X86_CR0_WP;
	}
+
+	if (!(cr0 & X86_CR0_WP))
+		*hw_cr0 &= ~X86_CR0_WP;
 }
 
 static void ept_update_paging_mode_cr4(unsigned long *hw_cr4,
-- 
1.6.4.1



[PATCH 38/47] KVM: Rename x86_emulate.c to emulate.c

2009-08-26 Thread Avi Kivity
We're in arch/x86, what could we possibly be emulating?

Signed-off-by: Avi Kivity a...@redhat.com
---
 .../asm/{kvm_x86_emulate.h => kvm_emulate.h}       |    0
 arch/x86/include/asm/kvm_host.h                    |    2 +-
 arch/x86/kvm/Makefile                              |    2 +-
 arch/x86/kvm/{x86_emulate.c => emulate.c}          |    4 ++--
 arch/x86/kvm/x86.c                                 |    2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)
 rename arch/x86/include/asm/{kvm_x86_emulate.h => kvm_emulate.h} (100%)
 rename arch/x86/kvm/{x86_emulate.c => emulate.c} (99%)

diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_emulate.h
similarity index 100%
rename from arch/x86/include/asm/kvm_x86_emulate.h
rename to arch/x86/include/asm/kvm_emulate.h
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b17d845..33901be 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -138,7 +138,7 @@ enum {
VCPU_SREG_LDTR,
 };
 
-#include <asm/kvm_x86_emulate.h>
+#include <asm/kvm_emulate.h>
 
 #define KVM_NR_MEM_OBJS 40
 
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index afaaa76..0e7fe78 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -9,7 +9,7 @@ kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
				coalesced_mmio.o irq_comm.o eventfd.o)
 kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
 
-kvm-y			+= x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \
+kvm-y			+= x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
			   i8254.o timer.o
 kvm-intel-y		+= vmx.o
 kvm-amd-y		+= svm.o
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/emulate.c
similarity index 99%
rename from arch/x86/kvm/x86_emulate.c
rename to arch/x86/kvm/emulate.c
index c6663d4..2eb807a 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1,5 +1,5 @@
 /**
- * x86_emulate.c
+ * emulate.c
  *
  * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
  *
@@ -30,7 +30,7 @@
 #define DPRINTF(x...) do {} while (0)
 #endif
 #include <linux/module.h>
-#include <asm/kvm_x86_emulate.h>
+#include <asm/kvm_emulate.h>
 
 #include "mmu.h"		/* for is_long_mode() */
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1aa7e6d..c0e9427 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2759,7 +2759,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
	kvm_clear_exception_queue(vcpu);
	vcpu->arch.mmio_fault_cr2 = cr2;
	/*
-	 * TODO: fix x86_emulate.c to use guest_read/write_register
+	 * TODO: fix emulate.c to use guest_read/write_register
	 * instead of direct ->regs accesses, can save hundred cycles
	 * on Intel for instructions that don't read/change RSP, for
	 * for example.
-- 
1.6.4.1



[PATCH 39/47] Documentation: Update KVM list email address

2009-08-26 Thread Avi Kivity
From: Amit Shah amit.s...@redhat.com

The KVM list moved to vger.kernel.org last year

Signed-off-by: Amit Shah amit.s...@redhat.com
Signed-off-by: Avi Kivity a...@redhat.com
---
 Documentation/ioctl/ioctl-number.txt |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/Documentation/ioctl/ioctl-number.txt 
b/Documentation/ioctl/ioctl-number.txt
index 7bb0d93..3223e12 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -191,7 +191,7 @@ Code	Seq#	Include File		Comments
 0xAD	00	Netfilter device	in development:
					<mailto:ru...@rustcorp.com.au>
 0xAE	all	linux/kvm.h		Kernel-based Virtual Machine
-					<mailto:kvm-de...@lists.sourceforge.net>
+					<mailto:kvm@vger.kernel.org>
 0xB0	all	RATIO devices		in development:
					<mailto:v...@ratio.de>
 0xB1	00-1F	PPPoX			<mailto:mostr...@styx.uwaterloo.ca>
-- 
1.6.4.1



[PATCH 28/47] KVM: SVM: move special nested exit handling to separate function

2009-08-26 Thread Avi Kivity
From: Joerg Roedel joerg.roe...@amd.com

This patch moves the handling for special nested vmexits like #pf to a
separate function. This makes the kvm_override parameter obsolete and
makes the code more readable.

Signed-off-by: Joerg Roedel joerg.roe...@amd.com
Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/svm.c |   80 ---
 1 files changed, 50 insertions(+), 30 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 2edf2dd..e9e3931 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -47,6 +47,10 @@ MODULE_LICENSE("GPL");
 #define SVM_FEATURE_LBRV (1 << 1)
 #define SVM_FEATURE_SVML (1 << 2)
 
+#define NESTED_EXIT_HOST	0	/* Exit handled on host level */
+#define NESTED_EXIT_DONE	1	/* Exit caused nested vmexit  */
+#define NESTED_EXIT_CONTINUE	2	/* Further checks needed      */
+
 #define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
 
 /* Turn on to get debugging output*/
@@ -126,7 +130,7 @@ module_param(nested, int, S_IRUGO);
 static void svm_flush_tlb(struct kvm_vcpu *vcpu);
 static void svm_complete_interrupts(struct vcpu_svm *svm);
 
-static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override);
+static int nested_svm_exit_handled(struct vcpu_svm *svm);
 static int nested_svm_vmexit(struct vcpu_svm *svm);
 static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
				      bool has_error_code, u32 error_code);
@@ -1365,7 +1369,7 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
	svm->vmcb->control.exit_info_1 = error_code;
	svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
 
-	return nested_svm_exit_handled(svm, false);
+	return nested_svm_exit_handled(svm);
 }
 
 static inline int nested_svm_intr(struct vcpu_svm *svm)
@@ -1379,7 +1383,7 @@ static inline int nested_svm_intr(struct vcpu_svm *svm)
 
	svm->vmcb->control.exit_code = SVM_EXIT_INTR;
 
-	if (nested_svm_exit_handled(svm, false)) {
+	if (nested_svm_exit_handled(svm)) {
		nsvm_printk("VMexit -> INTR\n");
		return 1;
	}
@@ -1468,31 +1472,39 @@ out:
return ret;
 }
 
-static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override)
+static int nested_svm_exit_special(struct vcpu_svm *svm)
 {
u32 exit_code = svm-vmcb-control.exit_code;
-   bool vmexit = false;
 
-   if (kvm_override) {
-   switch (exit_code) {
-   case SVM_EXIT_INTR:
-   case SVM_EXIT_NMI:
-   return 0;
+   switch (exit_code) {
+   case SVM_EXIT_INTR:
+   case SVM_EXIT_NMI:
+   return NESTED_EXIT_HOST;
/* For now we are always handling NPFs when using them */
-   case SVM_EXIT_NPF:
-   if (npt_enabled)
-   return 0;
-   break;
-   /* When we're shadowing, trap PFs */
-   case SVM_EXIT_EXCP_BASE + PF_VECTOR:
-   if (!npt_enabled)
-   return 0;
-   break;
-   default:
-   break;
-   }
+   case SVM_EXIT_NPF:
+   if (npt_enabled)
+   return NESTED_EXIT_HOST;
+   break;
+   /* When we're shadowing, trap PFs */
+   case SVM_EXIT_EXCP_BASE + PF_VECTOR:
+   if (!npt_enabled)
+   return NESTED_EXIT_HOST;
+   break;
+   default:
+   break;
}
 
+   return NESTED_EXIT_CONTINUE;
+}
+
+/*
+ * If this function returns true, this #vmexit was already handled
+ */
+static int nested_svm_exit_handled(struct vcpu_svm *svm)
+{
+   u32 exit_code = svm-vmcb-control.exit_code;
+   int vmexit = NESTED_EXIT_HOST;
+
switch (exit_code) {
case SVM_EXIT_MSR:
vmexit = nested_svm_exit_handled_msr(svm);
@@ -1500,42 +1512,42 @@ static int nested_svm_exit_handled(struct vcpu_svm 
*svm, bool kvm_override)
case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: {
u32 cr_bits = 1  (exit_code - SVM_EXIT_READ_CR0);
if (svm-nested.intercept_cr_read  cr_bits)
-   vmexit = true;
+   vmexit = NESTED_EXIT_DONE;
break;
}
case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: {
u32 cr_bits = 1  (exit_code - SVM_EXIT_WRITE_CR0);
if (svm-nested.intercept_cr_write  cr_bits)
-   vmexit = true;
+   vmexit = NESTED_EXIT_DONE;
break;
}
case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: {
u32 dr_bits = 1  (exit_code - SVM_EXIT_READ_DR0);
if (svm-nested.intercept_dr_read  dr_bits)
-   

[PATCH 24/47] KVM: SVM: clean up nested vmload/vmsave paths

2009-08-26 Thread Avi Kivity
From: Joerg Roedel joerg.roe...@amd.com

This patch removes the usage of nested_svm_do from the vmload and
vmsave emulation code paths.

Signed-off-by: Joerg Roedel joerg.roe...@amd.com
Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/svm.c |   36 +---
 1 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 518d578..419e3fa 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -128,8 +128,6 @@ static void svm_complete_interrupts(struct vcpu_svm *svm);
 
 static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override);
 static int nested_svm_vmexit(struct vcpu_svm *svm);
-static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb,
-void *arg2, void *opaque);
 static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
  bool has_error_code, u32 error_code);
 
@@ -1868,7 +1866,7 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void 
*arg1,
return 0;
 }
 
-static int nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
+static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
 {
to_vmcb-save.fs = from_vmcb-save.fs;
to_vmcb-save.gs = from_vmcb-save.gs;
@@ -1882,44 +1880,44 @@ static int nested_svm_vmloadsave(struct vmcb 
*from_vmcb, struct vmcb *to_vmcb)
to_vmcb-save.sysenter_cs = from_vmcb-save.sysenter_cs;
to_vmcb-save.sysenter_esp = from_vmcb-save.sysenter_esp;
to_vmcb-save.sysenter_eip = from_vmcb-save.sysenter_eip;
-
-   return 1;
-}
-
-static int nested_svm_vmload(struct vcpu_svm *svm, void *nested_vmcb,
-void *arg2, void *opaque)
-{
-   return nested_svm_vmloadsave((struct vmcb *)nested_vmcb, svm-vmcb);
-}
-
-static int nested_svm_vmsave(struct vcpu_svm *svm, void *nested_vmcb,
-void *arg2, void *opaque)
-{
-   return nested_svm_vmloadsave(svm-vmcb, (struct vmcb *)nested_vmcb);
 }
 
 static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
+   struct vmcb *nested_vmcb;
+
if (nested_svm_check_permissions(svm))
return 1;
 
svm-next_rip = kvm_rip_read(svm-vcpu) + 3;
skip_emulated_instruction(svm-vcpu);
 
-   nested_svm_do(svm, svm-vmcb-save.rax, 0, NULL, nested_svm_vmload);
+   nested_vmcb = nested_svm_map(svm, svm-vmcb-save.rax, KM_USER0);
+   if (!nested_vmcb)
+   return 1;
+
+   nested_svm_vmloadsave(nested_vmcb, svm-vmcb);
+   nested_svm_unmap(nested_vmcb, KM_USER0);
 
return 1;
 }
 
 static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
+   struct vmcb *nested_vmcb;
+
if (nested_svm_check_permissions(svm))
return 1;
 
svm-next_rip = kvm_rip_read(svm-vcpu) + 3;
skip_emulated_instruction(svm-vcpu);
 
-   nested_svm_do(svm, svm-vmcb-save.rax, 0, NULL, nested_svm_vmsave);
+   nested_vmcb = nested_svm_map(svm, svm-vmcb-save.rax, KM_USER0);
+   if (!nested_vmcb)
+   return 1;
+
+   nested_svm_vmloadsave(svm-vmcb, nested_vmcb);
+   nested_svm_unmap(nested_vmcb, KM_USER0);
 
return 1;
 }
-- 
1.6.4.1



[PATCH 29/47] KVM: SVM: remove unnecessary is_nested check from svm_cpu_run

2009-08-26 Thread Avi Kivity
From: Joerg Roedel joerg.roe...@amd.com

This check is not necessary: vcpu->arch.cr2 always has to be synced
back to the VMCB. This patch removes the is_nested check.

Signed-off-by: Joerg Roedel joerg.roe...@amd.com
Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/svm.c |3 +--
 1 files changed, 1 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e9e3931..f275d77 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2605,8 +2605,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct 
kvm_run *kvm_run)
fs_selector = kvm_read_fs();
gs_selector = kvm_read_gs();
ldt_selector = kvm_read_ldt();
-   if (!is_nested(svm))
-   svm-vmcb-save.cr2 = vcpu-arch.cr2;
+   svm-vmcb-save.cr2 = vcpu-arch.cr2;
/* required for live migration with NPT */
if (npt_enabled)
svm-vmcb-save.cr3 = vcpu-arch.cr3;
-- 
1.6.4.1



[PATCH 35/47] KVM: Move #endif KVM_CAP_IRQ_ROUTING to correct place

2009-08-26 Thread Avi Kivity
The symbol only controls irq routing, not MSI-X.

Signed-off-by: Avi Kivity a...@redhat.com
---
 virt/kvm/kvm_main.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)
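
A minimal sketch of the corrected guard structure (case bodies elided;
KVM_SET_GSI_ROUTING is inferred from the surrounding context):

	switch (ioctl) {
#ifdef KVM_CAP_IRQ_ROUTING
	case KVM_SET_GSI_ROUTING: {
		/* irq-routing ioctls stay under KVM_CAP_IRQ_ROUTING */
		break;
	}
#endif /* KVM_CAP_IRQ_ROUTING */
#ifdef __KVM_HAVE_MSIX
	case KVM_ASSIGN_SET_MSIX_NR: {
		/* MSI-X ioctls no longer depend on irq routing */
		break;
	}
#endif
	default:
		break;
	}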

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4470251..1df4c04 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2236,6 +2236,7 @@ static long kvm_vm_ioctl(struct file *filp,
vfree(entries);
break;
}
+#endif /* KVM_CAP_IRQ_ROUTING */
 #ifdef __KVM_HAVE_MSIX
case KVM_ASSIGN_SET_MSIX_NR: {
struct kvm_assigned_msix_nr entry_nr;
@@ -2258,7 +2259,6 @@ static long kvm_vm_ioctl(struct file *filp,
break;
}
 #endif
-#endif /* KVM_CAP_IRQ_ROUTING */
case KVM_IRQFD: {
struct kvm_irqfd data;
 
-- 
1.6.4.1



[PATCH 16/47] KVM: SVM: complete interrupts after handling nested exits

2009-08-26 Thread Avi Kivity
From: Joerg Roedel joerg.roe...@amd.com

The interrupt completion code must run after nested exits are handled,
because interrupts or exceptions that were not injected may need to be
handled by the L1 guest first.

Signed-off-by: Joerg Roedel joerg.roe...@amd.com
Acked-by: Alexander Graf ag...@suse.de
Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/svm.c |5 +++--
 1 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index df795bc..825b825 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -111,6 +111,7 @@ static int nested = 0;
 module_param(nested, int, S_IRUGO);
 
 static void svm_flush_tlb(struct kvm_vcpu *vcpu);
+static void svm_complete_interrupts(struct vcpu_svm *svm);
 
 static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override);
 static int nested_svm_vmexit(struct vcpu_svm *svm);
@@ -2324,6 +2325,8 @@ static int handle_exit(struct kvm_run *kvm_run, struct 
kvm_vcpu *vcpu)
}
}
 
+   svm_complete_interrupts(svm);
+
if (npt_enabled) {
int mmu_reload = 0;
if ((vcpu-arch.cr0 ^ svm-vmcb-save.cr0)  X86_CR0_PG) {
@@ -2690,8 +2693,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct 
kvm_run *kvm_run)
vcpu-arch.regs_avail = ~(1  VCPU_EXREG_PDPTR);
vcpu-arch.regs_dirty = ~(1  VCPU_EXREG_PDPTR);
}
-
-   svm_complete_interrupts(svm);
 }
 
 #undef R
-- 
1.6.4.1



[PATCH 27/47] KVM: SVM: handle errors in vmrun emulation path appropriately

2009-08-26 Thread Avi Kivity
From: Joerg Roedel joerg.roe...@amd.com

If nested SVM fails to load the msrpm, the vmrun succeeds with the old
msrpm, which is not correct. This patch changes the logic to roll back
to host mode if the msrpm cannot be loaded.

Signed-off-by: Joerg Roedel joerg.roe...@amd.com
Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/svm.c |   14 +-
 1 files changed, 13 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 42b8b67..2edf2dd 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1877,6 +1877,7 @@ static int vmsave_interception(struct vcpu_svm *svm, 
struct kvm_run *kvm_run)
 static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
nsvm_printk(VMrun\n);
+
if (nested_svm_check_permissions(svm))
return 1;
 
@@ -1887,7 +1888,18 @@ static int vmrun_interception(struct vcpu_svm *svm, 
struct kvm_run *kvm_run)
return 1;
 
if (!nested_svm_vmrun_msrpm(svm))
-   return 1;
+   goto failed;
+
+   return 1;
+
+failed:
+
+   svm-vmcb-control.exit_code= SVM_EXIT_ERR;
+   svm-vmcb-control.exit_code_hi = 0;
+   svm-vmcb-control.exit_info_1  = 0;
+   svm-vmcb-control.exit_info_2  = 0;
+
+   nested_svm_vmexit(svm);
 
return 1;
 }
-- 
1.6.4.1



[PATCH 19/47] KVM: SVM: consolidate nested_svm_exit_handled

2009-08-26 Thread Avi Kivity
From: Joerg Roedel joerg.roe...@amd.com

With guest intercepts cached there is no longer any need for the
nested_svm_exit_handled_real function, so move its code into
nested_svm_exit_handled.

Signed-off-by: Joerg Roedel joerg.roe...@amd.com
Acked-by: Alexander Graf ag...@suse.de
Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/svm.c |  109 +++
 1 files changed, 49 insertions(+), 60 deletions(-)
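
The MSR check maps each MSR onto a pair of bits (read/write intercept)
in the nested MSR permission map, with exit_info_1 bit 0 selecting
which of the pair to test. A self-contained sketch of that offset math
for the 0xc0000000 MSR range, mirroring the switch in the diff below
(the userspace scaffolding and the MSR_STAR example are illustrative
assumptions, not kernel code):

#include <stdint.h>
#include <stdio.h>

static int msr_intercepted(const uint8_t *msrpm, uint32_t msr,
			   uint32_t param)
{
	uint32_t t0, t1;

	t0 = (8192 + msr - 0xc0000000) * 2;	/* bit index of the pair */
	t1 = t0 / 8;				/* byte in the map */
	t0 %= 8;				/* bit within that byte */

	return !!(msrpm[t1] & ((1 << param) << t0));
}

int main(void)
{
	uint8_t msrpm[8192] = { 0 };
	/* intercept writes (param = 1) to MSR_STAR, 0xc0000081 */
	uint32_t bit = (8192 + 0xc0000081 - 0xc0000000) * 2;

	msrpm[bit / 8] |= (1 << 1) << (bit % 8);

	printf("write intercepted: %d\n",
	       msr_intercepted(msrpm, 0xc0000081, 1));	/* 1 */
	printf("read  intercepted: %d\n",
	       msr_intercepted(msrpm, 0xc0000081, 0));	/* 0 */
	return 0;
}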

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 4426c63..bdd73fd 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1456,15 +1456,58 @@ static int nested_svm_do(struct vcpu_svm *svm,
return retval;
 }
 
-static int nested_svm_exit_handled_real(struct vcpu_svm *svm,
-   void *arg1,
-   void *arg2,
-   void *opaque)
+static int nested_svm_exit_handled_msr(struct vcpu_svm *svm,
+  void *arg1, void *arg2,
+  void *opaque)
+{
+   struct vmcb *nested_vmcb = (struct vmcb *)arg1;
+   u8 *msrpm = (u8 *)arg2;
+   u32 t0, t1;
+   u32 msr = svm-vcpu.arch.regs[VCPU_REGS_RCX];
+   u32 param = svm-vmcb-control.exit_info_1  1;
+
+   if (!(nested_vmcb-control.intercept  (1ULL  INTERCEPT_MSR_PROT)))
+   return 0;
+
+   switch (msr) {
+   case 0 ... 0x1fff:
+   t0 = (msr * 2) % 8;
+   t1 = msr / 8;
+   break;
+   case 0xc0000000 ... 0xc0001fff:
+   t0 = (8192 + msr - 0xc0000000) * 2;
+   t1 = (t0 / 8);
+   t0 %= 8;
+   break;
+   case 0xc0010000 ... 0xc0011fff:
+   t0 = (16384 + msr - 0xc0010000) * 2;
+   t1 = (t0 / 8);
+   t0 %= 8;
+   break;
+   default:
+   return 1;
+   break;
+   }
+   if (msrpm[t1]  ((1  param)  t0))
+   return 1;
+
+   return 0;
+}
+
+static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override)
 {
-   bool kvm_overrides = *(bool *)opaque;
u32 exit_code = svm-vmcb-control.exit_code;
 
-   if (kvm_overrides) {
+   switch (svm-vmcb-control.exit_code) {
+   case SVM_EXIT_MSR:
+   return nested_svm_do(svm, svm-nested.vmcb,
+svm-nested.vmcb_msrpm, NULL,
+nested_svm_exit_handled_msr);
+   default:
+   break;
+   }
+
+   if (kvm_override) {
switch (exit_code) {
case SVM_EXIT_INTR:
case SVM_EXIT_NMI:
@@ -1526,60 +1569,6 @@ static int nested_svm_exit_handled_real(struct vcpu_svm 
*svm,
return 0;
 }
 
-static int nested_svm_exit_handled_msr(struct vcpu_svm *svm,
-  void *arg1, void *arg2,
-  void *opaque)
-{
-   struct vmcb *nested_vmcb = (struct vmcb *)arg1;
-   u8 *msrpm = (u8 *)arg2;
-u32 t0, t1;
-   u32 msr = svm-vcpu.arch.regs[VCPU_REGS_RCX];
-   u32 param = svm-vmcb-control.exit_info_1  1;
-
-   if (!(nested_vmcb-control.intercept  (1ULL  INTERCEPT_MSR_PROT)))
-   return 0;
-
-   switch(msr) {
-   case 0 ... 0x1fff:
-   t0 = (msr * 2) % 8;
-   t1 = msr / 8;
-   break;
-   case 0xc0000000 ... 0xc0001fff:
-   t0 = (8192 + msr - 0xc0000000) * 2;
-   t1 = (t0 / 8);
-   t0 %= 8;
-   break;
-   case 0xc0010000 ... 0xc0011fff:
-   t0 = (16384 + msr - 0xc0010000) * 2;
-   t1 = (t0 / 8);
-   t0 %= 8;
-   break;
-   default:
-   return 1;
-   break;
-   }
-   if (msrpm[t1]  ((1  param)  t0))
-   return 1;
-
-   return 0;
-}
-
-static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override)
-{
-   bool k = kvm_override;
-
-   switch (svm-vmcb-control.exit_code) {
-   case SVM_EXIT_MSR:
-   return nested_svm_do(svm, svm-nested.vmcb,
-svm-nested.vmcb_msrpm, NULL,
-nested_svm_exit_handled_msr);
-   default: break;
-   }
-
-   return nested_svm_do(svm, svm-nested.vmcb, 0, k,
-nested_svm_exit_handled_real);
-}
-
 static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb 
*from_vmcb)
 {
struct vmcb_control_area *dst  = dst_vmcb-control;
-- 
1.6.4.1



[PATCH 20/47] KVM: SVM: do nested vmexit in nested_svm_exit_handled

2009-08-26 Thread Avi Kivity
From: Joerg Roedel joerg.roe...@amd.com

If this function returns true, a nested vmexit is required. Move that
vmexit into the nested_svm_exit_handled function. This also simplifies
the handling of nested #pf intercepts in this function.

Signed-off-by: Joerg Roedel joerg.roe...@amd.com
Acked-by: Alexander Graf ag...@suse.de
Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/svm.c |   42 +++---
 1 files changed, 19 insertions(+), 23 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index bdd73fd..3bb6d4b 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1366,8 +1366,6 @@ static int nested_svm_check_exception(struct vcpu_svm 
*svm, unsigned nr,
svm-vmcb-control.exit_info_2 = svm-vcpu.arch.cr2;
if (nested_svm_exit_handled(svm, false)) {
nsvm_printk(VMexit - EXCP 0x%x\n, nr);
-
-   nested_svm_vmexit(svm);
return 1;
}
}
@@ -1388,7 +1386,6 @@ static inline int nested_svm_intr(struct vcpu_svm *svm)
 
if (nested_svm_exit_handled(svm, false)) {
nsvm_printk(VMexit - INTR\n);
-   nested_svm_vmexit(svm);
return 1;
}
}
@@ -1497,15 +1494,7 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm 
*svm,
 static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override)
 {
u32 exit_code = svm-vmcb-control.exit_code;
-
-   switch (svm-vmcb-control.exit_code) {
-   case SVM_EXIT_MSR:
-   return nested_svm_do(svm, svm-nested.vmcb,
-svm-nested.vmcb_msrpm, NULL,
-nested_svm_exit_handled_msr);
-   default:
-   break;
-   }
+   bool vmexit = false;
 
if (kvm_override) {
switch (exit_code) {
@@ -1528,45 +1517,55 @@ static int nested_svm_exit_handled(struct vcpu_svm 
*svm, bool kvm_override)
}
 
switch (exit_code) {
+   case SVM_EXIT_MSR:
+   if (nested_svm_do(svm, svm-nested.vmcb, svm-nested.vmcb_msrpm,
+ NULL, nested_svm_exit_handled_msr))
+   vmexit = true;
+   break;
case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: {
u32 cr_bits = 1  (exit_code - SVM_EXIT_READ_CR0);
if (svm-nested.intercept_cr_read  cr_bits)
-   return 1;
+   vmexit = true;
break;
}
case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: {
u32 cr_bits = 1  (exit_code - SVM_EXIT_WRITE_CR0);
if (svm-nested.intercept_cr_write  cr_bits)
-   return 1;
+   vmexit = true;
break;
}
case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: {
u32 dr_bits = 1  (exit_code - SVM_EXIT_READ_DR0);
if (svm-nested.intercept_dr_read  dr_bits)
-   return 1;
+   vmexit = true;
break;
}
case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: {
u32 dr_bits = 1  (exit_code - SVM_EXIT_WRITE_DR0);
if (svm-nested.intercept_dr_write  dr_bits)
-   return 1;
+   vmexit = true;
break;
}
case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
u32 excp_bits = 1  (exit_code - SVM_EXIT_EXCP_BASE);
if (svm-nested.intercept_exceptions  excp_bits)
-   return 1;
+   vmexit = true;
break;
}
default: {
u64 exit_bits = 1ULL  (exit_code - SVM_EXIT_INTR);
nsvm_printk(exit code: 0x%x\n, exit_code);
if (svm-nested.intercept  exit_bits)
-   return 1;
+   vmexit = true;
}
}
 
-   return 0;
+   if (vmexit) {
+   nsvm_printk(#VMEXIT reason=%04x\n, exit_code);
+   nested_svm_vmexit(svm);
+   }
+
+   return vmexit;
 }
 
 static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb 
*from_vmcb)
@@ -2327,11 +2326,8 @@ static int handle_exit(struct kvm_run *kvm_run, struct 
kvm_vcpu *vcpu)
nsvm_printk(nested handle_exit: 0x%x | 0x%lx | 0x%lx | 
0x%lx\n,
exit_code, svm-vmcb-control.exit_info_1,
svm-vmcb-control.exit_info_2, 
svm-vmcb-save.rip);
-   if (nested_svm_exit_handled(svm, true)) {
-   nested_svm_vmexit(svm);
-   nsvm_printk(- #VMEXIT\n);
+   if (nested_svm_exit_handled(svm, true))
return 1;
-   }
}
 

[PATCH 33/47] KVM: Update cr8 intercept when APIC TPR is changed by userspace

2009-08-26 Thread Avi Kivity
From: Gleb Natapov g...@redhat.com

Since on vcpu entry we update the cr8 intercept only if the apic is
enabled, we should also update it when the TPR is changed while the
apic is disabled. This happens when Windows resets the HW without
setting the TPR to zero.

Signed-off-by: Gleb Natapov g...@redhat.com
Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/x86.c |2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 132c510..31bf984 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -77,6 +77,7 @@ static u64 __read_mostly efer_reserved_bits = 
0xfffffffffffffffeULL;
 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
 
+static void update_cr8_intercept(struct kvm_vcpu *vcpu);
 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
struct kvm_cpuid_entry2 __user *entries);
 
@@ -1629,6 +1630,7 @@ static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
vcpu_load(vcpu);
memcpy(vcpu-arch.apic-regs, s-regs, sizeof *s);
kvm_apic_post_state_restore(vcpu);
+   update_cr8_intercept(vcpu);
vcpu_put(vcpu);
 
return 0;
-- 
1.6.4.1



[PATCH 25/47] KVM: SVM: clean up nested vmrun path

2009-08-26 Thread Avi Kivity
From: Joerg Roedel joerg.roe...@amd.com

This patch removes the usage of nested_svm_do from the vmrun emulation
path.

Signed-off-by: Joerg Roedel joerg.roe...@amd.com
Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/svm.c |   34 ++
 1 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 419e3fa..1a915f3 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1727,25 +1727,35 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
return 0;
 }
 
-static int nested_svm_vmrun_msrpm(struct vcpu_svm *svm, void *arg1,
- void *arg2, void *opaque)
+static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
 {
+   u32 *nested_msrpm;
int i;
-   u32 *nested_msrpm = (u32*)arg1;
+
+   nested_msrpm = nested_svm_map(svm, svm-nested.vmcb_msrpm, KM_USER0);
+   if (!nested_msrpm)
+   return false;
+
for (i=0; i PAGE_SIZE * (1  MSRPM_ALLOC_ORDER) / 4; i++)
svm-nested.msrpm[i] = svm-msrpm[i] | nested_msrpm[i];
+
svm-vmcb-control.msrpm_base_pa = __pa(svm-nested.msrpm);
 
-   return 0;
+   nested_svm_unmap(nested_msrpm, KM_USER0);
+
+   return true;
 }
 
-static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1,
-   void *arg2, void *opaque)
+static bool nested_svm_vmrun(struct vcpu_svm *svm)
 {
-   struct vmcb *nested_vmcb = (struct vmcb *)arg1;
+   struct vmcb *nested_vmcb;
struct vmcb *hsave = svm-nested.hsave;
struct vmcb *vmcb = svm-vmcb;
 
+   nested_vmcb = nested_svm_map(svm, svm-vmcb-save.rax, KM_USER0);
+   if (!nested_vmcb)
+   return false;
+
/* nested_vmcb is our indicator if nested SVM is activated */
svm-nested.vmcb = svm-vmcb-save.rax;
 
@@ -1861,9 +1871,11 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void 
*arg1,
svm-vmcb-control.event_inj = nested_vmcb-control.event_inj;
svm-vmcb-control.event_inj_err = nested_vmcb-control.event_inj_err;
 
+   nested_svm_unmap(nested_vmcb, KM_USER0);
+
enable_gif(svm);
 
-   return 0;
+   return true;
 }
 
 static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
@@ -1931,12 +1943,10 @@ static int vmrun_interception(struct vcpu_svm *svm, 
struct kvm_run *kvm_run)
svm-next_rip = kvm_rip_read(svm-vcpu) + 3;
skip_emulated_instruction(svm-vcpu);
 
-   if (nested_svm_do(svm, svm-vmcb-save.rax, 0,
- NULL, nested_svm_vmrun))
+   if (!nested_svm_vmrun(svm))
return 1;
 
-   if (nested_svm_do(svm, svm-nested.vmcb_msrpm, 0,
- NULL, nested_svm_vmrun_msrpm))
+   if (!nested_svm_vmrun_msrpm(svm))
return 1;
 
return 1;
-- 
1.6.4.1



[PATCH 36/47] KVM: VMX: Adjust rflags if in real mode emulation

2009-08-26 Thread Avi Kivity
We set rflags.vm86 when virtualizing real mode to go through vm8086 mode,
so we need to take it out again when reading rflags.

Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/vmx.c |7 ++-
 1 files changed, 6 insertions(+), 1 deletions(-)
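
The sanitizing itself is a simple mask. A tiny self-contained sketch
(the EFLAGS bit values follow the architectural layout; the demo
wrapper is illustrative, not kernel code):

#include <stdio.h>

#define X86_EFLAGS_IOPL 0x00003000UL	/* I/O privilege level, bits 12-13 */
#define X86_EFLAGS_VM   0x00020000UL	/* virtual-8086 mode, bit 17 */

int main(void)
{
	/* rflags as read from the VMCS while faking real mode via vm8086 */
	unsigned long rflags = 0x00023202UL;	/* VM, IOPL=3, IF set */

	/* hide the bits KVM set on the guest's behalf */
	rflags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM);

	printf("guest-visible rflags: %#lx\n", rflags);	/* 0x202 */
	return 0;
}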

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 31c3a87..2b7e7bd 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -781,7 +781,12 @@ static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
 
 static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
 {
-   return vmcs_readl(GUEST_RFLAGS);
+   unsigned long rflags;
+
+   rflags = vmcs_readl(GUEST_RFLAGS);
+   if (to_vmx(vcpu)-rmode.vm86_active)
+   rflags = ~(unsigned long)(X86_EFLAGS_IOPL | X86_EFLAGS_VM);
+   return rflags;
 }
 
 static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
-- 
1.6.4.1



[PATCH 34/47] KVM: SVM: Drop tlb flush workaround in npt

2009-08-26 Thread Avi Kivity
It is no longer possible to reproduce the problem, so presumably it
has been fixed.

Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/svm.c |   13 ++---
 1 files changed, 2 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index be0f6ef..7853dd3 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1187,17 +1187,8 @@ static int pf_interception(struct vcpu_svm *svm, struct 
kvm_run *kvm_run)
error_code = svm-vmcb-control.exit_info_1;
 
trace_kvm_page_fault(fault_address, error_code);
-   /*
-* FIXME: Tis shouldn't be necessary here, but there is a flush
-* missing in the MMU code. Until we find this bug, flush the
-* complete TLB here on an NPF
-*/
-   if (npt_enabled)
-   svm_flush_tlb(svm-vcpu);
-   else {
-   if (kvm_event_needs_reinjection(svm-vcpu))
-   kvm_mmu_unprotect_page_virt(svm-vcpu, fault_address);
-   }
+   if (!npt_enabled  kvm_event_needs_reinjection(svm-vcpu))
+   kvm_mmu_unprotect_page_virt(svm-vcpu, fault_address);
return kvm_mmu_page_fault(svm-vcpu, fault_address, error_code);
 }
 
-- 
1.6.4.1



[PATCH 40/47] KVM: export kvm_para.h

2009-08-26 Thread Avi Kivity
From: Michael S. Tsirkin m...@redhat.com

kvm_para.h contains the userspace interface and so
should be exported.

Signed-off-by: Michael S. Tsirkin m...@redhat.com
Signed-off-by: Avi Kivity a...@redhat.com
---
 include/asm-generic/Kbuild.asm |5 +
 include/linux/Kbuild   |4 
 2 files changed, 9 insertions(+), 0 deletions(-)

diff --git a/include/asm-generic/Kbuild.asm b/include/asm-generic/Kbuild.asm
index 290910e..96d7c98 100644
--- a/include/asm-generic/Kbuild.asm
+++ b/include/asm-generic/Kbuild.asm
@@ -3,6 +3,11 @@ ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/asm/kvm.h 
\
 header-y  += kvm.h
 endif
 
+ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/asm/kvm_para.h \
+ $(srctree)/include/asm-$(SRCARCH)/kvm_para.h),)
+header-y  += kvm_para.h
+endif
+
 ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/asm/a.out.h \
  $(srctree)/include/asm-$(SRCARCH)/a.out.h),)
 unifdef-y += a.out.h
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 334a359..cff4a10 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -268,6 +268,10 @@ ifneq ($(wildcard 
$(srctree)/arch/$(SRCARCH)/include/asm/kvm.h \
  $(srctree)/include/asm-$(SRCARCH)/kvm.h),)
 unifdef-y += kvm.h
 endif
+ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/asm/kvm_para.h \
+ $(srctree)/include/asm-$(SRCARCH)/kvm_para.h),)
+unifdef-y += kvm_para.h
+endif
 unifdef-y += llc.h
 unifdef-y += loop.h
 unifdef-y += lp.h
-- 
1.6.4.1



[PATCH 30/47] KVM: SVM: move nested_svm_intr main logic out of if-clause

2009-08-26 Thread Avi Kivity
From: Joerg Roedel joerg.roe...@amd.com

This patch removes one indentation level from nested_svm_intr and
makes the logic more readable.

Signed-off-by: Joerg Roedel joerg.roe...@amd.com
Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/svm.c |   21 +++--
 1 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f275d77..ff04a4b 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1374,19 +1374,20 @@ static int nested_svm_check_exception(struct vcpu_svm 
*svm, unsigned nr,
 
 static inline int nested_svm_intr(struct vcpu_svm *svm)
 {
-   if (is_nested(svm)) {
-   if (!(svm-vcpu.arch.hflags  HF_VINTR_MASK))
-   return 0;
+   if (!is_nested(svm))
+   return 0;
 
-   if (!(svm-vcpu.arch.hflags  HF_HIF_MASK))
-   return 0;
+   if (!(svm-vcpu.arch.hflags  HF_VINTR_MASK))
+   return 0;
 
-   svm-vmcb-control.exit_code = SVM_EXIT_INTR;
+   if (!(svm-vcpu.arch.hflags  HF_HIF_MASK))
+   return 0;
 
-   if (nested_svm_exit_handled(svm)) {
-   nsvm_printk(VMexit - INTR\n);
-   return 1;
-   }
+   svm-vmcb-control.exit_code = SVM_EXIT_INTR;
+
+   if (nested_svm_exit_handled(svm)) {
+   nsvm_printk(VMexit - INTR\n);
+   return 1;
}
 
return 0;
-- 
1.6.4.1



[PATCH 37/47] KVM: When switching to a vm8086 task, load segments as 16-bit

2009-08-26 Thread Avi Kivity
From: Anthony Liguori aligu...@us.ibm.com

According to 16.2.5 in the SDM, eflags.vm in the tss is consulted before loading
the new segments.  If eflags.vm == 1, then the segments are treated as 16-bit
segments.  The LDTR and TR are not normally available in vm86 mode, so if they
happen to somehow get loaded, they need to be treated as 32-bit segments.

This fixes an invalid vmentry failure in a custom OS that was happening after
a task switch into vm8086 mode.  Since the segments were being mistakenly
treated as 32-bit, we loaded garbage state.

Signed-off-by: Anthony Liguori aligu...@us.ibm.com
Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/x86.c |9 -
 1 files changed, 8 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 31bf984..1aa7e6d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4101,12 +4101,19 @@ static int kvm_load_realmode_segment(struct kvm_vcpu 
*vcpu, u16 selector, int se
return 0;
 }
 
+static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
+{
+   return (seg != VCPU_SREG_LDTR) 
+   (seg != VCPU_SREG_TR) 
+   (kvm_x86_ops-get_rflags(vcpu)  X86_EFLAGS_VM);
+}
+
 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
int type_bits, int seg)
 {
struct kvm_segment kvm_seg;
 
-   if (!(vcpu-arch.cr0  X86_CR0_PE))
+   if (is_vm86_segment(vcpu, seg) || !(vcpu-arch.cr0  X86_CR0_PE))
return kvm_load_realmode_segment(vcpu, selector, seg);
if (load_segment_descriptor_to_kvm_desct(vcpu, selector, kvm_seg))
return 1;
-- 
1.6.4.1



[PATCH 41/47] KVM: Add __KERNEL__ guards to exported headers

2009-08-26 Thread Avi Kivity
Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/ia64/include/asm/kvm_para.h |4 
 arch/s390/include/asm/kvm_para.h |4 
 2 files changed, 8 insertions(+), 0 deletions(-)
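
The resulting shape of an exported kvm_para.h, as a hedged sketch (the
arch prefix is a placeholder):

#ifndef __FOO_KVM_PARA_H
#define __FOO_KVM_PARA_H

/* userspace-visible ABI (constants, structs) stays outside the guard */

#ifdef __KERNEL__

/* kernel-only helpers are stripped from the exported copy */
static inline unsigned int kvm_arch_para_features(void)
{
	return 0;
}

#endif /* __KERNEL__ */

#endif /* __FOO_KVM_PARA_H */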

diff --git a/arch/ia64/include/asm/kvm_para.h b/arch/ia64/include/asm/kvm_para.h
index 0d6d8ca..1588aee 100644
--- a/arch/ia64/include/asm/kvm_para.h
+++ b/arch/ia64/include/asm/kvm_para.h
@@ -19,9 +19,13 @@
  *
  */
 
+#ifdef __KERNEL__
+
 static inline unsigned int kvm_arch_para_features(void)
 {
return 0;
 }
 
 #endif
+
+#endif
diff --git a/arch/s390/include/asm/kvm_para.h b/arch/s390/include/asm/kvm_para.h
index 2c50379..6964db2 100644
--- a/arch/s390/include/asm/kvm_para.h
+++ b/arch/s390/include/asm/kvm_para.h
@@ -13,6 +13,8 @@
 #ifndef __S390_KVM_PARA_H
 #define __S390_KVM_PARA_H
 
+#ifdef __KERNEL__
+
 /*
  * Hypercalls for KVM on s390. The calling convention is similar to the
  * s390 ABI, so we use R2-R6 for parameters 1-5. In addition we use R1
@@ -147,4 +149,6 @@ static inline unsigned int kvm_arch_para_features(void)
return 0;
 }
 
+#endif
+
 #endif /* __S390_KVM_PARA_H */
-- 
1.6.4.1



[PATCH 32/47] KVM: SVM: enable nested svm by default

2009-08-26 Thread Avi Kivity
From: Joerg Roedel joerg.roe...@amd.com

Nested SVM is (in my experience) stable enough to be enabled by
default. So omit the requirement to pass a module parameter.

Signed-off-by: Joerg Roedel joerg.roe...@amd.com
Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/svm.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 825035e..be0f6ef 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -124,7 +124,7 @@ static int npt = 1;
 
 module_param(npt, int, S_IRUGO);
 
-static int nested = 0;
+static int nested = 1;
 module_param(nested, int, S_IRUGO);
 
 static void svm_flush_tlb(struct kvm_vcpu *vcpu);
-- 
1.6.4.1



[PATCH 44/47] KVM: Use kvm_{read,write}_guest_virt() to read and write segment descriptors

2009-08-26 Thread Avi Kivity
From: Mikhail Ershov mike.ers...@gmail.com

Segment descriptor tables can be placed on two non-contiguous pages.
This patch makes segment descriptors be read by linear address, so the
read works even when the descriptor straddles a page boundary.

Signed-off-by: Mikhail Ershov mike.ers...@gmail.com
Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/x86.c |   10 ++
 1 files changed, 2 insertions(+), 8 deletions(-)
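
A small sketch of why translating only the table base is not enough:
the 8-byte descriptor lives at base + index * 8, which can straddle a
page boundary, so the access has to go through the linear-address
helpers (the numbers here are made up for illustration):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096ULL

int main(void)
{
	/* hypothetical GDT base a few bytes short of a page edge */
	uint64_t dtable_base = 0x12ffcULL;
	uint16_t index = 0;			/* selector >> 3 */

	uint64_t desc = dtable_base + index * 8;
	int spans = (desc / PAGE_SIZE) != ((desc + 7) / PAGE_SIZE);

	/* 0x12ffc..0x13003: one gva_to_gpa() of the base cannot cover
	 * both pages, but kvm_read_guest_virt() can */
	printf("descriptor spans %s\n", spans ? "two pages" : "one page");
	return 0;
}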

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c0e9427..59a8ba4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4021,7 +4021,6 @@ static void get_segment_descriptor_dtable(struct kvm_vcpu 
*vcpu,
 static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
 struct desc_struct *seg_desc)
 {
-   gpa_t gpa;
struct descriptor_table dtable;
u16 index = selector  3;
 
@@ -4031,16 +4030,13 @@ static int load_guest_segment_descriptor(struct 
kvm_vcpu *vcpu, u16 selector,
kvm_queue_exception_e(vcpu, GP_VECTOR, selector  0xfffc);
return 1;
}
-   gpa = vcpu-arch.mmu.gva_to_gpa(vcpu, dtable.base);
-   gpa += index * 8;
-   return kvm_read_guest(vcpu-kvm, gpa, seg_desc, 8);
+   return kvm_read_guest_virt(dtable.base + index*8, seg_desc, 
sizeof(*seg_desc), vcpu);
 }
 
 /* allowed just for 8 bytes segments */
 static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
 struct desc_struct *seg_desc)
 {
-   gpa_t gpa;
struct descriptor_table dtable;
u16 index = selector  3;
 
@@ -4048,9 +4044,7 @@ static int save_guest_segment_descriptor(struct kvm_vcpu 
*vcpu, u16 selector,
 
if (dtable.limit  index * 8 + 7)
return 1;
-   gpa = vcpu-arch.mmu.gva_to_gpa(vcpu, dtable.base);
-   gpa += index * 8;
-   return kvm_write_guest(vcpu-kvm, gpa, seg_desc, 8);
+   return kvm_write_guest_virt(dtable.base + index*8, seg_desc, 
sizeof(*seg_desc), vcpu);
 }
 
 static u32 get_tss_base_addr(struct kvm_vcpu *vcpu,
-- 
1.6.4.1



[PATCH 42/47] KVM: Add missing #include

2009-08-26 Thread Avi Kivity
Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/include/asm/kvm_para.h |2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index b8a3305..c584076 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -1,6 +1,8 @@
 #ifndef _ASM_X86_KVM_PARA_H
 #define _ASM_X86_KVM_PARA_H
 
+#include linux/types.h
+
 /* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx.  It
  * should be used to determine that a VM is running under KVM.
  */
-- 
1.6.4.1



[PATCH 01/47] KVM: remove superfluous NULL pointer check in kvm_inject_pit_timer_irqs()

2009-08-26 Thread Avi Kivity
From: Bartlomiej Zolnierkiewicz bzoln...@gmail.com

This takes care of the following entries from Dan's list:

arch/x86/kvm/i8254.c +714 kvm_inject_pit_timer_irqs(6) warning: variable 
derefenced in initializer 'vcpu'
arch/x86/kvm/i8254.c +714 kvm_inject_pit_timer_irqs(6) warning: variable 
derefenced before check 'vcpu'

Reported-by: Dan Carpenter erro...@gmail.com
Cc: cor...@lwn.net
Cc: e...@redhat.com
Cc: Julia Lawall ju...@diku.dk
Signed-off-by: Bartlomiej Zolnierkiewicz bzoln...@gmail.com
Acked-by: Sheng Yang sh...@linux.intel.com
Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/i8254.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 472653c..82ad523 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -713,7 +713,7 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
struct kvm *kvm = vcpu-kvm;
struct kvm_kpit_state *ps;
 
-   if (vcpu  pit) {
+   if (pit) {
int inject = 0;
ps = pit-pit_state;
 
-- 
1.6.4.1



[PATCH 15/47] KVM: SVM: copy only necessary parts of the control area on vmrun/vmexit

2009-08-26 Thread Avi Kivity
From: Joerg Roedel joerg.roe...@amd.com

The vmcb control area contains more than 800 bytes of reserved fields
that are unnecessarily copied. Fix this by introducing a copy function
that copies only the relevant parts and saves time.

Signed-off-by: Joerg Roedel joerg.roe...@amd.com
Acked-by: Alexander Graf ag...@suse.de
Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/svm.c |   36 ++--
 1 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f11f880..df795bc 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1567,6 +1567,38 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm, 
bool kvm_override)
 nested_svm_exit_handled_real);
 }
 
+static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb 
*from_vmcb)
+{
+   struct vmcb_control_area *dst  = dst_vmcb-control;
+   struct vmcb_control_area *from = from_vmcb-control;
+
+   dst-intercept_cr_read= from-intercept_cr_read;
+   dst-intercept_cr_write   = from-intercept_cr_write;
+   dst-intercept_dr_read= from-intercept_dr_read;
+   dst-intercept_dr_write   = from-intercept_dr_write;
+   dst-intercept_exceptions = from-intercept_exceptions;
+   dst-intercept= from-intercept;
+   dst-iopm_base_pa = from-iopm_base_pa;
+   dst-msrpm_base_pa= from-msrpm_base_pa;
+   dst-tsc_offset   = from-tsc_offset;
+   dst-asid = from-asid;
+   dst-tlb_ctl  = from-tlb_ctl;
+   dst-int_ctl  = from-int_ctl;
+   dst-int_vector   = from-int_vector;
+   dst-int_state= from-int_state;
+   dst-exit_code= from-exit_code;
+   dst-exit_code_hi = from-exit_code_hi;
+   dst-exit_info_1  = from-exit_info_1;
+   dst-exit_info_2  = from-exit_info_2;
+   dst-exit_int_info= from-exit_int_info;
+   dst-exit_int_info_err= from-exit_int_info_err;
+   dst-nested_ctl   = from-nested_ctl;
+   dst-event_inj= from-event_inj;
+   dst-event_inj_err= from-event_inj_err;
+   dst-nested_cr3   = from-nested_cr3;
+   dst-lbr_ctl  = from-lbr_ctl;
+}
+
 static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1,
  void *arg2, void *opaque)
 {
@@ -1612,7 +1644,7 @@ static int nested_svm_vmexit_real(struct vcpu_svm *svm, 
void *arg1,
nested_vmcb-control.int_ctl = ~V_INTR_MASKING_MASK;
 
/* Restore the original control entries */
-   svm-vmcb-control = hsave-control;
+   copy_vmcb_control_area(vmcb, hsave);
 
/* Kill any pending exceptions */
if (svm-vcpu.arch.exception.pending == true)
@@ -1710,7 +1742,7 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void 
*arg1,
else
hsave-save.cr3= svm-vcpu.arch.cr3;
 
-   hsave-control = vmcb-control;
+   copy_vmcb_control_area(hsave, vmcb);
 
if (svm-vmcb-save.rflags  X86_EFLAGS_IF)
svm-vcpu.arch.hflags |= HF_HIF_MASK;
-- 
1.6.4.1



[PATCH 14/47] KVM: SVM: optimize nested vmrun

2009-08-26 Thread Avi Kivity
From: Joerg Roedel joerg.roe...@amd.com

Only copy the necessary parts of the vmcb save area on vmrun and save
precious time.

Signed-off-by: Joerg Roedel joerg.roe...@amd.com
Acked-by: Alexander Graf ag...@suse.de
Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/svm.c |   28 +---
 1 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 2f5f223..f11f880 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1681,6 +1681,7 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void 
*arg1,
 {
struct vmcb *nested_vmcb = (struct vmcb *)arg1;
struct vmcb *hsave = svm-hsave;
+   struct vmcb *vmcb = svm-vmcb;
 
/* nested_vmcb is our indicator if nested SVM is activated */
svm-nested_vmcb = svm-vmcb-save.rax;
@@ -1691,12 +1692,25 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void 
*arg1,
 
/* Save the old vmcb, so we don't need to pick what we save, but
   can restore everything when a VMEXIT occurs */
-   memcpy(hsave, svm-vmcb, sizeof(struct vmcb));
-   /* We need to remember the original CR3 in the SPT case */
-   if (!npt_enabled)
-   hsave-save.cr3 = svm-vcpu.arch.cr3;
-   hsave-save.cr4 = svm-vcpu.arch.cr4;
-   hsave-save.rip = svm-next_rip;
+   hsave-save.es = vmcb-save.es;
+   hsave-save.cs = vmcb-save.cs;
+   hsave-save.ss = vmcb-save.ss;
+   hsave-save.ds = vmcb-save.ds;
+   hsave-save.gdtr   = vmcb-save.gdtr;
+   hsave-save.idtr   = vmcb-save.idtr;
+   hsave-save.efer   = svm-vcpu.arch.shadow_efer;
+   hsave-save.cr0= svm-vcpu.arch.cr0;
+   hsave-save.cr4= svm-vcpu.arch.cr4;
+   hsave-save.rflags = vmcb-save.rflags;
+   hsave-save.rip= svm-next_rip;
+   hsave-save.rsp= vmcb-save.rsp;
+   hsave-save.rax= vmcb-save.rax;
+   if (npt_enabled)
+   hsave-save.cr3= vmcb-save.cr3;
+   else
+   hsave-save.cr3= svm-vcpu.arch.cr3;
+
+   hsave-control = vmcb-control;
 
if (svm-vmcb-save.rflags  X86_EFLAGS_IF)
svm-vcpu.arch.hflags |= HF_HIF_MASK;
@@ -1721,7 +1735,7 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void 
*arg1,
kvm_set_cr3(svm-vcpu, nested_vmcb-save.cr3);
kvm_mmu_reset_context(svm-vcpu);
}
-   svm-vmcb-save.cr2 = nested_vmcb-save.cr2;
+   svm-vmcb-save.cr2 = svm-vcpu.arch.cr2 = nested_vmcb-save.cr2;
kvm_register_write(svm-vcpu, VCPU_REGS_RAX, nested_vmcb-save.rax);
kvm_register_write(svm-vcpu, VCPU_REGS_RSP, nested_vmcb-save.rsp);
kvm_register_write(svm-vcpu, VCPU_REGS_RIP, nested_vmcb-save.rip);
-- 
1.6.4.1



[PATCH 13/47] KVM: SVM: optimize nested #vmexit

2009-08-26 Thread Avi Kivity
From: Joerg Roedel joerg.roe...@amd.com

It is more efficient to copy only the relevant parts of the vmcb back to
the nested vmcb when we emulate a vmexit.

Signed-off-by: Joerg Roedel joerg.roe...@amd.com
Acked-by: Alexander Graf ag...@suse.de
Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/svm.c |   68 +--
 1 files changed, 33 insertions(+), 35 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 9f72772..2f5f223 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1572,53 +1572,52 @@ static int nested_svm_vmexit_real(struct vcpu_svm *svm, 
void *arg1,
 {
struct vmcb *nested_vmcb = (struct vmcb *)arg1;
struct vmcb *hsave = svm-hsave;
-   u64 nested_save[] = { nested_vmcb-save.cr0,
- nested_vmcb-save.cr3,
- nested_vmcb-save.cr4,
- nested_vmcb-save.efer,
- nested_vmcb-control.intercept_cr_read,
- nested_vmcb-control.intercept_cr_write,
- nested_vmcb-control.intercept_dr_read,
- nested_vmcb-control.intercept_dr_write,
- nested_vmcb-control.intercept_exceptions,
- nested_vmcb-control.intercept,
- nested_vmcb-control.msrpm_base_pa,
- nested_vmcb-control.iopm_base_pa,
- nested_vmcb-control.tsc_offset };
+   struct vmcb *vmcb = svm-vmcb;
 
/* Give the current vmcb to the guest */
-   memcpy(nested_vmcb, svm-vmcb, sizeof(struct vmcb));
-   nested_vmcb-save.cr0 = nested_save[0];
-   if (!npt_enabled)
-   nested_vmcb-save.cr3 = nested_save[1];
-   nested_vmcb-save.cr4 = nested_save[2];
-   nested_vmcb-save.efer = nested_save[3];
-   nested_vmcb-control.intercept_cr_read = nested_save[4];
-   nested_vmcb-control.intercept_cr_write = nested_save[5];
-   nested_vmcb-control.intercept_dr_read = nested_save[6];
-   nested_vmcb-control.intercept_dr_write = nested_save[7];
-   nested_vmcb-control.intercept_exceptions = nested_save[8];
-   nested_vmcb-control.intercept = nested_save[9];
-   nested_vmcb-control.msrpm_base_pa = nested_save[10];
-   nested_vmcb-control.iopm_base_pa = nested_save[11];
-   nested_vmcb-control.tsc_offset = nested_save[12];
+   disable_gif(svm);
+
+   nested_vmcb-save.es = vmcb-save.es;
+   nested_vmcb-save.cs = vmcb-save.cs;
+   nested_vmcb-save.ss = vmcb-save.ss;
+   nested_vmcb-save.ds = vmcb-save.ds;
+   nested_vmcb-save.gdtr   = vmcb-save.gdtr;
+   nested_vmcb-save.idtr   = vmcb-save.idtr;
+   if (npt_enabled)
+   nested_vmcb-save.cr3= vmcb-save.cr3;
+   nested_vmcb-save.cr2= vmcb-save.cr2;
+   nested_vmcb-save.rflags = vmcb-save.rflags;
+   nested_vmcb-save.rip= vmcb-save.rip;
+   nested_vmcb-save.rsp= vmcb-save.rsp;
+   nested_vmcb-save.rax= vmcb-save.rax;
+   nested_vmcb-save.dr7= vmcb-save.dr7;
+   nested_vmcb-save.dr6= vmcb-save.dr6;
+   nested_vmcb-save.cpl= vmcb-save.cpl;
+
+   nested_vmcb-control.int_ctl   = vmcb-control.int_ctl;
+   nested_vmcb-control.int_vector= vmcb-control.int_vector;
+   nested_vmcb-control.int_state = vmcb-control.int_state;
+   nested_vmcb-control.exit_code = vmcb-control.exit_code;
+   nested_vmcb-control.exit_code_hi  = vmcb-control.exit_code_hi;
+   nested_vmcb-control.exit_info_1   = vmcb-control.exit_info_1;
+   nested_vmcb-control.exit_info_2   = vmcb-control.exit_info_2;
+   nested_vmcb-control.exit_int_info = vmcb-control.exit_int_info;
+   nested_vmcb-control.exit_int_info_err = 
vmcb-control.exit_int_info_err;
+   nested_vmcb-control.tlb_ctl   = 0;
+   nested_vmcb-control.event_inj = 0;
+   nested_vmcb-control.event_inj_err = 0;
 
/* We always set V_INTR_MASKING and remember the old value in hflags */
if (!(svm-vcpu.arch.hflags  HF_VINTR_MASK))
nested_vmcb-control.int_ctl = ~V_INTR_MASKING_MASK;
 
-   if ((nested_vmcb-control.int_ctl  V_IRQ_MASK) 
-   (nested_vmcb-control.int_vector)) {
-   nsvm_printk(WARNING: IRQ 0x%x still enabled on #VMEXIT\n,
-   nested_vmcb-control.int_vector);
-   }
-
/* Restore the original control entries */
svm-vmcb-control = hsave-control;
 
/* Kill any pending exceptions */
if (svm-vcpu.arch.exception.pending == true)
nsvm_printk(WARNING: Pending Exception\n);
+
kvm_clear_exception_queue(svm-vcpu);
kvm_clear_interrupt_queue(svm-vcpu);
 
@@ -1646,7 +1645,6 @@ static int nested_svm_vmexit_real(struct vcpu_svm *svm, 
void 

[PATCH 08/47] KVM: Call kvm_vcpu_kick() inside pic spinlock

2009-08-26 Thread Avi Kivity
From: Gleb Natapov g...@redhat.com

d5ecfdd25 moved it out because back then it was impossible to
call it inside a spinlock. This restriction no longer exists.

Signed-off-by: Gleb Natapov g...@redhat.com
Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/i8259.c |   10 +-
 arch/x86/kvm/irq.h   |1 -
 2 files changed, 1 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index daf4606..d27320c 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -43,11 +43,9 @@ static void pic_unlock(struct kvm_pic *s)
 {
struct kvm *kvm = s-kvm;
unsigned acks = s-pending_acks;
-   bool wakeup = s-wakeup_needed;
struct kvm_vcpu *vcpu;
 
s-pending_acks = 0;
-   s-wakeup_needed = false;
 
spin_unlock(s-lock);
 
@@ -56,12 +54,6 @@ static void pic_unlock(struct kvm_pic *s)
 __ffs(acks));
acks = acks - 1;
}
-
-   if (wakeup) {
-   vcpu = s-kvm-bsp_vcpu;
-   if (vcpu)
-   kvm_vcpu_kick(vcpu);
-   }
 }
 
 static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
@@ -527,7 +519,7 @@ static void pic_irq_request(void *opaque, int level)
s-output = level;
if (vcpu  level  (s-pics[0].isr_ack  (1  irq))) {
s-pics[0].isr_ack = ~(1  irq);
-   s-wakeup_needed = true;
+   kvm_vcpu_kick(vcpu);
}
 }
 
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 9f59318..7d6058a 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -63,7 +63,6 @@ struct kvm_kpic_state {
 
 struct kvm_pic {
spinlock_t lock;
-   bool wakeup_needed;
unsigned pending_acks;
struct kvm *kvm;
struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
-- 
1.6.4.1



[PATCH 23/47] KVM: SVM: clean up nested_svm_exit_handled_msr

2009-08-26 Thread Avi Kivity
From: Joerg Roedel joerg.roe...@amd.com

This patch changes nested svm to call nested_svm_exit_handled_msr
directly and not through nested_svm_do.

Signed-off-by: Joerg Roedel joerg.roe...@amd.com
Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/svm.c |   37 ++---
 1 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 5e55a1b..518d578 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1483,15 +1483,20 @@ static int nested_svm_do(struct vcpu_svm *svm,
return retval;
 }
 
-static int nested_svm_exit_handled_msr(struct vcpu_svm *svm,
-  void *arg1, void *arg2,
-  void *opaque)
+static bool nested_svm_exit_handled_msr(struct vcpu_svm *svm)
 {
-   struct vmcb *nested_vmcb = (struct vmcb *)arg1;
-   u8 *msrpm = (u8 *)arg2;
-   u32 t0, t1;
-   u32 msr = svm-vcpu.arch.regs[VCPU_REGS_RCX];
u32 param = svm-vmcb-control.exit_info_1  1;
+   u32 msr = svm-vcpu.arch.regs[VCPU_REGS_RCX];
+   struct vmcb *nested_vmcb;
+   bool ret = false;
+   u32 t0, t1;
+   u8 *msrpm;
+
+   nested_vmcb = nested_svm_map(svm, svm-nested.vmcb, KM_USER0);
+   msrpm   = nested_svm_map(svm, svm-nested.vmcb_msrpm, KM_USER1);
+
+   if (!nested_vmcb || !msrpm)
+   goto out;
 
if (!(nested_vmcb-control.intercept  (1ULL  INTERCEPT_MSR_PROT)))
return 0;
@@ -1512,13 +1517,17 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm 
*svm,
t0 %= 8;
break;
default:
-   return 1;
-   break;
+   ret = true;
+   goto out;
}
-   if (msrpm[t1]  ((1  param)  t0))
-   return 1;
 
-   return 0;
+   ret = msrpm[t1]  ((1  param)  t0);
+
+out:
+   nested_svm_unmap(nested_vmcb, KM_USER0);
+   nested_svm_unmap(msrpm, KM_USER1);
+
+   return ret;
 }
 
 static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override)
@@ -1548,9 +1557,7 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm, 
bool kvm_override)
 
switch (exit_code) {
case SVM_EXIT_MSR:
-   if (nested_svm_do(svm, svm-nested.vmcb, svm-nested.vmcb_msrpm,
- NULL, nested_svm_exit_handled_msr))
-   vmexit = true;
+   vmexit = nested_svm_exit_handled_msr(svm);
break;
case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: {
u32 cr_bits = 1  (exit_code - SVM_EXIT_READ_CR0);
-- 
1.6.4.1



[PATCH 17/47] KVM: SVM: move nested svm state into separate struct

2009-08-26 Thread Avi Kivity
From: Joerg Roedel joerg.roe...@amd.com

This makes it clearer what purpose these members of vcpu_svm serve.

Signed-off-by: Joerg Roedel joerg.roe...@amd.com
Acked-by: Alexander Graf ag...@suse.de
Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/svm.c |   62 +++
 1 files changed, 33 insertions(+), 29 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 825b825..fbadaa7 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -70,6 +70,18 @@ static const u32 host_save_user_msrs[] = {
 
 struct kvm_vcpu;
 
+struct nested_state {
+   struct vmcb *hsave;
+   u64 hsave_msr;
+   u64 vmcb;
+
+   /* These are the merged vectors */
+   u32 *msrpm;
+
+   /* gpa pointers to the real vectors */
+   u64 vmcb_msrpm;
+};
+
 struct vcpu_svm {
struct kvm_vcpu vcpu;
struct vmcb *vmcb;
@@ -85,16 +97,8 @@ struct vcpu_svm {
u64 host_gs_base;
 
u32 *msrpm;
-   struct vmcb *hsave;
-   u64 hsave_msr;
-
-   u64 nested_vmcb;
 
-   /* These are the merged vectors */
-   u32 *nested_msrpm;
-
-   /* gpa pointers to the real vectors */
-   u64 nested_vmcb_msrpm;
+   struct nested_state nested;
 };
 
 /* enable NPT for AMD64 and X86 with PAE */
@@ -127,7 +131,7 @@ static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
 
 static inline bool is_nested(struct vcpu_svm *svm)
 {
-   return svm-nested_vmcb;
+   return svm-nested.vmcb;
 }
 
 static inline void enable_gif(struct vcpu_svm *svm)
@@ -636,7 +640,7 @@ static void init_vmcb(struct vcpu_svm *svm)
}
force_new_asid(svm-vcpu);
 
-   svm-nested_vmcb = 0;
+   svm-nested.vmcb = 0;
svm-vcpu.arch.hflags = 0;
 
enable_gif(svm);
@@ -699,9 +703,9 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, 
unsigned int id)
hsave_page = alloc_page(GFP_KERNEL);
if (!hsave_page)
goto uninit;
-   svm-hsave = page_address(hsave_page);
+   svm-nested.hsave = page_address(hsave_page);
 
-   svm-nested_msrpm = page_address(nested_msrpm_pages);
+   svm-nested.msrpm = page_address(nested_msrpm_pages);
 
svm-vmcb = page_address(page);
clear_page(svm-vmcb);
@@ -731,8 +735,8 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
 
__free_page(pfn_to_page(svm-vmcb_pa  PAGE_SHIFT));
__free_pages(virt_to_page(svm-msrpm), MSRPM_ALLOC_ORDER);
-   __free_page(virt_to_page(svm-hsave));
-   __free_pages(virt_to_page(svm-nested_msrpm), MSRPM_ALLOC_ORDER);
+   __free_page(virt_to_page(svm-nested.hsave));
+   __free_pages(virt_to_page(svm-nested.msrpm), MSRPM_ALLOC_ORDER);
kvm_vcpu_uninit(vcpu);
kmem_cache_free(kvm_vcpu_cache, svm);
 }
@@ -1558,13 +1562,13 @@ static int nested_svm_exit_handled(struct vcpu_svm 
*svm, bool kvm_override)
 
switch (svm-vmcb-control.exit_code) {
case SVM_EXIT_MSR:
-   return nested_svm_do(svm, svm-nested_vmcb,
-svm-nested_vmcb_msrpm, NULL,
+   return nested_svm_do(svm, svm-nested.vmcb,
+svm-nested.vmcb_msrpm, NULL,
 nested_svm_exit_handled_msr);
default: break;
}
 
-   return nested_svm_do(svm, svm-nested_vmcb, 0, k,
+   return nested_svm_do(svm, svm-nested.vmcb, 0, k,
 nested_svm_exit_handled_real);
 }
 
@@ -1604,7 +1608,7 @@ static int nested_svm_vmexit_real(struct vcpu_svm *svm, 
void *arg1,
  void *arg2, void *opaque)
 {
struct vmcb *nested_vmcb = (struct vmcb *)arg1;
-   struct vmcb *hsave = svm-hsave;
+   struct vmcb *hsave = svm-nested.hsave;
struct vmcb *vmcb = svm-vmcb;
 
/* Give the current vmcb to the guest */
@@ -1679,7 +1683,7 @@ static int nested_svm_vmexit_real(struct vcpu_svm *svm, 
void *arg1,
svm-vmcb-control.exit_int_info = 0;
 
/* Exit nested SVM mode */
-   svm-nested_vmcb = 0;
+   svm-nested.vmcb = 0;
 
return 0;
 }
@@ -1687,7 +1691,7 @@ static int nested_svm_vmexit_real(struct vcpu_svm *svm, 
void *arg1,
 static int nested_svm_vmexit(struct vcpu_svm *svm)
 {
nsvm_printk(VMexit\n);
-   if (nested_svm_do(svm, svm-nested_vmcb, 0,
+   if (nested_svm_do(svm, svm-nested.vmcb, 0,
  NULL, nested_svm_vmexit_real))
return 1;
 
@@ -1703,8 +1707,8 @@ static int nested_svm_vmrun_msrpm(struct vcpu_svm *svm, 
void *arg1,
int i;
u32 *nested_msrpm = (u32*)arg1;
for (i=0; i PAGE_SIZE * (1  MSRPM_ALLOC_ORDER) / 4; i++)
-   svm-nested_msrpm[i] = svm-msrpm[i] | nested_msrpm[i];
-   svm-vmcb-control.msrpm_base_pa = __pa(svm-nested_msrpm);
+   svm-nested.msrpm[i] = svm-msrpm[i] | nested_msrpm[i];
+   svm-vmcb-control.msrpm_base_pa = 

[PATCH 12/47] KVM: SVM: add helper functions for global interrupt flag

2009-08-26 Thread Avi Kivity
From: Joerg Roedel joerg.roe...@amd.com

This patch makes the code easier to read when it comes to setting,
clearing and checking the status of the virtualized global
interrupt flag for the VCPU.

Signed-off-by: Joerg Roedel joerg.roe...@amd.com
Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/svm.c |   33 +
 1 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 10e718d..9f72772 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -129,6 +129,21 @@ static inline bool is_nested(struct vcpu_svm *svm)
	return svm->nested_vmcb;
 }
 
+static inline void enable_gif(struct vcpu_svm *svm)
+{
+	svm->vcpu.arch.hflags |= HF_GIF_MASK;
+}
+
+static inline void disable_gif(struct vcpu_svm *svm)
+{
+	svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
+}
+
+static inline bool gif_set(struct vcpu_svm *svm)
+{
+	return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
+}
+
 static unsigned long iopm_base;
 
 struct kvm_ldttss_desc {
@@ -621,7 +636,9 @@ static void init_vmcb(struct vcpu_svm *svm)
	force_new_asid(&svm->vcpu);
 
	svm->nested_vmcb = 0;
-	svm->vcpu.arch.hflags = HF_GIF_MASK;
+	svm->vcpu.arch.hflags = 0;
+
+	enable_gif(svm);
 }
 
 static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
@@ -1629,7 +1646,7 @@ static int nested_svm_vmexit_real(struct vcpu_svm *svm, void *arg1,
	svm->vmcb->save.cpl = 0;
	svm->vmcb->control.exit_int_info = 0;
 
-	svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
+	disable_gif(svm);
	/* Exit nested SVM mode */
	svm->nested_vmcb = 0;
 
@@ -1761,7 +1778,7 @@ static int nested_svm_vmrun(struct vcpu_svm *svm, void *arg1,
	svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
	svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
 
-	svm->vcpu.arch.hflags |= HF_GIF_MASK;
+	enable_gif(svm);
 
	return 0;
 }
@@ -1850,7 +1867,7 @@ static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
	skip_emulated_instruction(&svm->vcpu);
 
-	svm->vcpu.arch.hflags |= HF_GIF_MASK;
+	enable_gif(svm);
 
	return 1;
 }
@@ -1863,7 +1880,7 @@ static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
	skip_emulated_instruction(&svm->vcpu);
 
-	svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
+	disable_gif(svm);
 
	/* After a CLGI no interrupts should come */
	svm_clear_vintr(svm);
@@ -2352,7 +2369,7 @@ static void svm_set_irq(struct kvm_vcpu *vcpu)
 {
	struct vcpu_svm *svm = to_svm(vcpu);
 
-	BUG_ON(!(svm->vcpu.arch.hflags & HF_GIF_MASK));
+	BUG_ON(!(gif_set(svm)));
 
	svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
		SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
@@ -2383,7 +2400,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
	struct vmcb *vmcb = svm->vmcb;
	return (vmcb->save.rflags & X86_EFLAGS_IF) &&
		!(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
-		(svm->vcpu.arch.hflags & HF_GIF_MASK) &&
+		gif_set(svm) &&
		!is_nested(svm);
 }
 
@@ -2398,7 +2415,7 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
	 * GIF becomes 1, because that's a separate STGI/VMRUN intercept.
	 * The next time we get that intercept, this function will be
	 * called again though and we'll get the vintr intercept. */
-	if (svm->vcpu.arch.hflags & HF_GIF_MASK) {
+	if (gif_set(svm)) {
		svm_set_vintr(svm);
		svm_inject_irq(svm, 0x0);
	}
-- 
1.6.4.1



[PATCH 02/47] KVM: MMU: make __kvm_mmu_free_some_pages handle empty list

2009-08-26 Thread Avi Kivity
From: Izik Eidus iei...@redhat.com

First check if the list is empty before attempting to look at list
entries.

Signed-off-by: Izik Eidus iei...@redhat.com
Signed-off-by: Marcelo Tosatti mtosa...@redhat.com
Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/mmu.c |3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 1249c12..28be35c 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2705,7 +2705,8 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
 
 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
 {
-	while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES) {
+	while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES &&
+	       !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
		struct kvm_mmu_page *sp;
 
		sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
-- 
1.6.4.1



[PATCH 10/47] KVM: Replace pic_lock()/pic_unlock() with direct call to spinlock functions

2009-08-26 Thread Avi Kivity
From: Gleb Natapov g...@redhat.com

They are not doing anything else now.

Signed-off-by: Gleb Natapov g...@redhat.com
Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/i8259.c |   36 
 1 files changed, 12 insertions(+), 24 deletions(-)

diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 3aacd33..01f1516 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -32,18 +32,6 @@
 #include <linux/kvm_host.h>
 #include "trace.h"
 
-static void pic_lock(struct kvm_pic *s)
-	__acquires(&s->lock)
-{
-	spin_lock(&s->lock);
-}
-
-static void pic_unlock(struct kvm_pic *s)
-	__releases(&s->lock)
-{
-	spin_unlock(&s->lock);
-}
-
 static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
 {
	s->isr &= ~(1 << irq);
@@ -56,10 +44,10 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
 void kvm_pic_clear_isr_ack(struct kvm *kvm)
 {
	struct kvm_pic *s = pic_irqchip(kvm);
-	pic_lock(s);
+	spin_lock(&s->lock);
	s->pics[0].isr_ack = 0xff;
	s->pics[1].isr_ack = 0xff;
-	pic_unlock(s);
+	spin_unlock(&s->lock);
 }
 
 /*
@@ -160,9 +148,9 @@ static void pic_update_irq(struct kvm_pic *s)
 
 void kvm_pic_update_irq(struct kvm_pic *s)
 {
-	pic_lock(s);
+	spin_lock(&s->lock);
	pic_update_irq(s);
-	pic_unlock(s);
+	spin_unlock(&s->lock);
 }
 
 int kvm_pic_set_irq(void *opaque, int irq, int level)
@@ -170,14 +158,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
	struct kvm_pic *s = opaque;
	int ret = -1;
 
-	pic_lock(s);
+	spin_lock(&s->lock);
	if (irq >= 0 && irq < PIC_NUM_PINS) {
		ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
		pic_update_irq(s);
		trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
				      s->pics[irq >> 3].imr, ret == 0);
	}
-	pic_unlock(s);
+	spin_unlock(&s->lock);
 
	return ret;
 }
@@ -205,7 +193,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
	int irq, irq2, intno;
	struct kvm_pic *s = pic_irqchip(kvm);
 
-	pic_lock(s);
+	spin_lock(&s->lock);
	irq = pic_get_irq(&s->pics[0]);
	if (irq >= 0) {
		pic_intack(&s->pics[0], irq);
@@ -230,7 +218,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
		intno = s->pics[0].irq_base + irq;
	}
	pic_update_irq(s);
-	pic_unlock(s);
+	spin_unlock(&s->lock);
 
	return intno;
 }
@@ -448,7 +436,7 @@ static int picdev_write(struct kvm_io_device *this,
		printk(KERN_ERR "PIC: non byte write\n");
		return 0;
	}
-	pic_lock(s);
+	spin_lock(&s->lock);
	switch (addr) {
	case 0x20:
	case 0x21:
@@ -461,7 +449,7 @@ static int picdev_write(struct kvm_io_device *this,
		elcr_ioport_write(&s->pics[addr & 1], addr, data);
		break;
	}
-	pic_unlock(s);
+	spin_unlock(&s->lock);
	return 0;
 }
 
@@ -478,7 +466,7 @@ static int picdev_read(struct kvm_io_device *this,
		printk(KERN_ERR "PIC: non byte read\n");
		return 0;
	}
-	pic_lock(s);
+	spin_lock(&s->lock);
	switch (addr) {
	case 0x20:
	case 0x21:
@@ -492,7 +480,7 @@ static int picdev_read(struct kvm_io_device *this,
		break;
	}
	*(unsigned char *)val = data;
-	pic_unlock(s);
+	spin_unlock(&s->lock);
	return 0;
 }
 
-- 
1.6.4.1



[PATCH 06/47] KVM: ignore reads to perfctr msrs

2009-08-26 Thread Avi Kivity
From: Amit Shah amit.s...@redhat.com

We ignore writes to the perfctr msrs. Ignore reads as well.

Kaspersky antivirus crashes Windows guests if it can't read
these MSRs.

Signed-off-by: Amit Shah amit.s...@redhat.com
Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/x86.c |3 +++
 1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 92b5edd..132c510 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1048,9 +1048,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_K8_SYSCFG:
case MSR_K7_HWCR:
case MSR_VM_HSAVE_PA:
+   case MSR_P6_PERFCTR0:
+   case MSR_P6_PERFCTR1:
case MSR_P6_EVNTSEL0:
case MSR_P6_EVNTSEL1:
case MSR_K7_EVNTSEL0:
+   case MSR_K7_PERFCTR0:
case MSR_K8_INT_PENDING_MSG:
case MSR_AMD64_NB_CFG:
case MSR_FAM10H_MMIO_CONF_BASE:
-- 
1.6.4.1



[PATCH 05/47] KVM: VMX: Optimize vmx_get_cpl()

2009-08-26 Thread Avi Kivity
Instead of calling vmx_get_segment() (which reads a whole bunch of
vmcs fields), read only the cs selector which contains the cpl.

Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/vmx.c |5 +
 1 files changed, 1 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 32e6d20..0ba706e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1773,16 +1773,13 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
 
 static int vmx_get_cpl(struct kvm_vcpu *vcpu)
 {
-   struct kvm_segment kvm_seg;
-
	if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */
		return 0;
 
	if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */
		return 3;
 
-	vmx_get_segment(vcpu, &kvm_seg, VCPU_SREG_CS);
-	return kvm_seg.selector & 3;
+	return vmcs_read16(GUEST_CS_SELECTOR) & 3;
 }
 
 static u32 vmx_segment_access_rights(struct kvm_segment *var)
-- 
1.6.4.1



Re: [PATCH 0/2] eventfd: new EFD_STATE flag

2009-08-26 Thread Avi Kivity

On 08/26/2009 01:29 PM, Michael S. Tsirkin wrote:

How we wanted to solve it with EFD_STATE: Share a separate eventfd
between each device and the hypervisor.  device sets state to either 0
or 1.  hypervisor polls all eventfds, reads interrupt line on changes,
calculates the interrupt level and updates guest.

Alternative solution: shared memory where each device writes interrupt
line value. This makes setup more complex (need to share around much more
than just an fd), and makes access from interrupt impossible unless we
lock the memory (and locking userspace memory introduces yet another set
of issues).
   


For completeness:

If the device is implemented in the same process as the hypervisor, an 
eventfd isn't really needed, as there is an ioctl which performs the 
same operation.


An important class of device implementations is real devices that are 
assigned to a guest.  We would like to forward the interrupt directly 
from the host interrupt handler to qemu.  Currently, we have a 
kvm-specific interrupt handler that forwards the interrupt using 
kvm-specific interfaces.  We would like to use a generic interrupt 
handler implemented by uio, so we want a generic interrupt transfer 
mechanism.


uio already supports edge-triggered interrupts using an eventfd-like 
mechanism.  So it makes sense to extend uio to support real eventfds, 
and to make it also support level-triggered interrupts.


We can work around the lack of state eventfd by having userspace wait on 
whatever mechanism uio uses to make the interrupt state visible, and 
then use the ioctl mentioned above to inform the hypervisor of this 
state.  But it's faster and nicer to give both components an eventfd and 
let them communicate directly.
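
For reference, here is a minimal userspace sketch of the state-eventfd idea 
discussed above.  EFD_STATE was only a proposal at this point, so the flag 
and its read/write semantics below are assumptions, not a real kernel API:

/* Hypothetical EFD_STATE usage -- illustrative only.  Assumed semantics:
 * a write latches the current level, a read returns the last value
 * written without resetting it. */
#include <stdint.h>
#include <sys/eventfd.h>
#include <unistd.h>

/* int efd = eventfd(0, EFD_STATE);   <-- EFD_STATE is hypothetical */

/* Device side: latch the current interrupt line level (0 or 1). */
static void set_irq_level(int efd, uint64_t level)
{
	write(efd, &level, sizeof(level));
}

/* Hypervisor side: after poll() flags the fd, fetch the latest level and
 * recompute the guest interrupt line from all devices' latched states. */
static uint64_t get_irq_level(int efd)
{
	uint64_t level = 0;

	read(efd, &level, sizeof(level));
	return level;
}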


--
error compiling committee.c: too many arguments to function



[PATCH 07/47] KVM: fix EFER read buffer overflow

2009-08-26 Thread Avi Kivity
From: Roel Kluin roel.kl...@gmail.com

Check whether index is within bounds before grabbing the element.

Signed-off-by: Roel Kluin roel.kl...@gmail.com
Cc: Avi Kivity a...@redhat.com
Signed-off-by: Andrew Morton a...@linux-foundation.org
Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/kvm/vmx.c |7 +--
 1 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 0ba706e..31c3a87 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -571,12 +571,15 @@ static void reload_tss(void)
 static void load_transition_efer(struct vcpu_vmx *vmx)
 {
	int efer_offset = vmx->msr_offset_efer;
-	u64 host_efer = vmx->host_msrs[efer_offset].data;
-	u64 guest_efer = vmx->guest_msrs[efer_offset].data;
+	u64 host_efer;
+	u64 guest_efer;
	u64 ignore_bits;
 
	if (efer_offset < 0)
		return;
+	host_efer = vmx->host_msrs[efer_offset].data;
+	guest_efer = vmx->guest_msrs[efer_offset].data;
+
/*
 * NX is emulated; LMA and LME handled by hardware; SCE meaninless
 * outside long mode
-- 
1.6.4.1



[PATCH 00/47] KVM updates for 2.6.32 merge window (4/4)

2009-08-26 Thread Avi Kivity
Fourth and final batch of the 2.6.32 KVM patch queue.

Amit Shah (2):
  KVM: ignore reads to perfctr msrs
  Documentation: Update KVM list email address

Anthony Liguori (1):
  KVM: When switching to a vm8086 task, load segments as 16-bit

Avi Kivity (10):
  KVM: VMX: Optimize vmx_get_cpl()
  x86: Export kmap_atomic_to_page()
  KVM: SVM: Drop tlb flush workaround in npt
  KVM: Move #endif KVM_CAP_IRQ_ROUTING to correct place
  KVM: VMX: Adjust rflags if in real mode emulation
  KVM: Rename x86_emulate.c to emulate.c
  KVM: Add __KERNEL__ guards to exported headers
  KVM: Add missing #include
  KVM: Protect update_cr8_intercept() when running without an apic
  KVM: Document KVM_CAP_IRQCHIP

Bartlomiej Zolnierkiewicz (1):
  KVM: remove superfluous NULL pointer check in
kvm_inject_pit_timer_irqs()

Gleb Natapov (4):
  KVM: Call kvm_vcpu_kick() inside pic spinlock
  KVM: Call ack notifiers from PIC when guest OS acks an IRQ.
  KVM: Replace pic_lock()/pic_unlock() with direct call to spinlock
functions
  KVM: Update cr8 intercept when APIC TPR is changed by userspace

Izik Eidus (1):
  KVM: MMU: make __kvm_mmu_free_some_pages handle empty list

Jan Kiszka (1):
  KVM: x86: Disallow hypercalls for guest callers in rings > 0

Joerg Roedel (21):
  KVM: SVM: add helper functions for global interrupt flag
  KVM: SVM: optimize nested #vmexit
  KVM: SVM: optimize nested vmrun
  KVM: SVM: copy only necessary parts of the control area on
vmrun/vmexit
  KVM: SVM: complete interrupts after handling nested exits
  KVM: SVM: move nested svm state into seperate struct
  KVM: SVM: cache nested intercepts
  KVM: SVM: consolidate nested_svm_exit_handled
  KVM: SVM: do nested vmexit in nested_svm_exit_handled
  KVM: SVM: simplify nested_svm_check_exception
  KVM: SVM: get rid of nested_svm_vmexit_real
  KVM: SVM: clean up nested_svm_exit_handled_msr
  KVM: SVM: clean up nestec vmload/vmsave paths
  KVM: SVM: clean up nested vmrun path
  KVM: SVM: remove nested_svm_do and helper functions
  KVM: SVM: handle errors in vmrun emulation path appropriatly
  KVM: SVM: move special nested exit handling to separate function
  KVM: SVM: remove unnecessary is_nested check from svm_cpu_run
  KVM: SVM: move nested_svm_intr main logic out of if-clause
  KVM: SVM: check for nested VINTR flag in svm_interrupt_allowed
  KVM: SVM: enable nested svm by default

Marcelo Tosatti (1):
  KVM: MMU: fix bogus alloc_mmu_pages assignment

Michael S. Tsirkin (1):
  KVM: export kvm_para.h

Mikhail Ershov (1):
  KVM: Use kvm_{read,write}_guest_virt() to read and write segment
descriptors

Mohammed Gamal (1):
  KVM: x86 emulator: Add adc and sbb missing decoder flags

Roel Kluin (1):
  KVM: fix EFER read buffer overflow

Sheng Yang (1):
  KVM: VMX: Fix EPT with WP bit change during paging

 Documentation/ioctl/ioctl-number.txt   |2 +-
 Documentation/kvm/api.txt  |   76 +++
 arch/ia64/include/asm/kvm_para.h   |4 +
 arch/s390/include/asm/kvm_para.h   |4 +
 .../asm/{kvm_x86_emulate.h => kvm_emulate.h}      |    0
 arch/x86/include/asm/kvm_host.h|2 +-
 arch/x86/include/asm/kvm_para.h|2 +
 arch/x86/kvm/Makefile  |2 +-
 arch/x86/kvm/{x86_emulate.c => emulate.c}         |    8 +-
 arch/x86/kvm/i8254.c   |2 +-
 arch/x86/kvm/i8259.c   |   64 +--
 arch/x86/kvm/irq.h |1 -
 arch/x86/kvm/mmu.c |   11 +-
 arch/x86/kvm/svm.c |  650 +++-
 arch/x86/kvm/vmx.c |   25 +-
 arch/x86/kvm/x86.c |   35 +-
 arch/x86/mm/highmem_32.c   |1 +
 include/asm-generic/Kbuild.asm |5 +
 include/linux/Kbuild   |4 +
 include/linux/kvm_para.h   |1 +
 virt/kvm/kvm_main.c|2 +-
 21 files changed, 525 insertions(+), 376 deletions(-)
 rename arch/x86/include/asm/{kvm_x86_emulate.h => kvm_emulate.h} (100%)
 rename arch/x86/kvm/{x86_emulate.c => emulate.c} (99%)



[PATCH 11/47] x86: Export kmap_atomic_to_page()

2009-08-26 Thread Avi Kivity
Needed by KVM.

Signed-off-by: Avi Kivity a...@redhat.com
---
 arch/x86/mm/highmem_32.c |1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 2112ed5..572f47c 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -104,6 +104,7 @@ EXPORT_SYMBOL(kunmap);
 EXPORT_SYMBOL(kmap_atomic);
 EXPORT_SYMBOL(kunmap_atomic);
 EXPORT_SYMBOL(kmap_atomic_prot);
+EXPORT_SYMBOL(kmap_atomic_to_page);
 
 void __init set_highmem_pages_init(void)
 {
-- 
1.6.4.1



Re: [PATCH v4 0/9] make interrupt injection lockless (almost)

2009-08-26 Thread Avi Kivity

On 08/24/2009 11:54 AM, Gleb Natapov wrote:

kvm->irq_lock protects too much stuff, but still fails to protect
everything it was designed to protect (see the ack notifier call in the pic). I
want to make the IRQ injection fast path as lockless as possible. This patch
series removes kvm->irq_lock from the irq injection path, effectively making
interrupt injection to the lapic lockless (several kvm_irq_delivery_to_apic()
calls may run in parallel), but access to the lapic was never fully locked in the
first place: a VCPU could access the lapic in parallel with interrupt injection.
Patches 2-3 change the irq routing data structure to a much more efficient one.
   


Applied all, thanks.

--
error compiling committee.c: too many arguments to function



Re: Extending virtio_console to support multiple ports

2009-08-26 Thread Amit Shah
On (Tue) Aug 25 2009 [11:47:20], Amit Shah wrote:
 
 Hello all,
 
 Here is a new iteration of the patch series that implements a
 transport for guest and host communications.
 
 The code has been updated to reuse the virtio-console device instead
 of creating a new virtio-serial device.

And the problem now is that hvc calls the put_chars function with
spinlocks held and we now allocate pages in send_buf(), called from
put_chars.
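
To make the constraint concrete, here is a small kernel-style sketch (the 
function and the exact buffering are hypothetical, not the driver's actual 
code) of why this path cannot use a sleeping allocation:

/* Illustrative only: hvc invokes the backend's put_chars() with a
 * spinlock held, so this path must not sleep.  Assumes count fits in
 * one page for brevity. */
static int example_put_chars(u32 vtermno, const char *buf, int count)
{
	/* GFP_KERNEL may sleep and is invalid under the lock; GFP_ATOMIC
	 * is legal but can fail under memory pressure -- which is the
	 * dilemma described above. */
	struct page *page = alloc_page(GFP_ATOMIC);

	if (!page)
		return 0;	/* nothing sent; we cannot block for memory */

	memcpy(page_address(page), buf, count);
	/* ... hand the page to the host virtqueue and return count ... */
	return count;
}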

A few solutions:
- Keep things as they are, virtio_console.c remains as it is and
  virtio_serial.c gets added

- Have separate write paths for console devices in virtio_console.c,
  which would defeat the purpose of merging the two drivers, and then
  they'd be better off standalone

- Convert hvc's usage of spinlocks to mutexes. I've no idea how this
  will play out; I'm no expert here. But I did try doing this and so far
  it all looks OK. No lockups, lockdep warnings, nothing. I have full
  debugging enabled. But this doesn't mean it's right.

Comments?

Amit


Re: [PATCH] virtio-blk: set QUEUE_ORDERED_DRAIN by default

2009-08-26 Thread Rusty Russell
On Tue, 25 Aug 2009 11:46:08 pm Christoph Hellwig wrote:
 On Tue, Aug 25, 2009 at 11:41:37PM +0930, Rusty Russell wrote:
  On Fri, 21 Aug 2009 06:26:16 am Christoph Hellwig wrote:
   Currently virtio-blk doesn't set any QUEUE_ORDERED_ flag by default, which
   means it does not allow filesystems to use barriers.  But the typical use
   case for virtio-blk is to use a backed that uses synchronous I/O
  
  Really?  Does qemu open with O_SYNC?
  
  I'm definitely no block expert, but this seems strange...
  Rusty.
 
 Qemu can open it various ways, but the only one that is fully safe
 is O_SYNC (cache=writethrough).

(Rusty goes away and reads the qemu man page).

By default, if no explicit caching is specified for a qcow2 disk image,
cache=writeback will be used.

Are you claiming qcow2 is unusual?  I can believe snapshot is less common,
though I use it all the time.

You'd normally have to add a feature for something like this.  I don't
think this is different.

Sorry,
Rusty.


Re: Page allocation failures in guest

2009-08-26 Thread Rusty Russell
On Wed, 26 Aug 2009 02:25:01 pm Pierre Ossman wrote:
 On Wed, 26 Aug 2009 11:47:17 +0930
 Rusty Russell ru...@rustcorp.com.au wrote:
 
  On Fri, 14 Aug 2009 05:55:48 am Pierre Ossman wrote:
   On Wed, 12 Aug 2009 15:01:52 +0930
   Rusty Russell ru...@rustcorp.com.au wrote:
Subject: virtio: net refill on out-of-memory
  ... 
   Patch applied. Now we wait. :)
  
  Any results?
  
 
 It's been up for 12 days, so I'd say it works. But there is nothing in
 dmesg, which suggests I haven't triggered the condition yet.

No, that's totally expected.  I wouldn't expect a GFP_ATOMIC order 0 alloc
failure to be noted, and the patch doesn't add any printks.

Dave, can you push this to Linus ASAP?

Thanks,
Rusty.

Subject: virtio: net refill on out-of-memory

If we run out of memory, use keventd to fill the buffer.  There's a
report of this happening: "Page allocation failures in guest",
Message-ID: 20090713115158.0a489...@mjolnir.ossman.eu

Signed-off-by: Rusty Russell ru...@rustcorp.com.au
---
 drivers/net/virtio_net.c |   61 +++
 1 file changed, 46 insertions(+), 15 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -71,6 +71,9 @@ struct virtnet_info
struct sk_buff_head recv;
struct sk_buff_head send;
 
+   /* Work struct for refilling if we run low on memory. */
+   struct delayed_work refill;
+
/* Chain pages by the private ptr. */
struct page *pages;
 };
@@ -274,19 +277,22 @@ drop:
dev_kfree_skb(skb);
 }
 
-static void try_fill_recv_maxbufs(struct virtnet_info *vi)
+static bool try_fill_recv_maxbufs(struct virtnet_info *vi, gfp_t gfp)
 {
	struct sk_buff *skb;
	struct scatterlist sg[2+MAX_SKB_FRAGS];
	int num, err, i;
+	bool oom = false;
 
	sg_init_table(sg, 2+MAX_SKB_FRAGS);
	for (;;) {
		struct virtio_net_hdr *hdr;
 
		skb = netdev_alloc_skb(vi->dev, MAX_PACKET_LEN + NET_IP_ALIGN);
-		if (unlikely(!skb))
+		if (unlikely(!skb)) {
+			oom = true;
			break;
+		}
 
		skb_reserve(skb, NET_IP_ALIGN);
		skb_put(skb, MAX_PACKET_LEN);
@@ -297,7 +303,7 @@ static void try_fill_recv_maxbufs(struct
		if (vi->big_packets) {
			for (i = 0; i < MAX_SKB_FRAGS; i++) {
				skb_frag_t *f = &skb_shinfo(skb)->frags[i];
-				f->page = get_a_page(vi, GFP_ATOMIC);
+				f->page = get_a_page(vi, gfp);
				if (!f->page)
					break;
 
@@ -326,31 +332,35 @@ static void try_fill_recv_maxbufs(struct
	if (unlikely(vi->num > vi->max))
		vi->max = vi->num;
	vi->rvq->vq_ops->kick(vi->rvq);
+	return !oom;
 }
 
-static void try_fill_recv(struct virtnet_info *vi)
+/* Returns false if we couldn't fill entirely (OOM). */
+static bool try_fill_recv(struct virtnet_info *vi, gfp_t gfp)
 {
	struct sk_buff *skb;
	struct scatterlist sg[1];
	int err;
+	bool oom = false;
 
-	if (!vi->mergeable_rx_bufs) {
-		try_fill_recv_maxbufs(vi);
-		return;
-	}
+	if (!vi->mergeable_rx_bufs)
+		return try_fill_recv_maxbufs(vi, gfp);
 
	for (;;) {
		skb_frag_t *f;
 
		skb = netdev_alloc_skb(vi->dev, GOOD_COPY_LEN + NET_IP_ALIGN);
-		if (unlikely(!skb))
+		if (unlikely(!skb)) {
+			oom = true;
			break;
+		}
 
		skb_reserve(skb, NET_IP_ALIGN);
 
		f = &skb_shinfo(skb)->frags[0];
-		f->page = get_a_page(vi, GFP_ATOMIC);
+		f->page = get_a_page(vi, gfp);
		if (!f->page) {
+			oom = true;
			kfree_skb(skb);
			break;
		}
@@ -374,6 +384,7 @@ static void try_fill_recv(struct virtnet
	if (unlikely(vi->num > vi->max))
		vi->max = vi->num;
	vi->rvq->vq_ops->kick(vi->rvq);
+	return !oom;
 }
 
 static void skb_recv_done(struct virtqueue *rvq)
@@ -386,6 +397,23 @@ static void skb_recv_done(struct virtque
	}
 }
 
+static void refill_work(struct work_struct *work)
+{
+	struct virtnet_info *vi;
+	bool still_empty;
+
+	vi = container_of(work, struct virtnet_info, refill.work);
+	napi_disable(&vi->napi);
+	try_fill_recv(vi, GFP_KERNEL);
+	still_empty = (vi->num == 0);
+	napi_enable(&vi->napi);
+
+	/* In theory, this can happen: if we don't get any buffers in
+	 * we will *never* try to fill again. */
+	if (still_empty)
+		schedule_delayed_work(&vi->refill, HZ/2);
+}
+
 static int virtnet_poll(struct napi_struct 

Re: [PATCH] virtio-blk: set QUEUE_ORDERED_DRAIN by default

2009-08-26 Thread Avi Kivity

On 08/26/2009 03:06 PM, Rusty Russell wrote:

On Tue, 25 Aug 2009 11:46:08 pm Christoph Hellwig wrote:
   

On Tue, Aug 25, 2009 at 11:41:37PM +0930, Rusty Russell wrote:
 

On Fri, 21 Aug 2009 06:26:16 am Christoph Hellwig wrote:
   

Currently virtio-blk doesn't set any QUEUE_ORDERED_ flag by default, which
means it does not allow filesystems to use barriers.  But the typical use
case for virtio-blk is to use a backed that uses synchronous I/O
 

Really?  Does qemu open with O_SYNC?

I'm definitely no block expert, but this seems strange...
Rusty.
   

Qemu can open it various ways, but the only one that is fully safe
is O_SYNC (cache=writethrough).
 

(Rusty goes away and reads the qemu man page).

By default, if no explicit caching is specified for a qcow2 disk image,
cache=writeback will be used.
   


It's now switched to writethrough.  In any case, cache=writeback means 
"lie to the guest, we don't care about integrity".



Are you claiming qcow2 is unusual?  I can believe snapshot is less common,
though I use it all the time.

You'd normally have to add a feature for something like this.  I don't
think this is different.
   


Why do we need to add a feature for this?

--
error compiling committee.c: too many arguments to function



Re: [PATCH][RFC] Use return value from kvm_set_irq() to re-inject PIT interrupts.

2009-08-26 Thread Marcelo Tosatti
On Mon, Aug 24, 2009 at 09:19:05PM +0300, Gleb Natapov wrote:
   Current code is very fragile and relies on hacks to work. Let's take the
   calling of ack notifiers on pic reset as an example. Why is it needed? 
  
  To signal the ack notifier users that, in case of reset with pending
  IRR, the given interrupt has been acked (it's an artificial ack event).
  
 But IRR was not acked. The reason it is done is that otherwise the
 current logic will prevent further interrupt injection. 

Or will keep the host irq disabled, for the assigned device case (in
case you drop the hackish ack notification from pic_reset).

I don't think it exists there because of PIT reinjection only, it seems
a generic problem for users of ack notifiers (a reset notifier as you
mentioned would also do it, and be cleaner).

  Is there a need to differentiate between actual interrupt ack and reset
  with pending IRR? At the time this code was written, there was no
  indication that differentiation would be necessary.
 This is two different things. Ack notifiers should be called when guest
 acks interrupt. Calling it on reset is wrong (see below). We can add reset
 notifiers, but we just build yet another infrastructure to support
 current reinjection scheme.

It's not specific to PIT reinjection.

Anything that relies on ack notification to perform some action (either
reinjection or host irq line enablement or some other use) suffers from
the same thing.

You might argue that a separate reset notification is more appropriate.

   It is obviously the wrong thing to do from the assigned devices' POV.
  
  Thats not entirely clear to me. So what happens if a guest with PIC
  assigned device resets with a pending IRR? The host interrupt line will
  be kept disabled, even though the guest is able to process further
  interrupts?
 The host interrupt line will be enabled (assigned device ack notifier
 does this) without clearing interrupt condition in assigned device
 (guest hasn't acked irq so how can we be sure it ran device's irq
 handler?). Host will hang.
 
   Why ioapic calls mask notifiers but pic doesn't?
  
  Because it is not implemented.
 I see that. Why? Why it was important to implement for ioapic but not
 for pic? 

4780c65904f0fc4e312ee2da9383eacbe04e61ea

 Do we know what doesn't work now?

What do you mean?

   Besides diffstat for the patch shows:
   2 files changed, 16 insertions(+), 59 deletions(-)
   
   43 lines less for the same functionality. Looks like a clear win to me.
   
Ack notifiers are asynchronous notifications. Using the return value
from kvm_set_irq implies that timer emulation is based on a tick
generating device on the host side.
   No notification is needed in the first place. You know immediately
   if injection fails or not. I don't see why using return value from
   kvm_set_irq implies that timer emulation is based on a tick generating
   device on the host side? What can you do with ack notifiers that can't
   be done without?
  
  If you don't have a host timer emulating the guest PIT, to periodically
  bang on kvm_set_irq, how do you know when to attempt reinjection?
  
  You keep calling kvm_set_irq on every guest entry to figure out when 
  reinjection is possible?
 If we have a timer to inject then yes. It is relatively cheap. Most of the
 time pending count will be zero.

Won't work with non-tick-based emulation on the host.



Re: [PATCH][RFC] Use return value from kvm_set_irq() to re-inject PIT interrupts.

2009-08-26 Thread Marcelo Tosatti
On Mon, Aug 24, 2009 at 10:01:50PM +0300, Gleb Natapov wrote:
 On Mon, Aug 24, 2009 at 09:19:05PM +0300, Gleb Natapov wrote:
It is obviously the wrong thing to do from the assigned devices' POV.
   
   Thats not entirely clear to me. So what happens if a guest with PIC
   assigned device resets with a pending IRR? The host interrupt line will
   be kept disabled, even though the guest is able to process further
   interrupts?
  The host interrupt line will be enabled (assigned device ack notifier
  does this) without clearing interrupt condition in assigned device
  (guest hasn't acked irq so how can we be sure it ran device's irq
  handler?). Host will hang.
  
 Actually, on second thought, it will not hang. Next time the host
 interrupt handler runs it will disable the interrupt once again.

Right. And if you don't signal ack notification on reset with pending
IRR the host line will be kept disabled.

As said in the other email, it's not specific to PIT reinjection. 


Re: [PATCH][RFC] Use return value from kvm_set_irq() to re-inject PIT interrupts.

2009-08-26 Thread Gleb Natapov
On Wed, Aug 26, 2009 at 09:43:48AM -0300, Marcelo Tosatti wrote:
 On Mon, Aug 24, 2009 at 09:19:05PM +0300, Gleb Natapov wrote:
Current code is very fragile and relies on hacks to work. Let's take the calling
of ack notifiers on pic reset as an example. Why is it needed? 
   
   To signal the ack notifier users that, in case of reset with pending
   IRR, the given interrupt has been acked (it's an artificial ack event).
   
  But IRR was not acked. The reason it is done is that otherwise the
  current logic will prevent further interrupt injection. 
 
 Or will keep the host irq disabled, for the assigned device case (in
 case you drop the hackish ack notification from pic_reset).
 
 I don't think it exists there because of PIT reinjection only, it seems
 a generic problem for users of ack notifiers (a reset notifier as you
 mentioned would also do it, and be cleaner).
 
Yes, I agree pic reset should be propagated to assigned devices somehow.

   Is there a need to differentiate between actual interrupt ack and reset
   with pending IRR? At the time this code was written, there was no
   indication that differentiation would be necessary.
  This is two different things. Ack notifiers should be called when guest
  acks interrupt. Calling it on reset is wrong (see below). We can add reset
  notifiers, but we just build yet another infrastructure to support
  current reinjection scheme.
 
  It's not specific to PIT reinjection.
 
 Anything that relies on ack notification to perform some action (either
 reinjection or host irq line enablement or some other use) suffers from
 the same thing.
 
 You might argue that a separate reset notification is more appropriate.
 
 It is obviously the wrong thing to do from the assigned devices' POV.
   
   Thats not entirely clear to me. So what happens if a guest with PIC
   assigned device resets with a pending IRR? The host interrupt line will
   be kept disabled, even though the guest is able to process further
   interrupts?
  The host interrupt line will be enabled (assigned device ack notifier
  does this) without clearing interrupt condition in assigned device
  (guest hasn't acked irq so how can we be sure it ran device's irq
  handler?). Host will hang.
  
Why ioapic calls mask notifiers but pic doesn't?
   
   Because it is not implemented.
  I see that. Why? Why it was important to implement for ioapic but not
  for pic? 
 
 4780c65904f0fc4e312ee2da9383eacbe04e61ea
 
This commit and the previous one add infrastructure to fix a bug that is
there only because of how we choose to do pit reinjection. Do it differently
and you can revert both of them.

  Do we know what doesn't work now?
 
 What do you mean?
I mean that the pit doesn't call the mask notifier, so a bug similar to 4780c65
hides somewhere out there. How can we test it?

 
Besides diffstat for the patch shows:
2 files changed, 16 insertions(+), 59 deletions(-)

43 lines less for the same functionality. Looks like a clear win to me.

 Ack notifiers are asynchronous notifications. Using the return value
 from kvm_set_irq implies that timer emulation is based on a tick
 generating device on the host side.
No notification is needed in the first place. You know immediately
if injection fails or not. I don't see why using return value from
kvm_set_irq implies that timer emulation is based on a tick generating
device on the host side? What can you do with ack notifiers that can't
be done without?
   
   If you don't have a host timer emulating the guest PIT, to periodically
   bang on kvm_set_irq, how do you know when to attempt reinjection?
   
   You keep calling kvm_set_irq on every guest entry to figure out when 
   reinjection is possible?
  If we have a timer to inject then yes. It is relatively cheap. Most of the
  time pending count will be zero.
 
 Won't work with non-tick-based emulation on the host.
Why? This is the most important point, can you elaborate?

--
Gleb.


[KVM-AUTOTEST PATCH] kvm: specify nic_model explicitly for rtl8139

2009-08-26 Thread Avi Kivity
Instead of relying on the default nic_model, specify it explicitly.  Different
qemu branches use different defaults, and the default may change.

Signed-off-by: Avi Kivity a...@redhat.com
---
 client/tests/kvm/kvm_tests.cfg.sample |1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/client/tests/kvm/kvm_tests.cfg.sample 
b/client/tests/kvm/kvm_tests.cfg.sample
index a83ef9b..aa6162f 100644
--- a/client/tests/kvm/kvm_tests.cfg.sample
+++ b/client/tests/kvm/kvm_tests.cfg.sample
@@ -145,6 +145,7 @@ variants:
     # NICs
     variants:
         - @rtl8139:
+            nic_model = rtl8139
         - virtio:   rtl8139.install rtl8139.setup
             no install setup
             nic_model = virtio
-- 
1.6.4.1



Re: [PATCH][RFC] Use return value from kvm_set_irq() to re-inject PIT interrupts.

2009-08-26 Thread Marcelo Tosatti
On Wed, Aug 26, 2009 at 04:19:17PM +0300, Gleb Natapov wrote:
 On Wed, Aug 26, 2009 at 09:43:48AM -0300, Marcelo Tosatti wrote:
  On Mon, Aug 24, 2009 at 09:19:05PM +0300, Gleb Natapov wrote:
 Current code is very fragile and relies on hacks to work. Let's take the
 calling of ack notifiers on pic reset as an example. Why is it needed? 

To signal the ack notifier users that, in case of reset with pending
IRR, the given interrupt has been acked (it's an artificial ack event).

   But IRR was not acked. The reason it is done is that otherwise the
   current logic will prevent further interrupt injection. 
  
  Or will keep the host irq disabled, for the assigned device case (in
  case you drop the hackish ack notification from pic_reset).
  
  I don't think it exists there because of PIT reinjection only, it seems
  a generic problem for users of ack notifiers (a reset notifier as you
  mentioned would also do it, and be cleaner).
  
 Yes, I agree pic reset should be propagated to assigned devices somehow.
 
Is there a need to differentiate between actual interrupt ack and reset
with pending IRR? At the time this code was written, there was no
indication that differentiation would be necessary.
   This is two different things. Ack notifiers should be called when guest
   acks interrupt. Calling it on reset is wrong (see below). We can add reset
   notifiers, but we just build yet another infrastructure to support
   current reinjection scheme.
  
  It's not specific to PIT reinjection.
  
  Anything that relies on ack notification to perform some action (either
  reinjection or host irq line enablement or some other use) suffers from
  the same thing.
  
  You might argue that a separate reset notification is more appropriate.
  
 It is obviously the wrong thing to do from the assigned devices' POV.

Thats not entirely clear to me. So what happens if a guest with PIC
assigned device resets with a pending IRR? The host interrupt line will
be kept disabled, even though the guest is able to process further
interrupts?
   The host interrupt line will be enabled (assigned device ack notifier
   does this) without clearing interrupt condition in assigned device
   (guest hasn't acked irq so how can we be sure it ran device's irq
   handler?). Host will hang.
   
 Why ioapic calls mask notifiers but pic doesn't?

Because it is not implemented.
   I see that. Why? Why it was important to implement for ioapic but not
   for pic? 
  
  4780c65904f0fc4e312ee2da9383eacbe04e61ea
  
 This commit and the previous one add infrastructure to fix a bug that is
 there only because of how we choose to do pit reinjection. Do it differently
 and you can revert both of them.
 
   Do we know what doesn't work now?
  
  What do you mean?
 I mean that the pit doesn't call the mask notifier, so a bug similar to 4780c65
 hides somewhere out there. How can we test it?
 
  
 Besides diffstat for the patch shows:
 2 files changed, 16 insertions(+), 59 deletions(-)
 
  43 lines less for the same functionality. Looks like a clear win to me.
 
  Ack notifiers are asynchronous notifications. Using the return value
  from kvm_set_irq implies that timer emulation is based on a tick
  generating device on the host side.
 No notification is needed in the first place. You know immediately
 if injection fails or not. I don't see why using return value from
 kvm_set_irq implies that timer emulation is based on a tick 
 generating
 device on the host side? What can you do with ack notifiers that 
 can't
 be done without?

If you don't have a host timer emulating the guest PIT, to periodically
bang on kvm_set_irq, how do you know when to attempt reinjection?

You keep calling kvm_set_irq on every guest entry to figure out when 
reinjection is possible?
   If we have a timer to inject then yes. It is relatively cheap. Most of the
   time pending count will be zero.
  
  Won't work with non-tick-based emulation on the host.
 Why? This is the most important point, can you elaborate?

From http://www.mail-archive.com/kvm@vger.kernel.org/msg18644.html.

An injectable timer interrupt is defined by:

- time(now) >= time(next_expiration)
- Previous timer interrupt has been acked (thus we can inject).
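
In code form (a sketch only; the struct is illustrative, and 'can_inject'
mirrors the ktimer field quoted further down):

#include <stdbool.h>
#include <stdint.h>

struct ktimer_model {
	uint64_t next_expiration;	/* absolute time of the next tick */
	bool     can_inject;		/* previous tick acked by the guest */
};

/* Both conditions above must hold before the timer irq can be injected. */
static bool timer_interrupt_injectable(const struct ktimer_model *kt,
				       uint64_t now)
{
	return now >= kt->next_expiration && kt->can_inject;
}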

The thing is, sure you can drop ack notifiers and check IRR
on every guest entry, but why bother if you can receive an
asynchronous notification?

Would you prefer to replace

+   if (!ktimer->can_inject)

With
kvm_set_irq()

?

Not relatively cheap.


Re: [PATCH][RFC] Use return value from kvm_set_irq() to re-inject PIT interrupts.

2009-08-26 Thread Avi Kivity

On 08/26/2009 04:38 PM, Marcelo Tosatti wrote:


An injectable timer interrupt is defined by:

- time(now) >= time(next_expiration)
- Previous timer interrupt has been acked (thus we can inject).

The thing is, sure you can drop ack notifiers and check IRR
on every guest entry, but why bother if you can receive an
asynchronous notification?

Would you prefer to replace

+   if (!ktimer->can_inject)

With
kvm_set_irq()

?

Not relatively cheap.
   


Well, we expect it to be a rare condition that we have pending timer 
interrupts, so if it leads to significant code simplification, it can be 
worthwhile.



--
error compiling committee.c: too many arguments to function



Re: [PATCH][RFC] Use return value from kvm_set_irq() to re-inject PIT interrupts.

2009-08-26 Thread Marcelo Tosatti
On Wed, Aug 26, 2009 at 04:19:17PM +0300, Gleb Natapov wrote:
 On Wed, Aug 26, 2009 at 09:43:48AM -0300, Marcelo Tosatti wrote:
  On Mon, Aug 24, 2009 at 09:19:05PM +0300, Gleb Natapov wrote:
 Current code is very fragile and relies on hacks to work. Let's take the
 calling of ack notifiers on pic reset as an example. Why is it needed? 

To signal the ack notifier users that, in case of reset with pending
IRR, the given interrupt has been acked (it's an artificial ack event).

   But IRR was not acked. The reason it is done is that otherwise the
   current logic will prevent further interrupt injection. 
  
  Or will keep the host irq disabled, for the assigned device case (in
  case you drop the hackish ack notification from pic_reset).
  
  I don't think it exists there because of PIT reinjection only, it seems
  a generic problem for users of ack notifiers (a reset notifier as you
  mentioned would also do it, and be cleaner).
  
 Yes, I agree pic reset should be propagated to assigned devices somehow.
 
Is there a need to differentiate between actual interrupt ack and reset
with pending IRR? At the time this code was written, there was no
indication that differentiation would be necessary.
   This is two different things. Ack notifiers should be called when guest
   acks interrupt. Calling it on reset is wrong (see below). We can add reset
   notifiers, but we just build yet another infrastructure to support
   current reinjection scheme.
  
  It's not specific to PIT reinjection.
  
  Anything that relies on ack notification to perform some action (either
  reinjection or host irq line enablement or some other use) suffers from
  the same thing.
  
  You might argue that a separate reset notification is more appropriate.
  
 It is obviously the wrong thing to do from the assigned devices' POV.

Thats not entirely clear to me. So what happens if a guest with PIC
assigned device resets with a pending IRR? The host interrupt line will
be kept disabled, even though the guest is able to process further
interrupts?
   The host interrupt line will be enabled (assigned device ack notifier
   does this) without clearing interrupt condition in assigned device
   (guest hasn't acked irq so how can we be sure it ran device's irq
   handler?). Host will hang.
   
 Why ioapic calls mask notifiers but pic doesn't?

Because it is not implemented.
   I see that. Why? Why it was important to implement for ioapic but not
   for pic? 
  
  4780c65904f0fc4e312ee2da9383eacbe04e61ea
  
 This commit and the previous one add infrastructure to fix a bug that is
 there only because of how we choose to do pit reinjection. Do it differently
 and you can revert both of them.
 
   Do we know what doesn't work now?
  
  What do you mean?
 I mean that the pit doesn't call the mask notifier, so a bug similar to 4780c65
 hides somewhere out there. How can we test it?

Program periodic PIT, mask irq0, wait a while, unmask irq0 ?




Re: [PATCH][RFC] Use return value from kvm_set_irq() to re-inject PIT interrupts.

2009-08-26 Thread Gleb Natapov
On Wed, Aug 26, 2009 at 10:38:42AM -0300, Marcelo Tosatti wrote:
 If you don't have a host timer emulating the guest PIT, to 
 periodically
 bang on kvm_set_irq, how do you know when to attempt reinjection?
 
 You keep calling kvm_set_irq on every guest entry to figure out when 
 reinjection is possible?
If we have a timer to inject then yes. It is relatively cheap. Most of the
time pending count will be zero.
   
   Won't work with non-tick-based emulation on the host.
  Why? This is the most important point, can you elaborate?
 
 From http://www.mail-archive.com/kvm@vger.kernel.org/msg18644.html.
 
 An injectable timer interrupt is defined by:
 
 - time(now) >= time(next_expiration)
 - Previous timer interrupt has been acked (thus we can inject).
 
 The thing is, sure you can drop ack notifiers and check IRR
 on every guest entry, but why bother if you can receive an
 asynchronous notification?
 
 Would you prefer to replace
 
 +   if (!ktimer->can_inject)
 
 With
   kvm_set_irq()
 
 ?
 
 Not relatively cheap.
Most of the time, time(now) will be less than time(next_expiration), so
on most entries kvm_set_irq() will not be called at all. When an interrupt
has to be injected I prefer to try to inject it ASAP. The PIC and APIC
effectively have a 2-element interrupt queue (irr/isr), so injection may
succeed even though the ack was not yet received.
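
A rough model of that point (illustrative only, not the kvm PIC code):
one interrupt can sit in service (isr) while the next is already latched
in irr, so a second injection succeeds before the first ack arrives:

#include <stdint.h>

/* Toy model of an 8259-style register pair for one pin. */
struct pic_model {
	uint8_t irr;	/* requested, awaiting delivery */
	uint8_t isr;	/* delivered, awaiting ack (EOI) */
};

/* Injection only fails if the pin is already latched in irr; a bit
 * still set in isr does not block it -- hence the 2-element queue. */
static int try_inject(struct pic_model *p, int irq)
{
	uint8_t bit = 1 << irq;

	if (p->irr & bit)
		return 0;	/* still pending: retry after delivery */
	p->irr |= bit;
	return 1;
}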

--
Gleb.


Re: [PATCHv4 2/2] vhost_net: a kernel-level virtio server

2009-08-26 Thread Michael S. Tsirkin
On Wed, Aug 26, 2009 at 03:40:59PM +0200, Arnd Bergmann wrote:
 On Tuesday 25 August 2009, Michael S. Tsirkin wrote:
I'd like to avoid that here,
   though it's kind of ugly.  We'd need VHOST_GET_FEATURES (and ACK) to
   take a struct like:
   
 u32 feature_size;
 u32 features[];
 
 Hmm, variable length ioctl arguments, I'd rather not go there.
 The ioctl infrastructure already has a length argument encoded
 in the ioctl number. We can use that if we need more, e.g.
 
 /* now */
 #define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64)
 /*
  * uncomment if we run out of feature bits:
 
 struct vhost_get_features2 {
   __u64 bits[2];
 };
 #define VHOST_GET_FEATURES2 _IOR(VHOST_VIRTIO, 0x00, \
   struct  vhost_get_features2)
  */
 
  Thinking about this proposal some more, how will the guest
  determine the size to supply the GET_FEATURES ioctl?
 
 Wait, the *guest*?

Sorry, the userspace hypervisor.

 Maybe I misunderstood something in a major way here, but
 I expected the features to be negotiated between host
 user space (qemu) and host kernel, as well as between
 guest and qemu (as they are already), but never between
 guest and kernel.

Yes.

 I would certainly expect the bits to be distinct from
 the virtio-net feature bits. E.g. stuff like TAP frame
 format opposed to TCP socket frame format (length+data)
 is something we need to negotiate here but that the
 guest does not care about.

My idea is to use virtio format for things I share with virtio (e.g.
mergeable buffers).  Since we are a kind of transport, I thought that I
will use the transport bits, that is bits 28 and up for vhost things.
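
As an illustration of that split (the virtio constants below are real;
the vhost-private bit and its name are hypothetical):

#include <stdint.h>

#define VIRTIO_NET_F_MRG_RXBUF	 (1ULL << 15)	/* shared with the guest */
#define VIRTIO_TRANSPORT_F_START 28
#define VHOST_F_EXAMPLE		 (1ULL << 28)	/* hypothetical vhost bit */

static inline int feature_acked(uint64_t acked_features, uint64_t bit)
{
	return (acked_features & bit) != 0;
}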

  Since we are a bit tight in 32 bit space already,
  let's just use a 64 bit integer and be done with it?
 
 Can't hurt, but don't use a struct unless you think
 we are going to need more than 64 bits.
 
   Arnd 



Re: [PATCH 05/47] KVM: VMX: Optimize vmx_get_cpl()

2009-08-26 Thread Roel Kluin
On 26-08-09 12:29, Avi Kivity wrote:
 Instead of calling vmx_get_segment() (which reads a whole bunch of
 vmcs fields), read only the cs selector which contains the cpl.
 
 Signed-off-by: Avi Kivity a...@redhat.com

Can't we also optimise cs_ss_rpl_check()? (Please review, untested.)

--- 8< ----------- 8< ---
Instead of calling vmx_get_segment() (which reads a whole bunch of
vmcs fields), read only the cs/ss selectors, which contain the rpls.

Signed-off-by: Roel Kluin roel.kl...@gmail.com
---
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 29f9129..5d8512a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1932,13 +1932,8 @@ static bool ldtr_valid(struct kvm_vcpu *vcpu)
 
 static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
 {
-	struct kvm_segment cs, ss;
-
-	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
-	vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
-
-	return ((cs.selector & SELECTOR_RPL_MASK) ==
-		(ss.selector & SELECTOR_RPL_MASK));
+	return ((vmcs_read16(GUEST_CS_SELECTOR) & SELECTOR_RPL_MASK) ==
+		(vmcs_read16(GUEST_SS_SELECTOR) & SELECTOR_RPL_MASK));
 }
 
 /*


Re: [PATCH 05/47] KVM: VMX: Optimize vmx_get_cpl()

2009-08-26 Thread Avi Kivity

On 08/26/2009 05:15 PM, Roel Kluin wrote:

Op 26-08-09 12:29, Avi Kivity schreef:
   

Instead of calling vmx_get_segment() (which reads a whole bunch of
vmcs fields), read only the cs selector which contains the cpl.

Signed-off-by: Avi Kivity a...@redhat.com
 

Can't we also optimise cs_ss_rpl_check()? (Please review, untested.)

--- 8< ----------- 8< ---
Instead of calling vmx_get_segment() (which reads a whole bunch of
vmcs fields), read only the cs/ss selectors, which contain the rpls.

   


It's really a slowpath, so I prefer not to touch it.  We're likely to 
start caching guest segment fields soon, so the less code that reads 
them directly, the better.



Signed-off-by: Roel Kluin roel.kl...@gmail.com
---
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 29f9129..5d8512a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1932,13 +1932,8 @@ static bool ldtr_valid(struct kvm_vcpu *vcpu)

  static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
  {
-	struct kvm_segment cs, ss;
-
-	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
-	vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
-
-	return ((cs.selector & SELECTOR_RPL_MASK) ==
-		(ss.selector & SELECTOR_RPL_MASK));
+	return ((vmcs_read16(GUEST_CS_SELECTOR) & SELECTOR_RPL_MASK) ==
+		(vmcs_read16(GUEST_SS_SELECTOR) & SELECTOR_RPL_MASK));
  }

  /*
   



--
error compiling committee.c: too many arguments to function



Re: [PATCHv4 2/2] vhost_net: a kernel-level virtio server

2009-08-26 Thread Arnd Bergmann
On Tuesday 25 August 2009, Michael S. Tsirkin wrote:
   I'd like to avoid that here,
  though it's kind of ugly.  We'd need VHOST_GET_FEATURES (and ACK) to take a
  struct like:
  
u32 feature_size;
u32 features[];

Hmm, variable length ioctl arguments, I'd rather not go there.
The ioctl infrastructure already has a length argument encoded
in the ioctl number. We can use that if we need more, e.g.

/* now */
#define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64)
/*
 * uncomment if we run out of feature bits:

struct vhost_get_features2 {
__u64 bits[2];
};
#define VHOST_GET_FEATURES2 _IOR(VHOST_VIRTIO, 0x00, \
struct  vhost_get_features2)
 */

 Thinking about this proposal some more, how will the guest
 determine the size to supply the GET_FEATURES ioctl?

Wait, the *guest*?

Maybe I misunderstood something in a major way here, but
I expected the features to be negotiated between host
user space (qemu) and host kernel, as well as between
guest and qemu (as they are already), but never between
guest and kernel.

I would certainly expect the bits to be distinct from
the virtio-net feature bits. E.g. stuff like TAP frame
format opposed to TCP socket frame format (length+data)
is something we need to negotiate here but that the
guest does not care about.

 Since we are a bit tight in 32 bit space already,
 let's just use a 64 bit integer and be done with it?

Can't hurt, but don't use a struct unless you think
we are going to need more than 64 bits.

Arnd 


Performance data when running Windows VMs

2009-08-26 Thread Andrew Theurer
I recently gathered some performance data when running Windows Server
2008 VMs, and I wanted to share it here.  There are 12 Windows
Server 2008 64-bit VMs (1 vcpu, 2 GB) running which handle the concurrent
execution of 6 J2EE type benchmarks.  Each benchmark needs a App VM and
a Database VM.  The benchmark clients inject a fixed rate of requests
which yields X% CPU utilization on the host.  A different hypervisor was
compared; KVM used about 60% more CPU cycles to complete the same amount
of work.  Both had their hypervisor specific paravirt IO drivers in the
VMs.

Server is a 2 socket Core/i7, SMT off, with 72 GB memory

Host kernel used was kvm.git v2.6.31-rc3-3419-g6df4865
Qemu was kvm-87.  I tried a few newer versions of Qemu; none of them
worked with the RedHat virtIO Windows drivers.  I tried:

f3600c589a9ee5ea4c0fec74ed4e06a15b461d52
0.11.0-rc1
0.10.6
kvm-88

All but 0.10.6 had a "Problem code 10" driver error in the VM.  0.10.6 had
"a disk read error occurred" very early in the booting of the VM.

I/O on the host was not what I would call very high:  outbound network
averaged at 163 Mbit/s, inbound was 8 Mbit/s, while disk read ops was
243/sec and write ops was 561/sec

Host CPU breakdown was the following:

user  nice  system  irq   softirq  guest  idle   iowait
5.67  0.00  11.64   0.09  1.05     31.90  46.06  3.59


The amount of kernel time had me concerned.  Here is oprofile:


 samples  %        app name                 symbol name
 1163422  52.3744  kvm-intel.ko             vmx_vcpu_run
 103996    4.6816  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
 native_set_debugreg
 81036 3.6480  kvm.ko   kvm_arch_vcpu_ioctl_run
 37913 1.7068  qemu-system-x86_64   cpu_physical_memory_rw
 34720 1.5630  qemu-system-x86_64   phys_page_find_alloc
 23234 1.0459  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
 native_write_msr_safe
 20964 0.9437  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
 native_get_debugreg
 17628 0.7936  libc-2.5.so  memcpy
 16587 0.7467  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
 __down_read
 15681 0.7059  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
 __up_read
 15466 0.6962  kvm.ko   find_highest_vector
 14611 0.6578  qemu-system-x86_64   qemu_get_ram_ptr
 11254 0.5066  kvm-intel.ko vmcs_writel
 11133 0.5012  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
 copy_user_generic_string
 10917 0.4915  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
 native_read_msr_safe
 10760 0.4844  qemu-system-x86_64   virtqueue_get_head
 9025  0.4063  kvm-intel.ko vmx_handle_exit
 8953  0.4030  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
 schedule
 8753  0.3940  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
 fget_light
 8465  0.3811  qemu-system-x86_64   virtqueue_avail_bytes
 8185  0.3685  kvm-intel.ko handle_cr
 8069  0.3632  kvm.ko   kvm_set_irq
 7697  0.3465  kvm.ko   kvm_lapic_sync_from_vapic
 7586  0.3415  qemu-system-x86_64   main_loop_wait
 7480  0.3367  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
 do_select
 7121  0.3206  qemu-system-x86_64   lduw_phys
 7003  0.3153  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
 audit_syscall_exit
 6062  0.2729  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 kfree
 5477  0.2466  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 fput
 5454  0.2455  kvm.ko   kvm_lapic_get_cr8
 5096  0.2294  kvm.ko   kvm_load_guest_fpu
 5057  0.2277  kvm.ko   apic_update_ppr
 4929  0.2219  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
 up_read
 4900  0.2206  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
 audit_syscall_entry
 4866  0.2191  kvm.ko   kvm_apic_has_interrupt
 4670  0.2102  kvm-intel.ko skip_emulated_instruction
 4644  0.2091  kvm.ko   kvm_cpu_has_interrupt
 4548  0.2047  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
 __switch_to
 4328  0.1948  kvm.ko   kvm_apic_accept_pic_intr
 4303  0.1937  libpthread-2.5.sopthread_mutex_lock
 4235  0.1906  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
 system_call
 4175  0.1879  kvm.ko   kvm_put_guest_fpu
 4170  0.1877  qemu-system-x86_64   ldl_phys
 4098  0.1845  kvm-intel.ko vmx_set_interrupt_shadow
 4003  0.1802  qemu-system-x86_64   kvm_run

I was wondering why the get/set debugreg was so high.  I don't recall
seeing this much with Linux VMs.

Here is an average of kvm_stat:


 efer_relo  0
 exits  1262814
 fpu_reloa  103842
 halt_exit  9918
 halt_wake  9763
 host_stat  103846
 hypercall  0
 insn_emul  23277
 insn_emul 

Re: Performance data when running Windows VMs

2009-08-26 Thread Avi Kivity

On 08/26/2009 05:57 PM, Andrew Theurer wrote:

I recently gathered some performance data when running Windows Server
2008 VMs, and I wanted to share it here.  There are 12 Windows Server
2008 64-bit VMs (1 vcpu, 2 GB) running, which handle the concurrent
execution of 6 J2EE-type benchmarks.  Each benchmark needs an App VM and
a Database VM.  The benchmark clients inject a fixed rate of requests
which yields X% CPU utilization on the host.  A different hypervisor was
compared; KVM used about 60% more CPU cycles to complete the same amount
of work.  Both had their hypervisor-specific paravirt IO drivers in the
VMs.

Server is a 2 socket Core/i7, SMT off, with 72 GB memory
   


Did you use large pages?


Host kernel used was kvm.git v2.6.31-rc3-3419-g6df4865
Qemu was kvm-87.  I tried a few newer versions of Qemu; none of them
worked with the RedHat virtIO Windows drivers.  I tried:

f3600c589a9ee5ea4c0fec74ed4e06a15b461d52
0.11.0-rc1
0.10.6
kvm-88

All but 0.10.6 had a "Problem code 10" driver error in the VM.  0.10.6 had
"a disk read error occurred" very early in the booting of the VM.
   


Yan?


I/O on the host was not what I would call very high: outbound network
averaged 163 Mbit/s, inbound 8 Mbit/s, while disk read ops were
243/sec and write ops 561/sec.
   


What was the disk bandwidth used?  Presumably, direct access to the 
volume with cache=off?


linux-aio should help reduce cpu usage.


Host CPU breakdown was the following:

user  nice  system  irq   softirq  guest  idle   iowait
5.67  0.00  11.64   0.09  1.05     31.90  46.06  3.59


The amount of kernel time had me concerned.  Here is oprofile:
   


user+system is about 55% of guest time, and it's all overhead.
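That is, from the breakdown above: (5.67 + 11.64) / 31.90 is about 0.54,
and counting irq and softirq as well brings it to roughly 0.58.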


samples  %app name symbol name
1163422  52.3744  kvm-intel.ko vmx_vcpu_run
103996    4.6816  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
native_set_debugreg
81036 3.6480  kvm.ko   kvm_arch_vcpu_ioctl_run
37913 1.7068  qemu-system-x86_64   cpu_physical_memory_rw
34720 1.5630  qemu-system-x86_64   phys_page_find_alloc
 


We should really optimize these two.


23234 1.0459  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
native_write_msr_safe
20964 0.9437  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
native_get_debugreg
17628 0.7936  libc-2.5.so  memcpy
16587 0.7467  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
__down_read
15681 0.7059  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
__up_read
15466 0.6962  kvm.ko   find_highest_vector
14611 0.6578  qemu-system-x86_64   qemu_get_ram_ptr
11254 0.5066  kvm-intel.ko vmcs_writel
11133 0.5012  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
copy_user_generic_string
10917 0.4915  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
native_read_msr_safe
10760 0.4844  qemu-system-x86_64   virtqueue_get_head
9025  0.4063  kvm-intel.ko vmx_handle_exit
8953  0.4030  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
schedule
8753  0.3940  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
fget_light
8465  0.3811  qemu-system-x86_64   virtqueue_avail_bytes
8185  0.3685  kvm-intel.ko handle_cr
8069  0.3632  kvm.ko   kvm_set_irq
7697  0.3465  kvm.ko   kvm_lapic_sync_from_vapic
7586  0.3415  qemu-system-x86_64   main_loop_wait
7480  0.3367  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
do_select
7121  0.3206  qemu-system-x86_64   lduw_phys
7003  0.3153  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
audit_syscall_exit
6062  0.2729  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 kfree
5477  0.2466  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 fput
5454  0.2455  kvm.ko   kvm_lapic_get_cr8
5096  0.2294  kvm.ko   kvm_load_guest_fpu
5057  0.2277  kvm.ko   apic_update_ppr
4929  0.2219  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 up_read
4900  0.2206  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
audit_syscall_entry
4866  0.2191  kvm.ko   kvm_apic_has_interrupt
4670  0.2102  kvm-intel.ko skip_emulated_instruction
4644  0.2091  kvm.ko   kvm_cpu_has_interrupt
4548  0.2047  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
__switch_to
4328  0.1948  kvm.ko   kvm_apic_accept_pic_intr
4303  0.1937  libpthread-2.5.sopthread_mutex_lock
4235  0.1906  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
system_call
4175  0.1879  kvm.ko   kvm_put_guest_fpu
4170  0.1877  qemu-system-x86_64   ldl_phys
4098  0.1845  kvm-intel.ko vmx_set_interrupt_shadow
4003  0.1802  qemu-system-x86_64   kvm_run
 

Re: Extending virtio_console to support multiple ports

2009-08-26 Thread Amit Shah
[cc'ing some people who have made some commits in hvc_console.c]

On (Wed) Aug 26 2009 [16:57:18], Amit Shah wrote:
 On (Tue) Aug 25 2009 [11:47:20], Amit Shah wrote:
  
  Hello all,
  
  Here is a new iteration of the patch series that implements a
  transport for guest and host communications.
  
  The code has been updated to reuse the virtio-console device instead
  of creating a new virtio-serial device.
 
 And the problem now is that hvc calls the put_chars function with
 spinlocks held and we now allocate pages in send_buf(), called from
 put_chars.
 
 A few solutions:

[snip]

 - Convert hvc's usage of spinlocks to mutexes. I've no idea how this
   will play out; I'm no expert here. But I did try doing this and so far
   it all looks OK. No lockups, lockdep warnings, nothing. I have full
   debugging enabled. But this doesn't mean it's right.

So just to test this further I added the capability to have more than
one hvc console spawned from virtio_console, created two consoles and did
a 'cat' of a file in each of the virtio consoles.  It's been running for
half an hour now without any badness.  No spew in the debug logs either.

I also checked the code in hvc_console.c that takes the spinlocks.
Nothing there runs from (or needs to run from) interrupt context.
So the change to mutexes does seem reasonable.  Also, the spinlock code
was added really long ago -- git blame shows Linus' first git commit
introduced them, so it's pure legacy baggage.

Also found a bug: hvc_resize() wants to be called with a lock held
(hp->lock) but virtio_console just calls it directly.
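
A sketch of one possible fix: take hp->lock inside a wrapper instead of
calling the resize logic bare.  Splitting out a lock-free __hvc_resize()
is an assumption here, not existing code, and this is written against
the current spinlock version of hvc_console.c:

/* Hypothetical wrapper: grab hp->lock around the resize. */
static void hvc_resize_locked(struct hvc_struct *hp, struct winsize ws)
{
	unsigned long flags;

	spin_lock_irqsave(&hp->lock, flags);
	__hvc_resize(hp, ws);	/* assumed lock-free resize helper */
	spin_unlock_irqrestore(&hp->lock, flags);
}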

Anyway I'm wondering whether all those locks are needed.

Amit


diff --git a/drivers/char/hvc_console.c b/drivers/char/hvc_console.c
index d97779e..51078a3 100644
--- a/drivers/char/hvc_console.c
+++ b/drivers/char/hvc_console.c
@@ -35,7 +35,7 @@
 #include <linux/tty.h>
 #include <linux/tty_flip.h>
 #include <linux/sched.h>
-#include <linux/spinlock.h>
+#include <linux/mutex.h>
 #include <linux/delay.h>
 #include <linux/freezer.h>
 
@@ -81,7 +81,7 @@ static LIST_HEAD(hvc_structs);
  * Protect the list of hvc_struct instances from inserts and removals during
  * list traversal.
  */
-static DEFINE_SPINLOCK(hvc_structs_lock);
+static DEFINE_MUTEX(hvc_structs_lock);
 
 /*
  * This value is used to assign a tty->index value to a hvc_struct based
@@ -98,23 +98,22 @@ static int last_hvc = -1;
 static struct hvc_struct *hvc_get_by_index(int index)
 {
 	struct hvc_struct *hp;
-	unsigned long flags;
 
-	spin_lock(&hvc_structs_lock);
+	mutex_lock(&hvc_structs_lock);
 
 	list_for_each_entry(hp, &hvc_structs, next) {
-		spin_lock_irqsave(&hp->lock, flags);
+		mutex_lock(&hp->lock);
 		if (hp->index == index) {
 			kref_get(&hp->kref);
-			spin_unlock_irqrestore(&hp->lock, flags);
-			spin_unlock(&hvc_structs_lock);
+			mutex_unlock(&hp->lock);
+			mutex_unlock(&hvc_structs_lock);
 			return hp;
 		}
-		spin_unlock_irqrestore(&hp->lock, flags);
+		mutex_unlock(&hp->lock);
 	}
 	hp = NULL;
 
-	spin_unlock(&hvc_structs_lock);
+	mutex_unlock(&hvc_structs_lock);
 	return hp;
 }
 
@@ -228,15 +227,14 @@ console_initcall(hvc_console_init);
 static void destroy_hvc_struct(struct kref *kref)
 {
 	struct hvc_struct *hp = container_of(kref, struct hvc_struct, kref);
-	unsigned long flags;
 
-	spin_lock(&hvc_structs_lock);
+	mutex_lock(&hvc_structs_lock);
 
-	spin_lock_irqsave(&hp->lock, flags);
+	mutex_lock(&hp->lock);
 	list_del(&(hp->next));
-	spin_unlock_irqrestore(&hp->lock, flags);
+	mutex_unlock(&hp->lock);
 
-	spin_unlock(&hvc_structs_lock);
+	mutex_unlock(&hvc_structs_lock);
 
 	kfree(hp);
 }
@@ -302,17 +300,16 @@ static void hvc_unthrottle(struct tty_struct *tty)
 static int hvc_open(struct tty_struct *tty, struct file * filp)
 {
 	struct hvc_struct *hp;
-	unsigned long flags;
 	int rc = 0;
 
 	/* Auto increments kref reference if found. */
 	if (!(hp = hvc_get_by_index(tty->index)))
 		return -ENODEV;
 
-	spin_lock_irqsave(&hp->lock, flags);
+	mutex_lock(&hp->lock);
 	/* Check and then increment for fast path open. */
 	if (hp->count++ > 0) {
-		spin_unlock_irqrestore(&hp->lock, flags);
+		mutex_unlock(&hp->lock);
 		hvc_kick();
 		return 0;
 	} /* else count == 0 */
@@ -321,7 +318,7 @@ static int hvc_open(struct tty_struct *tty, struct file * filp)
 
 	hp->tty = tty;
 
-	spin_unlock_irqrestore(&hp->lock, flags);
+	mutex_unlock(&hp->lock);
 
 	if (hp->ops->notifier_add)
 		rc = hp->ops->notifier_add(hp, hp->data);
@@ -333,9 +330,9 @@ static int hvc_open(struct tty_struct *tty, struct file * filp)
 	 *

Re: Performance data when running Windows VMs

2009-08-26 Thread Andrew Theurer
On Wed, 2009-08-26 at 18:44 +0300, Avi Kivity wrote:
 On 08/26/2009 05:57 PM, Andrew Theurer wrote:
  I recently gathered some performance data when running Windows Server
  2008 VMs, and I wanted to share it here.  There are 12 Windows Server
  2008 64-bit VMs (1 vcpu, 2 GB) running, which handle the concurrent
  execution of 6 J2EE-type benchmarks.  Each benchmark needs an App VM and
  a Database VM.  The benchmark clients inject a fixed rate of requests
  which yields X% CPU utilization on the host.  A different hypervisor was
  compared; KVM used about 60% more CPU cycles to complete the same amount
  of work.  Both had their hypervisor-specific paravirt IO drivers in the
  VMs.
 
  Server is a 2 socket Core/i7, SMT off, with 72 GB memory
 
 
 Did you use large pages?

Yes.
 
  Host kernel used was kvm.git v2.6.31-rc3-3419-g6df4865
  Qemu was kvm-87.  I tried a few newer versions of Qemu; none of them
  worked with the RedHat virtIO Windows drivers.  I tried:
 
  f3600c589a9ee5ea4c0fec74ed4e06a15b461d52
  0.11.0-rc1
  0.10.6
  kvm-88
 
  All but 0.10.6 had a "Problem code 10" driver error in the VM.  0.10.6 had
  "a disk read error occurred" very early in the booting of the VM.
 
 
 Yan?
 
  I/O on the host was not what I would call very high: outbound network
  averaged 163 Mbit/s, inbound 8 Mbit/s, while disk read ops were
  243/sec and write ops 561/sec.
 
 
 What was the disk bandwidth used?  Presumably, direct access to the 
 volume with cache=off?

2.4 MB/sec write, 0.6MB/sec read, cache=none
The VMs' boot disks are IDE, but apps use their second disk which is
virtio.

 linux-aio should help reduce cpu usage.

I assume this is in a newer version of Qemu?

  Host CPU breakdown was the following:
 
  user  nice  system  irq   softirq  guest  idle   iowait
  5.67  0.00  11.64   0.09  1.05     31.90  46.06  3.59
 
 
  The amount of kernel time had me concerned.  Here is oprofile:
 
 
 user+system is about 55% of guest time, and it's all overhead.
 
  samples  %app name symbol name
  1163422  52.3744  kvm-intel.ko vmx_vcpu_run
  103996    4.6816  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
  native_set_debugreg
  81036 3.6480  kvm.ko   kvm_arch_vcpu_ioctl_run
  37913 1.7068  qemu-system-x86_64   cpu_physical_memory_rw
  34720 1.5630  qemu-system-x86_64   phys_page_find_alloc
   
 
 We should really optimize these two.
 
  23234 1.0459  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
  native_write_msr_safe
  20964 0.9437  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
  native_get_debugreg
  17628 0.7936  libc-2.5.so  memcpy
  16587 0.7467  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
  __down_read
  15681 0.7059  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
  __up_read
  15466 0.6962  kvm.ko   find_highest_vector
  14611 0.6578  qemu-system-x86_64   qemu_get_ram_ptr
  11254 0.5066  kvm-intel.ko vmcs_writel
  11133 0.5012  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
  copy_user_generic_string
  10917 0.4915  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
  native_read_msr_safe
  10760 0.4844  qemu-system-x86_64   virtqueue_get_head
  9025  0.4063  kvm-intel.ko vmx_handle_exit
  8953  0.4030  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
  schedule
  8753  0.3940  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
  fget_light
  8465  0.3811  qemu-system-x86_64   virtqueue_avail_bytes
  8185  0.3685  kvm-intel.ko handle_cr
  8069  0.3632  kvm.ko   kvm_set_irq
  7697  0.3465  kvm.ko   kvm_lapic_sync_from_vapic
  7586  0.3415  qemu-system-x86_64   main_loop_wait
  7480  0.3367  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
  do_select
  7121  0.3206  qemu-system-x86_64   lduw_phys
  7003  0.3153  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
  audit_syscall_exit
  6062  0.2729  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
  kfree
  5477  0.2466  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
  fput
  5454  0.2455  kvm.ko   kvm_lapic_get_cr8
  5096  0.2294  kvm.ko   kvm_load_guest_fpu
  5057  0.2277  kvm.ko   apic_update_ppr
  4929  0.2219  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
  up_read
  4900  0.2206  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
  audit_syscall_entry
  4866  0.2191  kvm.ko   kvm_apic_has_interrupt
  4670  0.2102  kvm-intel.ko skip_emulated_instruction
  4644  0.2091  kvm.ko   kvm_cpu_has_interrupt
  4548  0.2047  vmlinux-2.6.31-rc5-v2.6.31-rc3-3419-g6df4865-autokern1 
  __switch_to
  4328  0.1948  kvm.ko  

Re: Performance data when running Windows VMs

2009-08-26 Thread Avi Kivity

On 08/26/2009 07:14 PM, Andrew Theurer wrote:

On Wed, 2009-08-26 at 18:44 +0300, Avi Kivity wrote:
   

On 08/26/2009 05:57 PM, Andrew Theurer wrote:
 

I recently gathered some performance data when running Windows Server
2008 VMs, and I wanted to share it here.  There are 12 Windows Server
2008 64-bit VMs (1 vcpu, 2 GB) running, which handle the concurrent
execution of 6 J2EE-type benchmarks.  Each benchmark needs an App VM and
a Database VM.  The benchmark clients inject a fixed rate of requests
which yields X% CPU utilization on the host.  A different hypervisor was
compared; KVM used about 60% more CPU cycles to complete the same amount
of work.  Both had their hypervisor-specific paravirt IO drivers in the
VMs.

Server is a 2 socket Core/i7, SMT off, with 72 GB memory

   

Did you use large pages?
 

Yes.
   


The stats show 'largepage = 12'.  Something's wrong.  There's a commit 
(7736d680) that's supposed to fix largepage support for kvm-87, maybe 
it's incomplete.



I/O on the host was not what I would call very high: outbound network
averaged 163 Mbit/s, inbound 8 Mbit/s, while disk read ops were
243/sec and write ops 561/sec.

   

What was the disk bandwidth used?  Presumably, direct access to the
volume with cache=off?
 

2.4 MB/sec write, 0.6MB/sec read, cache=none
The VMs' boot disks are IDE, but apps use their second disk which is
virtio.
   


Chickenfeed.

Do the network stats include interguest traffic?  I presume *all* of the 
traffic was interguest.



linux-aio should help reduce cpu usage.
 

I assume this is in a newer version of Qemu?
   


No, posted and awaiting merge.


Could it be that Windows uses the debug registers?  Maybe we're
incorrectly deciding to switch them.
 

I was wondering about that.  I was thinking of just backing out the
support for debugregs and see what happens.

Did the up/down_read seem kind of high?  Are we doing a lot of locking?
   


It is.  We do.  Marcelo made some threats to remove this lock.

--
error compiling committee.c: too many arguments to function



Re: Performance data when running Windows VMs

2009-08-26 Thread Brian Jackson
On Wednesday 26 August 2009 11:14:57 am Andrew Theurer wrote:
snip
 
   I/O on the host was not what I would call very high: outbound network
   averaged 163 Mbit/s, inbound 8 Mbit/s, while disk read ops were
   243/sec and write ops 561/sec.
 
  What was the disk bandwidth used?  Presumably, direct access to the
  volume with cache=off?

 2.4 MB/sec write, 0.6MB/sec read, cache=none
 The VMs' boot disks are IDE, but apps use their second disk which is
 virtio.


In my testing, I got better performance from IDE than the new virtio block 
driver for Windows. There appears to be some optimization left to do on them.




Re: [PATCHv4 2/2] vhost_net: a kernel-level virtio server

2009-08-26 Thread Michael S. Tsirkin
On Tue, Aug 25, 2009 at 04:16:34PM +0300, Michael S. Tsirkin wrote:
   +	/* If they don't want an interrupt, don't send one, unless empty. */
   +	if ((flags & VRING_AVAIL_F_NO_INTERRUPT) && vq->inflight)
   +		return;
  
  And I wouldn't support notify on empty at all, TBH.
 
 If I don't, virtio net in guest uses a timer, which might be expensive.
 Will need to check what this does.
 
   It should
  definitely be conditional on the guest accepting the NOTIFY_ON_EMPTY
  feature.

lguest does not do it this way though, does it?
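
Roughly, the check being debated looks like this (a sketch; only
VRING_AVAIL_F_NO_INTERRUPT and the inflight count come from the quoted
patch, the rest of the names are illustrative):

/* Kick the guest when the ring drains even if interrupts are
 * suppressed, but only if it negotiated VIRTIO_F_NOTIFY_ON_EMPTY. */
static bool should_notify(u16 avail_flags, unsigned int inflight,
			  bool notify_on_empty)
{
	if (!inflight)			/* ring is empty */
		return notify_on_empty;
	return !(avail_flags & VRING_AVAIL_F_NO_INTERRUPT);
}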


Re: [PATCH 0/2] eventfd: new EFD_STATE flag

2009-08-26 Thread Davide Libenzi
On Wed, 26 Aug 2009, Michael S. Tsirkin wrote:

 On Tue, Aug 25, 2009 at 02:57:01PM -0700, Davide Libenzi wrote:
  On Tue, 25 Aug 2009, Michael S. Tsirkin wrote:
  
   Yes, we don't want that. The best thing is to try to restate the problem
   in a way that is generic, and then either solve or best use existing
   solution. Right?
   
   I thought I had that, but apparently not.  The reason I'm Cc-ing you is
   not to try and spam you until you give up and accept the patch, it's
   hoping that you see the pattern behind our usage, and help generalize
   it.
   
   If I understand it correctly, you believe this is not possible and so
   any solution will have to be in KVM? Or maybe I didn't state the problem
   clearly enough and should restate it?
  
  Please do.
  
  
  
  - Davide
 
 
 Problem looks like this:
 
 There are multiple processes (devices) where each has a condition
 (interrupt line) which it has logic to determine is either true or
 false.
 
 A single other process (hypervisor) is interested in a condition
 (interrupt level) which is a logical OR of all interrupt lines.
 On changes, an interrupt level value needs to be read and copied to
 guest virtual cpu.
 
 We also want the ability to replace some or all of the processes above
 with kernel components, with condition changes potentially made from
 hardware interrupt context.
 
 
 How we wanted to solve it with EFD_STATE: Share a separate eventfd
 between each device and the hypervisor.  device sets state to either 0
 or 1.  hypervisor polls all eventfds, reads interrupt line on changes,
 calculates the interrupt level and updates guest.
 
 Alternative solution: shared memory where each device writes interrupt
 line value. This makes setup more complex (need to share around much more
 than just an fd), and makes access from interrupt impossible unless we
 lock the memory (and locking userspace memory introduces yet another set
 of issues).

OK, if I get it correctly, there is one eventfd signaler (the device), and 
one eventfd reader (the hypervisor), right?
Each hypervisor listens for multiple devices detecting state changes, and 
associates each eventfd line with an IRQ number by some configuration 
(a la PCI), right?



- Davide




Re: Performance data when running Windows VMs

2009-08-26 Thread Andrew Theurer
On Wed, 2009-08-26 at 19:26 +0300, Avi Kivity wrote:
 On 08/26/2009 07:14 PM, Andrew Theurer wrote:
  On Wed, 2009-08-26 at 18:44 +0300, Avi Kivity wrote:
 
  On 08/26/2009 05:57 PM, Andrew Theurer wrote:
   
  I recently gathered some performance data when running Windows Server
  2008 VMs, and I wanted to share it here.  There are 12 Windows Server
  2008 64-bit VMs (1 vcpu, 2 GB) running, which handle the concurrent
  execution of 6 J2EE-type benchmarks.  Each benchmark needs an App VM and
  a Database VM.  The benchmark clients inject a fixed rate of requests
  which yields X% CPU utilization on the host.  A different hypervisor was
  compared; KVM used about 60% more CPU cycles to complete the same amount
  of work.  Both had their hypervisor-specific paravirt IO drivers in the
  VMs.
 
  Server is a 2 socket Core/i7, SMT off, with 72 GB memory
 
 
  Did you use large pages?
   
  Yes.
 
 
 The stats show 'largepage = 12'.  Something's wrong.  There's a commit 
 (7736d680) that's supposed to fix largepage support for kvm-87, maybe 
 it's incomplete.

How strange.  /proc/meminfo showed that almost all of the pages were
used:

HugePages_Total:   12556
HugePages_Free:  220
HugePages_Rsvd:0
HugePages_Surp:0
Hugepagesize:   2048 kB

I just assumed they were used properly.  Maybe not.

  I/O on the host was not what I would call very high: outbound network
  averaged 163 Mbit/s, inbound 8 Mbit/s, while disk read ops were
  243/sec and write ops 561/sec.
 
 
  What was the disk bandwidth used?  Presumably, direct access to the
  volume with cache=off?
   
  2.4 MB/sec write, 0.6MB/sec read, cache=none
  The VMs' boot disks are IDE, but apps use their second disk which is
  virtio.
 
 
 Chickenfeed.
 
 Do the network stats include interguest traffic?  I presume *all* of the 
 traffic was interguest.

Sar network data:

  IFACE   rxpck/s   txpck/srxkB/stxkB/s
 Average:   lo  0.00  0.00  0.00  0.00 
 Average: usb0  0.39  0.19  0.02  0.01 
 Average: eth0   2968.83   5093.02340.13   6966.64
 Average: eth1   2992.92   5124.08342.75   7008.53 
 Average: eth2   1455.53   2500.63167.45   3421.64 
 Average: eth3   1500.59   2574.36171.98   3524.82 
 Average:  br0  2.41  0.95  0.32  0.13 
 Average:  br1  1.52  0.00  0.20  0.00 
 Average:  br2  1.52  0.00  0.20  0.00 
 Average:  br3  1.52  0.00  0.20  0.00 
 Average:  br4  0.00  0.00  0.00  0.00 
 Average: tap3669.38708.07290.89140.81 
 Average:   tap109678.53723.58294.07143.31 
 Average:   tap215673.20711.47291.99141.78 
 Average:   tap321675.26719.33293.01142.37 
 Average:tap27679.23729.90293.86143.60 
 Average:   tap133680.17734.08294.33143.85 
 Average: tap2   1002.24   2214.19   3458.54457.95 
 Average:   tap108   1021.85   2246.53   3491.02463.48 
 Average:   tap214   1002.81   2195.22   3411.80457.28 
 Average:   tap320   1017.43   2241.49   3508.20462.54 
 Average:tap26   1028.52   2237.98   3483.84462.53 
 Average:   tap132   1034.05   2240.89   3493.37463.32 

tap0-99 go to eth0, 100-199 to eth1, 200-299 to eth2, 300-399 to eth3.
There is some inter-guest traffic between VM pairs (like taps 2&3,
108&109, etc.) but not that significant.

 
  linux-aio should help reduce cpu usage.
   
  I assume this is in a newer version of Qemu?
 
 
 No, posted and awaiting merge.
 
  Could it be that Windows uses the debug registers?  Maybe we're
  incorrectly deciding to switch them.
   
  I was wondering about that.  I was thinking of just backing out the
  support for debugregs and see what happens.
 
  Did the up/down_read seem kind of high?  Are we doing a lot of locking?
 
 
 It is.  We do.  Marcelo made some threats to remove this lock.

Thanks,

-Andrew




Re: Performance data when running Windows VMs

2009-08-26 Thread Andrew Theurer
On Wed, 2009-08-26 at 11:27 -0500, Brian Jackson wrote:
 On Wednesday 26 August 2009 11:14:57 am Andrew Theurer wrote:
 snip
  
    I/O on the host was not what I would call very high: outbound network
    averaged 163 Mbit/s, inbound 8 Mbit/s, while disk read ops were
    243/sec and write ops 561/sec.
  
   What was the disk bandwidth used?  Presumably, direct access to the
   volume with cache=off?
 
  2.4 MB/sec write, 0.6MB/sec read, cache=none
  The VMs' boot disks are IDE, but apps use their second disk which is
  virtio.
 
 
  In my testing, I got better performance from IDE than the new virtio block
  driver for Windows. There appears to be some optimization left to do on them.

Thanks Brian.  I will try IDE on both VM disks to see how it compares.

-Andrew



Re: AlacrityVM benchmark numbers updated

2009-08-26 Thread Gregory Haskins
Avi Kivity wrote:
 On 08/26/2009 04:01 AM, Gregory Haskins wrote:
 We are pleased to announce the availability of the latest networking
 benchmark numbers for AlacrityVM.  We've made several tweaks to the
 original v0.1 release to improve performance.  The most notable is a
 switch from get_user_pages to switch_mm+copy_[to/from]_user thanks to a
 review suggestion from Michael Tsirkin (as well as his patch to
 implement it).

 This change alone accounted for freeing up an additional 1.2 Gbps, which
 is over a 25% improvement from v0.1.  The previous numbers were 4560 Mbps
 before the change, and 5708 Mbps after (for 1500 MTU over 10GE).  This
 moves us ever closer to the goal of native performance under
 virtualization.

 
 Interesting, it's good to see that copy_*_user() works so well.  Note
 that there's a possible optimization that goes in the opposite direction
 - keep using get_user_pages(), but use the dma engine API to perform the
 actual copy.  I expect that it will only be a win when using tso to
 transfer full pages.  Large pages may also help.
 
 Copyless tx also wants get_user_pages().  It makes sense to check if
 switch_mm() + get_user_pages_fast() gives better performance than
 get_user_pages().

Actually, I have already looked at this and it does indeed seem better to
use switch_mm+gupf() over gup() by quite a large margin.  You could then
couple that with your DMA-engine idea to potentially gain even more
benefits (though probably not for networking, since most NICs have their
own DMA engine anyway).
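
For context, the switch_mm+copy pattern under discussion looks roughly
like this from a kernel thread (a sketch assuming use_mm()/unuse_mm()
style helpers are available; error handling omitted):

/* Temporarily adopt the guest's mm so a plain copy_from_user() can
 * reach its address space; use_mm() does switch_mm() under the hood. */
static int copy_in_from_guest(struct mm_struct *mm, void *dst,
			      const void __user *src, size_t len)
{
	int ret;

	use_mm(mm);
	ret = copy_from_user(dst, src, len) ? -EFAULT : 0;
	unuse_mm(mm);
	return ret;
}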

Kind Regards,
-Greg






Re: [Qemu-devel] Re: Notes on block I/O data integrity

2009-08-26 Thread Jamie Lokier
Nikola Ciprich wrote:
 clustered LVM SHOULD not have problems with it, as we're using just
 striped volumes,

Note that LVM does not implement barriers at all, except for simple
cases of a single backing device (I'm not sure if that includes
dm-crypt).

So your striped volumes may not offer this level of integrity.

-- Jamie


Re: [Qemu-devel] Re: Notes on block I/O data integrity

2009-08-26 Thread Jamie Lokier
Christoph Hellwig wrote:
  what about LVM? I've read somewhere that it used to just eat barriers
  used by XFS, making it less safe than simple partitions.
 
 Oh, any additional layer opens another can of worms.  On Linux, until
 very recently, using LVM or software RAID meant that only disabled
 write caches were safe.

I believe that's still true except if there's more than one backing
drive, so software RAID still isn't safe.  Did that change?

But even with barriers, software RAID may have a consistency problem
if one stripe is updated and the system fails before the matching
parity stripe is updated.

I've been told that some hardware RAID implementations implement a
kind of journalling to deal with this, but Linux software RAID does not.

-- Jamie


Re: [PATCH 0/2] eventfd: new EFD_STATE flag

2009-08-26 Thread Avi Kivity

On 08/26/2009 08:45 PM, Davide Libenzi wrote:

OK, if I get it correctly, there is one eventfd signaler (the device), and
one eventfd reader (the hypervisor), right?
Each hypervisor listens for multiple devices detecting state changes, and
associates each eventfd line with an IRQ number by some configuration
(a la PCI), right?
   


Yes.  The PCI stuff happens in userspace; all the hypervisor sees is 
"this eventfd is IRQ 10".  There may be multiple eventfds routed to one 
IRQ (corresponding to a shared IRQ line).


--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.



Re: [PATCH 0/2] eventfd: new EFD_STATE flag

2009-08-26 Thread Davide Libenzi
On Wed, 26 Aug 2009, Avi Kivity wrote:

 On 08/26/2009 08:45 PM, Davide Libenzi wrote:
  OK, if I get it correctly, there is one eventfd signaler (the device), and
  one eventfd reader (the hypervisor), right?
  Each hypervisor listens for multiple devices detecting state changes, and
  associates each eventfd line with an IRQ number by some configuration
  (a la PCI), right?
 
 
 Yes.  The PCI stuff happens in userspace; all the hypervisor sees is "this
 eventfd is IRQ 10".  There may be multiple eventfds routed to one IRQ
 (corresponding to a shared IRQ line).

Ok, so why not use the eventfd counter as state?
On the device side:

void write_state(int sfd, int state) {
u64 cnt;

/* Clear the current state, sfd is in non-blocking mode */
read(sfd, &cnt, sizeof(cnt));
/* Writes new state */
cnt = 1 + !!state;
write(sfd, &cnt, sizeof(cnt));
}


On the hypervisor side:

int read_state(int sfd) {
u64 cnt;

read(sfd, &cnt, sizeof(cnt));
return cnt - 1;
}
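
And a possible hypervisor-side usage, polling a set of such state
eventfds (illustrative only; error handling mostly omitted):

#include <poll.h>
#include <stdint.h>
#include <unistd.h>

/* Wait for any line to change, then return its index and new state. */
int poll_lines(struct pollfd *pfds, int n, int *state)
{
	uint64_t cnt;
	int i;

	if (poll(pfds, n, -1) <= 0)
		return -1;
	for (i = 0; i < n; i++) {
		if (pfds[i].revents & POLLIN) {
			read(pfds[i].fd, &cnt, sizeof(cnt));
			*state = (int)cnt - 1;	/* undo the 1+ encoding */
			return i;
		}
	}
	return -1;
}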




- Davide




Re: Performance data when running Windows VMs

2009-08-26 Thread Avi Kivity

On 08/26/2009 08:51 PM, Andrew Theurer wrote:


The stats show 'largepage = 12'.  Something's wrong.  There's a commit
(7736d680) that's supposed to fix largepage support for kvm-87, maybe
it's incomplete.
 

How strange.  /proc/meminfo showed that almost all of the pages were
used:

HugePages_Total:   12556
HugePages_Free:  220
HugePages_Rsvd:0
HugePages_Surp:0
Hugepagesize:   2048 kB

I just assumed they were used properly.  Maybe not.
   


My mistake.  The kvm_stat numbers you provided were rates (per second), 
so it just means it's still faulting in pages at a rate of 1 per guest 
per second.


   

I/O on the host was not what I would call very high: outbound network
averaged 163 Mbit/s, inbound 8 Mbit/s, while disk read ops were
243/sec and write ops 561/sec.


   

What was the disk bandwidth used?  Presumably, direct access to the
volume with cache=off?

 

2.4 MB/sec write, 0.6MB/sec read, cache=none
The VMs' boot disks are IDE, but apps use their second disk which is
virtio.

   

Chickenfeed.

Do the network stats include interguest traffic?  I presume *all* of the
traffic was interguest.
 

Sar network data:

   

  IFACE   rxpck/s   txpck/srxkB/stxkB/s
Average:   lo  0.00  0.00  0.00  0.00
Average: usb0  0.39  0.19  0.02  0.01
Average: eth0   2968.83   5093.02340.13   6966.64
Average: eth1   2992.92   5124.08342.75   7008.53
Average: eth2   1455.53   2500.63167.45   3421.64
Average: eth3   1500.59   2574.36171.98   3524.82
Average:  br0  2.41  0.95  0.32  0.13
Average:  br1  1.52  0.00  0.20  0.00
Average:  br2  1.52  0.00  0.20  0.00
Average:  br3  1.52  0.00  0.20  0.00
Average:  br4  0.00  0.00  0.00  0.00
Average: tap3669.38708.07290.89140.81
Average:   tap109678.53723.58294.07143.31
Average:   tap215673.20711.47291.99141.78
Average:   tap321675.26719.33293.01142.37
Average:tap27679.23729.90293.86143.60
Average:   tap133680.17734.08294.33143.85
Average: tap2   1002.24   2214.19   3458.54457.95
Average:   tap108   1021.85   2246.53   3491.02463.48
Average:   tap214   1002.81   2195.22   3411.80457.28
Average:   tap320   1017.43   2241.49   3508.20462.54
Average:tap26   1028.52   2237.98   3483.84462.53
Average:   tap132   1034.05   2240.89   3493.37463.32
 

tap0-99 go to eth0, 100-199 to eth1, 200-299 to eth2, 300-399 to eth3.
There is some inter-guest traffic between VM pairs (like taps 2&3,
108&109, etc.) but not that significant.
   


Oh, so there are external load generators involved.

Can you run this on kvm.git master, with

CONFIG_TRACEPOINTS=y
CONFIG_TRACER_MAX_TRACE=y
CONFIG_RING_BUFFER=y
CONFIG_FTRACE_NMI_ENTER=y
CONFIG_EVENT_TRACING=y
CONFIG_TRACING=y
CONFIG_GENERIC_TRACER=y
CONFIG_TRACING_SUPPORT=y
CONFIG_FTRACE=y
CONFIG_DYNAMIC_FTRACE=y

(some may be overkill)

and, while the test is running, do:

 cd /sys/kernel/debug/tracing
 echo kvm > set_event
 (wait two seconds)
 cat trace > /tmp/trace

and send me /tmp/trace.bz2?  should be quite big.

--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.



Re: Page allocation failures in guest

2009-08-26 Thread David Miller
From: Rusty Russell ru...@rustcorp.com.au
Date: Wed, 26 Aug 2009 21:48:58 +0930

 Dave, can you push this to Linus ASAP?

Ok.

 Subject: virtio: net refill on out-of-memory
 
 If we run out of memory, use keventd to fill the buffer.  There's a
 report of this happening: "Page allocation failures in guest",
 Message-ID: 20090713115158.0a489...@mjolnir.ossman.eu
 
 Signed-off-by: Rusty Russell ru...@rustcorp.com.au

Applied, thanks.


Re: AlacrityVM benchmark numbers updated

2009-08-26 Thread Avi Kivity

On 08/26/2009 09:42 PM, Gregory Haskins wrote:

Actually, I have already looked at this and it does indeed seem better to
use switch_mm+gupf() over gup() by quite a large margin.  You could then
couple that with your DMA-engine idea to potentially gain even more
benefits (though probably not for networking, since most NICs have their
own DMA engine anyway).

   


For tx, we'll just go copyless once we plumb the destructors properly.  
But for rx on a shared interface it is impossible to avoid the copy.  
You can only choose whether you want it done by the CPU or a local DMA engine.



--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.



Re: [PATCH 0/2] eventfd: new EFD_STATE flag

2009-08-26 Thread Avi Kivity

On 08/26/2009 10:13 PM, Davide Libenzi wrote:

Ok, so why not use the eventfd counter as state?
On the device side:

void write_state(int sfd, int state) {
u64 cnt;

/* Clear the current state, sfd is in non-blocking mode */
read(sfd, &cnt, sizeof(cnt));
/* Writes new state */
cnt = 1 + !!state;
write(sfd, &cnt, sizeof(cnt));
}


On the hypervisor side:

int read_state(int sfd) {
u64 cnt;

read(sfd, &cnt, sizeof(cnt));
return cnt - 1;
}

   


Hadn't thought of read+write as "set".  While the 1+ is a little ugly, it's 
workable.


I see no kernel equivalent to read(), but that's easily done.

--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.



Re: [PATCH 0/2] eventfd: new EFD_STATE flag

2009-08-26 Thread Davide Libenzi
On Wed, 26 Aug 2009, Avi Kivity wrote:

 On 08/26/2009 10:13 PM, Davide Libenzi wrote:
  Ok, so why not use the eventfd counter as state?
  On the device side:
  
  void write_state(int sfd, int state) {
  u64 cnt;
  
  /* Clear the current state, sfd is in non-blocking mode */
  read(sfd, &cnt, sizeof(cnt));
  /* Writes new state */
  cnt = 1 + !!state;
  write(sfd, &cnt, sizeof(cnt));
  }
  
  
  On the hypervisor side:
  
  int read_state(int sfd) {
  u64 cnt;
  
  read(sfd, &cnt, sizeof(cnt));
  return cnt - 1;
  }
  
 
 
 Hadn't thought of read+write as "set".  While the 1+ is a little ugly, it's
 workable.

Pick what you want, as long as it always writes something != 0 :)


 I see no kernel equivalent to read(), but that's easily done.

Adding an in-kernel read based on ctx, that is no problem at all.



- Davide



