[COMMIT master] kvm: Correct kernel headers for PIT2

2009-06-02 Thread Avi Kivity
From: Avi Kivity a...@redhat.com

Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/kvm/include/linux/kvm.h b/kvm/include/linux/kvm.h
index 5b4b90c..ca93871 100644
--- a/kvm/include/linux/kvm.h
+++ b/kvm/include/linux/kvm.h
@@ -463,7 +463,7 @@ struct kvm_trace_rec {
 #define KVM_CAP_ASSIGN_DEV_IRQ 29
 /* Another bug in KVM_SET_USER_MEMORY_REGION fixed: */
 #define KVM_CAP_JOIN_MEMORY_REGIONS_WORKS 30
-#define KVM_CAP_PIT2 31
+#define KVM_CAP_PIT2 33
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -547,7 +547,7 @@ struct kvm_irq_routing {
 #define KVM_ASSIGN_SET_MSIX_ENTRY \
_IOW(KVMIO, 0x74, struct kvm_assigned_msix_entry)
 #define KVM_DEASSIGN_DEV_IRQ   _IOW(KVMIO, 0x75, struct kvm_assigned_irq)
-#define KVM_CREATE_PIT2   _IOW(KVMIO, 0x76, struct 
kvm_pit_config)
+#define KVM_CREATE_PIT2   _IOW(KVMIO, 0x77, struct 
kvm_pit_config)
 
 /*
  * ioctls for vcpu fds
--
To unsubscribe from this list: send the line unsubscribe kvm-commits in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[COMMIT master] kvm: bios: Read MADT entries from memory in processors _MAT method.

2009-06-02 Thread Avi Kivity
From: Gleb Natapov g...@redhat.com

Also use enable/disable bit from ACPI MADT entries to track CPU hot
plug. This removes assumptions about APIC ids from AML code and simplify
cpu hotplug handling.

Signed-off-by: Gleb Natapov g...@redhat.com
Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/kvm/bios/acpi-dsdt.dsl b/kvm/bios/acpi-dsdt.dsl
index c756fed..db57307 100755
--- a/kvm/bios/acpi-dsdt.dsl
+++ b/kvm/bios/acpi-dsdt.dsl
@@ -27,36 +27,32 @@ DefinitionBlock (
 {
Scope (\_PR)
{
-   OperationRegion(PRST, SystemIO, 0xaf00, 32)
-   Field (PRST, ByteAcc, NoLock, Preserve)
+   /* pointer to first element of MADT APIC structures */
+   OperationRegion(ATPR, SystemMemory, 0x0514, 4)
+   Field (ATPR, DwordAcc, NoLock, Preserve)
{
-   PRS, 256
+   ATP, 32
}
 
-   Name(PRSS, Buffer(32){}) /* shadow CPU status bitmask */
-   Name(SSVL, 0)
-
-   Method(CRST, 1) {
-   If (LEqual(SSVL, 0)) {
-   Store(PRS, PRSS) /* read CPUs status bitmaks from HW */
-   Store(1, SSVL)
-}
-   ShiftRight(Arg0, 3, Local1)
-   Store(DerefOf(Index(PRSS, Local1)), Local2)
-   Return(And(Local2, ShiftLeft(1, And(Arg0, 0x7
-   }
+#define madt_addr(nr)  Add (ATP, Multiply(nr, 8))
 
 #define gen_processor(nr, name)\
Processor (CPU##name, nr, 0xb010, 0x06) {   \
-Name (PREN, Buffer(0x8) {0x0, 0x8, nr, nr, 0x1, 0x0, 0x0, 0x0}) \
-Name (PRDS, Buffer(0x8) {0x0, 0x8, nr, nr, 0x0, 0x0, 0x0, 0x0}) \
+   OperationRegion (MATR, SystemMemory, madt_addr(nr), 8)  \
+   Field (MATR, ByteAcc, NoLock, Preserve) \
+   {   \
+   MAT, 64 \
+   }   \
+   Field (MATR, ByteAcc, NoLock, Preserve) \
+   {   \
+   Offset(4),  \
+   FLG, 1  \
+   }   \
 Method(_MAT, 0) {   \
-If (CRST(nr)) { Return(PREN) }  \
-Else { Return(PRDS) }   \
+   Return(MAT) \
 }   \
 Method (_STA) { \
-If (CRST(nr)) { Return(0xF) }   \
-Else { Return(0x9) }\
+If (FLG) { Return(0xF) } Else { Return(0x9) }   \
 }   \
 }   \
 
@@ -78,9 +74,16 @@ DefinitionBlock (
gen_processor(14, E)
 
Method (NTFY, 2) {
-#define gen_ntfy(nr)  \
-   If (LEqual(Arg0, 0x##nr)) {   \
-   Notify(CPU##nr, Arg1) \
+#define gen_ntfy(nr)\
+   If (LEqual(Arg0, 0x##nr)) { \
+   If (LNotEqual(Arg1, \_PR.CPU##nr.FLG)) {\
+   Store (Arg1, \_PR.CPU##nr.FLG)  \
+   If (LEqual(Arg1, 1)) {  \
+   Notify(CPU##nr, 1)  \
+   } Else {\
+   Notify(CPU##nr, 3)  \
+   }   \
+   }   \
}
gen_ntfy(0)
gen_ntfy(1)
@@ -100,41 +103,26 @@ DefinitionBlock (
Return(One)
}
 
-   /* Works on 8 bit quentity.
- * Arg1 - Shadow status bits
- * Arg2 - Current status bits
-*/
-Method(PR1, 3) {
-   Xor(Arg1, Arg2, Local0) /* figure out what chaged */
-   ShiftLeft(Arg0, 3, Local1)
-While (LNotEqual(Local0, Zero)) {
-   If (And(Local0, 1)) {  /* if staus have changed */
-if(And(Arg2, 1)) { /* check previous status */
-   Store(3, Local3)
-   } Else {
-   Store(1, Local3)
-   }
-   NTFY(Local1, Local3)
-}
-  

[COMMIT master] Provide a kvm-free pic implementation

2009-06-02 Thread Avi Kivity
From: Glauber Costa glom...@redhat.com

This patch separates all i8259 specific kvm implementation
in separate functions. This way, qemu can have an i8259 device
free from any kvm interference. kvm can have a simpler code path
(we may even be able to compile out qemu's pic in the future if
we want to), and everybody gets happy.

Signed-off-by: Glauber Costa glom...@redhat.com
Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/hw/i8259.c b/hw/i8259.c
index bfc7fd9..132f294 100644
--- a/hw/i8259.c
+++ b/hw/i8259.c
@@ -185,16 +185,6 @@ int64_t irq_time[16];
 static void i8259_set_irq(void *opaque, int irq, int level)
 {
 PicState2 *s = opaque;
-#ifdef KVM_CAP_IRQCHIP
-if (kvm_enabled()) {
-int pic_ret;
-if (kvm_set_irq(irq, level, pic_ret)) {
-if (pic_ret != 0)
-apic_set_irq_delivered();
-return;
-}
-}
-#endif
 #if defined(DEBUG_PIC) || defined(DEBUG_IRQ_COUNT)
 if (level != irq_level[irq]) {
 #if defined(DEBUG_PIC)
@@ -478,77 +468,10 @@ static uint32_t elcr_ioport_read(void *opaque, uint32_t 
addr1)
 return s-elcr;
 }
 
-static void kvm_kernel_pic_save_to_user(PicState *s)
-{
-#if defined(KVM_CAP_IRQCHIP)  defined(TARGET_I386)
-struct kvm_irqchip chip;
-struct kvm_pic_state *kpic;
-
-chip.chip_id = (s-pics_state-pics[0] == s) ?
-   KVM_IRQCHIP_PIC_MASTER :
-   KVM_IRQCHIP_PIC_SLAVE;
-kvm_get_irqchip(kvm_context, chip);
-kpic = chip.chip.pic;
-
-s-last_irr = kpic-last_irr;
-s-irr = kpic-irr;
-s-imr = kpic-imr;
-s-isr = kpic-isr;
-s-priority_add = kpic-priority_add;
-s-irq_base = kpic-irq_base;
-s-read_reg_select = kpic-read_reg_select;
-s-poll = kpic-poll;
-s-special_mask = kpic-special_mask;
-s-init_state = kpic-init_state;
-s-auto_eoi = kpic-auto_eoi;
-s-rotate_on_auto_eoi = kpic-rotate_on_auto_eoi;
-s-special_fully_nested_mode = kpic-special_fully_nested_mode;
-s-init4 = kpic-init4;
-s-elcr = kpic-elcr;
-s-elcr_mask = kpic-elcr_mask;
-#endif
-}
-
-static void kvm_kernel_pic_load_from_user(PicState *s)
-{
-#if defined(KVM_CAP_IRQCHIP)  defined(TARGET_I386)
-struct kvm_irqchip chip;
-struct kvm_pic_state *kpic;
-
-chip.chip_id = (s-pics_state-pics[0] == s) ?
-   KVM_IRQCHIP_PIC_MASTER :
-   KVM_IRQCHIP_PIC_SLAVE;
-kpic = chip.chip.pic;
-
-kpic-last_irr = s-last_irr;
-kpic-irr = s-irr;
-kpic-imr = s-imr;
-kpic-isr = s-isr;
-kpic-priority_add = s-priority_add;
-kpic-irq_base = s-irq_base;
-kpic-read_reg_select = s-read_reg_select;
-kpic-poll = s-poll;
-kpic-special_mask = s-special_mask;
-kpic-init_state = s-init_state;
-kpic-auto_eoi = s-auto_eoi;
-kpic-rotate_on_auto_eoi = s-rotate_on_auto_eoi;
-kpic-special_fully_nested_mode = s-special_fully_nested_mode;
-kpic-init4 = s-init4;
-kpic-elcr = s-elcr;
-kpic-elcr_mask = s-elcr_mask;
-
-kvm_set_irqchip(kvm_context, chip);
-#endif
-}
-
-static void pic_save(QEMUFile *f, void *opaque)
+static void pic_save_common(QEMUFile *f, void *opaque)
 {
 PicState *s = opaque;
 
-if (kvm_enabled()  qemu_kvm_irqchip_in_kernel()) {
-kvm_kernel_pic_save_to_user(s);
-}
-
 qemu_put_8s(f, s-last_irr);
 qemu_put_8s(f, s-irr);
 qemu_put_8s(f, s-imr);
@@ -567,7 +490,12 @@ static void pic_save(QEMUFile *f, void *opaque)
 qemu_put_8s(f, s-elcr);
 }
 
-static int pic_load(QEMUFile *f, void *opaque, int version_id)
+static void pic_save(QEMUFile *f, void *opaque)
+{
+pic_save_common(f, opaque);
+}
+
+static int pic_load_common(QEMUFile *f, void *opaque, int version_id)
 {
 PicState *s = opaque;
 
@@ -591,13 +519,14 @@ static int pic_load(QEMUFile *f, void *opaque, int 
version_id)
 qemu_get_8s(f, s-single_mode);
 qemu_get_8s(f, s-elcr);
 
-if (kvm_enabled()  qemu_kvm_irqchip_in_kernel()) {
-kvm_kernel_pic_load_from_user(s);
-}
-
 return 0;
 }
 
+static int pic_load(QEMUFile *f, void *opaque, int version_id)
+{
+return pic_load_common(f, opaque, version_id);
+}
+
 /* XXX: add generic master/slave system */
 static void pic_init1(int io_addr, int elcr_addr, PicState *s)
 {
@@ -668,3 +597,118 @@ void pic_set_alt_irq_func(PicState2 *s, SetIRQFunc 
*alt_irq_func,
 s-alt_irq_func = alt_irq_func;
 s-alt_irq_opaque = alt_irq_opaque;
 }
+
+#ifdef KVM_CAP_IRQCHIP
+static void kvm_kernel_pic_save_to_user(PicState *s)
+{
+#if defined(TARGET_I386)
+struct kvm_irqchip chip;
+struct kvm_pic_state *kpic;
+
+chip.chip_id = (s-pics_state-pics[0] == s) ?
+   KVM_IRQCHIP_PIC_MASTER :
+   KVM_IRQCHIP_PIC_SLAVE;
+kvm_get_irqchip(kvm_context, chip);
+kpic = chip.chip.pic;
+
+s-last_irr = kpic-last_irr;
+s-irr = kpic-irr;
+s-imr = kpic-imr;
+s-isr = kpic-isr;
+s-priority_add = kpic-priority_add;
+s-irq_base = kpic-irq_base;
+

[COMMIT master] Add dummy eventfd.h for older kernels

2009-06-02 Thread Avi Kivity
From: Avi Kivity a...@redhat.com

Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/include-compat/linux/eventfd.h b/include-compat/linux/eventfd.h
new file mode 100644
index 000..c3580fb
--- /dev/null
+++ b/include-compat/linux/eventfd.h
@@ -0,0 +1 @@
+/* Dummy file */
--
To unsubscribe from this list: send the line unsubscribe kvm-commits in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[COMMIT master] If syncing from git, use 'git describe' as the version.

2009-06-02 Thread Avi Kivity
From: Avi Kivity a...@redhat.com

Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/Makefile b/Makefile
index f51b491..95e4c81 100644
--- a/Makefile
+++ b/Makefile
@@ -37,8 +37,13 @@ include $(MAKEFILE_PRE)
 
 .PHONY: sync
 
+KVM_VERSION_GIT = $(if $(and $(filter kvm-devel,$(KVM_VERSION)), \
+$(wildcard $(LINUX)/.git)), \
+  $(shell git --git-dir=$(LINUX)/.git describe), \
+  $(KVM_VERSION))
+
 sync:
-   ./sync -v $(KVM_VERSION) -l $(LINUX)
+   ./sync -v $(KVM_VERSION_GIT) -l $(LINUX)
 
 install:
mkdir -p $(DESTDIR)/$(INSTALLDIR)
--
To unsubscribe from this list: send the line unsubscribe kvm-commits in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[COMMIT master] KVM: SVM: Fold kvm_svm.h info svm.c

2009-06-02 Thread Avi Kivity
From: Avi Kivity a...@redhat.com

kvm_svm.h is only included from svm.c, so fold it in.

Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h
deleted file mode 100644
index 39fddc0..000
--- a/arch/x86/kvm/kvm_svm.h
+++ /dev/null
@@ -1,54 +0,0 @@
-#ifndef __KVM_SVM_H
-#define __KVM_SVM_H
-
-#include linux/kernel.h
-#include linux/types.h
-#include linux/list.h
-#include linux/kvm_host.h
-#include asm/msr.h
-
-#include asm/svm.h
-
-static const u32 host_save_user_msrs[] = {
-#ifdef CONFIG_X86_64
-   MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
-   MSR_FS_BASE,
-#endif
-   MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
-};
-
-#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
-
-struct kvm_vcpu;
-
-struct vcpu_svm {
-   struct kvm_vcpu vcpu;
-   struct vmcb *vmcb;
-   unsigned long vmcb_pa;
-   struct svm_cpu_data *svm_data;
-   uint64_t asid_generation;
-   uint64_t sysenter_cs;
-   uint64_t sysenter_esp;
-   uint64_t sysenter_eip;
-
-   u64 next_rip;
-
-   u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
-   u64 host_gs_base;
-   unsigned long host_cr2;
-
-   u32 *msrpm;
-   struct vmcb *hsave;
-   u64 hsave_msr;
-
-   u64 nested_vmcb;
-
-   /* These are the merged vectors */
-   u32 *nested_msrpm;
-
-   /* gpa pointers to the real vectors */
-   u64 nested_vmcb_msrpm;
-};
-
-#endif
-
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f0f2885..3ac45e3 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -15,7 +15,6 @@
  */
 #include linux/kvm_host.h
 
-#include kvm_svm.h
 #include irq.h
 #include mmu.h
 #include kvm_cache_regs.h
@@ -57,6 +56,47 @@ MODULE_LICENSE(GPL);
 #define nsvm_printk(fmt, args...) do {} while(0)
 #endif
 
+static const u32 host_save_user_msrs[] = {
+#ifdef CONFIG_X86_64
+   MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
+   MSR_FS_BASE,
+#endif
+   MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
+};
+
+#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
+
+struct kvm_vcpu;
+
+struct vcpu_svm {
+   struct kvm_vcpu vcpu;
+   struct vmcb *vmcb;
+   unsigned long vmcb_pa;
+   struct svm_cpu_data *svm_data;
+   uint64_t asid_generation;
+   uint64_t sysenter_cs;
+   uint64_t sysenter_esp;
+   uint64_t sysenter_eip;
+
+   u64 next_rip;
+
+   u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
+   u64 host_gs_base;
+   unsigned long host_cr2;
+
+   u32 *msrpm;
+   struct vmcb *hsave;
+   u64 hsave_msr;
+
+   u64 nested_vmcb;
+
+   /* These are the merged vectors */
+   u32 *nested_msrpm;
+
+   /* gpa pointers to the real vectors */
+   u64 nested_vmcb_msrpm;
+};
+
 /* enable NPT for AMD64 and X86 with PAE */
 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
 static bool npt_enabled = true;
--
To unsubscribe from this list: send the line unsubscribe kvm-commits in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[COMMIT master] KVM: Clean up coalesced_mmio destruction

2009-06-02 Thread Avi Kivity
From: Gregory Haskins ghask...@novell.com

We invoke kfree() on a data member instead of the structure.  This works today
because the kvm_io_device is the first element of the private structure, but
this could change in the future, so lets clean this up.

Signed-off-by: Gregory Haskins ghask...@novell.com
Acked-by: Chris Wright chr...@sous-sol.org
Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
index 5ae620d..03ea280 100644
--- a/virt/kvm/coalesced_mmio.c
+++ b/virt/kvm/coalesced_mmio.c
@@ -80,7 +80,10 @@ static void coalesced_mmio_write(struct kvm_io_device *this,
 
 static void coalesced_mmio_destructor(struct kvm_io_device *this)
 {
-   kfree(this);
+   struct kvm_coalesced_mmio_dev *dev =
+   (struct kvm_coalesced_mmio_dev *)this-private;
+
+   kfree(dev);
 }
 
 int kvm_coalesced_mmio_init(struct kvm *kvm)
--
To unsubscribe from this list: send the line unsubscribe kvm-commits in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[COMMIT master] KVM: powerpc: fix some init/exit annotations

2009-06-02 Thread Avi Kivity
From: Stephen Rothwell s...@canb.auug.org.au

Fixes a couple of warnings like this one:

WARNING: arch/powerpc/kvm/kvm-440.o(.text+0x1e8c): Section mismatch in 
reference from the function kvmppc_44x_exit() to the function 
.exit.text:kvmppc_booke_exit()
The function kvmppc_44x_exit() references a function in an exit section.
Often the function kvmppc_booke_exit() has valid usage outside the exit section
and the fix is to remove the __exit annotation of kvmppc_booke_exit.

Also add some __init annotations on obvious routines.

Signed-off-by: Stephen Rothwell s...@canb.auug.org.au
Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
index 0cef809..f4d1b55 100644
--- a/arch/powerpc/kvm/44x.c
+++ b/arch/powerpc/kvm/44x.c
@@ -138,7 +138,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
kmem_cache_free(kvm_vcpu_cache, vcpu_44x);
 }
 
-static int kvmppc_44x_init(void)
+static int __init kvmppc_44x_init(void)
 {
int r;
 
@@ -149,7 +149,7 @@ static int kvmppc_44x_init(void)
return kvm_init(NULL, sizeof(struct kvmppc_vcpu_44x), THIS_MODULE);
 }
 
-static void kvmppc_44x_exit(void)
+static void __exit kvmppc_44x_exit(void)
 {
kvmppc_booke_exit();
 }
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 642e420..e7bf4d0 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -520,7 +520,7 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
return kvmppc_core_vcpu_translate(vcpu, tr);
 }
 
-int kvmppc_booke_init(void)
+int __init kvmppc_booke_init(void)
 {
unsigned long ivor[16];
unsigned long max_ivor = 0;
diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
index d8067fd..674e796 100644
--- a/arch/powerpc/kvm/e500.c
+++ b/arch/powerpc/kvm/e500.c
@@ -132,7 +132,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
kmem_cache_free(kvm_vcpu_cache, vcpu_e500);
 }
 
-static int kvmppc_e500_init(void)
+static int __init kvmppc_e500_init(void)
 {
int r, i;
unsigned long ivor[3];
@@ -160,7 +160,7 @@ static int kvmppc_e500_init(void)
return kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), THIS_MODULE);
 }
 
-static void kvmppc_e500_exit(void)
+static void __init kvmppc_e500_exit(void)
 {
kvmppc_booke_exit();
 }
--
To unsubscribe from this list: send the line unsubscribe kvm-commits in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[COMMIT master] KVM: do not register i8254 PIO regions until we are initialized

2009-06-02 Thread Avi Kivity
From: Gregory Haskins ghask...@novell.com

We currently publish the i8254 resources to the pio_bus before the devices
are fully initialized.  Since we hold the pit_lock, its probably not
a real issue.  But lets clean this up anyway.

Reported-by: Avi Kivity a...@redhat.com
Signed-off-by: Gregory Haskins ghask...@novell.com
Acked-by: Chris Wright chr...@sous-sol.org
Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 21301a2..f91b0e3 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -601,15 +601,6 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
mutex_lock(pit-pit_state.lock);
spin_lock_init(pit-pit_state.inject_lock);
 
-   /* Initialize PIO device */
-   kvm_iodevice_init(pit-dev, pit_dev_ops);
-   kvm_io_bus_register_dev(kvm-pio_bus, pit-dev);
-
-   if (flags  KVM_PIT_SPEAKER_DUMMY) {
-   kvm_iodevice_init(pit-speaker_dev, speaker_dev_ops);
-   kvm_io_bus_register_dev(kvm-pio_bus, pit-speaker_dev);
-   }
-
kvm-arch.vpit = pit;
pit-kvm = kvm;
 
@@ -628,6 +619,14 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
pit-mask_notifier.func = pit_mask_notifer;
kvm_register_irq_mask_notifier(kvm, 0, pit-mask_notifier);
 
+   kvm_iodevice_init(pit-dev, pit_dev_ops);
+   kvm_io_bus_register_dev(kvm-pio_bus, pit-dev);
+
+   if (flags  KVM_PIT_SPEAKER_DUMMY) {
+   kvm_iodevice_init(pit-speaker_dev, speaker_dev_ops);
+   kvm_io_bus_register_dev(kvm-pio_bus, pit-speaker_dev);
+   }
+
return pit;
 }
 
--
To unsubscribe from this list: send the line unsubscribe kvm-commits in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[COMMIT master] KVM: cleanup io_device code

2009-06-02 Thread Avi Kivity
From: Gregory Haskins ghask...@novell.com

We modernize the io_device code so that we use container_of() instead of
dev-private, and move the vtable to a separate ops structure
(theoretically allows better caching for multiple instances of the same
ops structure)

Signed-off-by: Gregory Haskins ghask...@novell.com
Acked-by: Chris Wright chr...@sous-sol.org
Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 584e3d3..21301a2 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -347,10 +347,20 @@ void kvm_pit_load_count(struct kvm *kvm, int channel, u32 
val)
mutex_unlock(kvm-arch.vpit-pit_state.lock);
 }
 
+static inline struct kvm_pit *dev_to_pit(struct kvm_io_device *dev)
+{
+   return container_of(dev, struct kvm_pit, dev);
+}
+
+static inline struct kvm_pit *speaker_to_pit(struct kvm_io_device *dev)
+{
+   return container_of(dev, struct kvm_pit, speaker_dev);
+}
+
 static void pit_ioport_write(struct kvm_io_device *this,
 gpa_t addr, int len, const void *data)
 {
-   struct kvm_pit *pit = (struct kvm_pit *)this-private;
+   struct kvm_pit *pit = dev_to_pit(this);
struct kvm_kpit_state *pit_state = pit-pit_state;
struct kvm *kvm = pit-kvm;
int channel, access;
@@ -423,7 +433,7 @@ static void pit_ioport_write(struct kvm_io_device *this,
 static void pit_ioport_read(struct kvm_io_device *this,
gpa_t addr, int len, void *data)
 {
-   struct kvm_pit *pit = (struct kvm_pit *)this-private;
+   struct kvm_pit *pit = dev_to_pit(this);
struct kvm_kpit_state *pit_state = pit-pit_state;
struct kvm *kvm = pit-kvm;
int ret, count;
@@ -494,7 +504,7 @@ static int pit_in_range(struct kvm_io_device *this, gpa_t 
addr,
 static void speaker_ioport_write(struct kvm_io_device *this,
 gpa_t addr, int len, const void *data)
 {
-   struct kvm_pit *pit = (struct kvm_pit *)this-private;
+   struct kvm_pit *pit = speaker_to_pit(this);
struct kvm_kpit_state *pit_state = pit-pit_state;
struct kvm *kvm = pit-kvm;
u32 val = *(u32 *) data;
@@ -508,7 +518,7 @@ static void speaker_ioport_write(struct kvm_io_device *this,
 static void speaker_ioport_read(struct kvm_io_device *this,
gpa_t addr, int len, void *data)
 {
-   struct kvm_pit *pit = (struct kvm_pit *)this-private;
+   struct kvm_pit *pit = speaker_to_pit(this);
struct kvm_kpit_state *pit_state = pit-pit_state;
struct kvm *kvm = pit-kvm;
unsigned int refresh_clock;
@@ -560,6 +570,18 @@ static void pit_mask_notifer(struct kvm_irq_mask_notifier 
*kimn, bool mask)
}
 }
 
+static const struct kvm_io_device_ops pit_dev_ops = {
+   .read = pit_ioport_read,
+   .write= pit_ioport_write,
+   .in_range = pit_in_range,
+};
+
+static const struct kvm_io_device_ops speaker_dev_ops = {
+   .read = speaker_ioport_read,
+   .write= speaker_ioport_write,
+   .in_range = speaker_in_range,
+};
+
 struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
 {
struct kvm_pit *pit;
@@ -580,17 +602,11 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
spin_lock_init(pit-pit_state.inject_lock);
 
/* Initialize PIO device */
-   pit-dev.read = pit_ioport_read;
-   pit-dev.write = pit_ioport_write;
-   pit-dev.in_range = pit_in_range;
-   pit-dev.private = pit;
+   kvm_iodevice_init(pit-dev, pit_dev_ops);
kvm_io_bus_register_dev(kvm-pio_bus, pit-dev);
 
if (flags  KVM_PIT_SPEAKER_DUMMY) {
-   pit-speaker_dev.read = speaker_ioport_read;
-   pit-speaker_dev.write = speaker_ioport_write;
-   pit-speaker_dev.in_range = speaker_in_range;
-   pit-speaker_dev.private = pit;
+   kvm_iodevice_init(pit-speaker_dev, speaker_dev_ops);
kvm_io_bus_register_dev(kvm-pio_bus, pit-speaker_dev);
}
 
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 1ccb50c..2520922 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -444,10 +444,15 @@ static int picdev_in_range(struct kvm_io_device *this, 
gpa_t addr,
}
 }
 
+static inline struct kvm_pic *to_pic(struct kvm_io_device *dev)
+{
+   return container_of(dev, struct kvm_pic, dev);
+}
+
 static void picdev_write(struct kvm_io_device *this,
 gpa_t addr, int len, const void *val)
 {
-   struct kvm_pic *s = this-private;
+   struct kvm_pic *s = to_pic(this);
unsigned char data = *(unsigned char *)val;
 
if (len != 1) {
@@ -474,7 +479,7 @@ static void picdev_write(struct kvm_io_device *this,
 static void picdev_read(struct kvm_io_device *this,
gpa_t addr, int len, void *val)
 {
-   struct kvm_pic *s = this-private;
+   struct kvm_pic *s = 

[COMMIT master] KVM: VMX: Avoid duplicate ept tlb flush when setting cr3

2009-06-02 Thread Avi Kivity
From: Avi Kivity a...@redhat.com

vmx_set_cr3() will call vmx_tlb_flush(), which will flush the ept context.
So there is no need to call ept_sync_context() explicitly.

Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 25f1239..5607de8 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1642,7 +1642,6 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned 
long cr3)
if (enable_ept) {
eptp = construct_eptp(cr3);
vmcs_write64(EPT_POINTER, eptp);
-   ept_sync_context(eptp);
ept_load_pdptrs(vcpu);
guest_cr3 = is_paging(vcpu) ? vcpu-arch.cr3 :
VMX_EPT_IDENTITY_PAGETABLE_ADDR;
--
To unsubscribe from this list: send the line unsubscribe kvm-commits in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[COMMIT master] KVM: VMX: Simplify pdptr and cr3 management

2009-06-02 Thread Avi Kivity
From: Avi Kivity a...@redhat.com

Instead of reading the PDPTRs from memory after every exit (which is slow
and wrong, as the PDPTRs are stored on the cpu), sync the PDPTRs from
memory to the VMCS before entry, and from the VMCS to memory after exit.
Do the same for cr3.

Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5607de8..1783606 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1538,10 +1538,6 @@ static void vmx_decache_cr4_guest_bits(struct kvm_vcpu 
*vcpu)
 static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
 {
if (is_paging(vcpu)  is_pae(vcpu)  !is_long_mode(vcpu)) {
-   if (!load_pdptrs(vcpu, vcpu-arch.cr3)) {
-   printk(KERN_ERR EPT: Fail to load pdptrs!\n);
-   return;
-   }
vmcs_write64(GUEST_PDPTR0, vcpu-arch.pdptrs[0]);
vmcs_write64(GUEST_PDPTR1, vcpu-arch.pdptrs[1]);
vmcs_write64(GUEST_PDPTR2, vcpu-arch.pdptrs[2]);
@@ -1549,6 +1545,16 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
}
 }
 
+static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
+{
+   if (is_paging(vcpu)  is_pae(vcpu)  !is_long_mode(vcpu)) {
+   vcpu-arch.pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
+   vcpu-arch.pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
+   vcpu-arch.pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
+   vcpu-arch.pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
+   }
+}
+
 static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
 
 static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
@@ -1642,7 +1648,6 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned 
long cr3)
if (enable_ept) {
eptp = construct_eptp(cr3);
vmcs_write64(EPT_POINTER, eptp);
-   ept_load_pdptrs(vcpu);
guest_cr3 = is_paging(vcpu) ? vcpu-arch.cr3 :
VMX_EPT_IDENTITY_PAGETABLE_ADDR;
}
@@ -3199,7 +3204,7 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, 
struct kvm_vcpu *vcpu)
 * to sync with guest real CR3. */
if (enable_ept  is_paging(vcpu)) {
vcpu-arch.cr3 = vmcs_readl(GUEST_CR3);
-   ept_load_pdptrs(vcpu);
+   ept_save_pdptrs(vcpu);
}
 
if (unlikely(vmx-fail)) {
@@ -3376,6 +3381,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct 
kvm_run *kvm_run)
 {
struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+   if (enable_ept  is_paging(vcpu)) {
+   vmcs_writel(GUEST_CR3, vcpu-arch.cr3);
+   ept_load_pdptrs(vcpu);
+   }
/* Record the guest's net vcpu time for enforced NMI injections. */
if (unlikely(!cpu_has_virtual_nmis()  vmx-soft_vnmi_blocked))
vmx-entry_time = ktime_get();
--
To unsubscribe from this list: send the line unsubscribe kvm-commits in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[COMMIT master] KVM: Cache pdptrs

2009-06-02 Thread Avi Kivity
From: Avi Kivity a...@redhat.com

Instead of reloading the pdptrs on every entry and exit (vmcs writes on vmx,
guest memory access on svm) extract them on demand.

Signed-off-by: Avi Kivity a...@redhat.com

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d2b082d..1951d39 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -120,6 +120,10 @@ enum kvm_reg {
NR_VCPU_REGS
 };
 
+enum kvm_reg_ex {
+   VCPU_EXREG_PDPTR = NR_VCPU_REGS,
+};
+
 enum {
VCPU_SREG_ES,
VCPU_SREG_CS,
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 1ff819d..7bcc5b6 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -29,4 +29,13 @@ static inline void kvm_rip_write(struct kvm_vcpu *vcpu, 
unsigned long val)
kvm_register_write(vcpu, VCPU_REGS_RIP, val);
 }
 
+static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
+{
+   if (!test_bit(VCPU_EXREG_PDPTR,
+ (unsigned long *)vcpu-arch.regs_avail))
+   kvm_x86_ops-cache_reg(vcpu, VCPU_EXREG_PDPTR);
+
+   return vcpu-arch.pdptrs[index];
+}
+
 #endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7030b5f..809cce0 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -18,6 +18,7 @@
  */
 
 #include mmu.h
+#include kvm_cache_regs.h
 
 #include linux/kvm_host.h
 #include linux/types.h
@@ -1930,6 +1931,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
gfn_t root_gfn;
struct kvm_mmu_page *sp;
int direct = 0;
+   u64 pdptr;
 
root_gfn = vcpu-arch.cr3  PAGE_SHIFT;
 
@@ -1957,11 +1959,12 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 
ASSERT(!VALID_PAGE(root));
if (vcpu-arch.mmu.root_level == PT32E_ROOT_LEVEL) {
-   if (!is_present_pte(vcpu-arch.pdptrs[i])) {
+   pdptr = kvm_pdptr_read(vcpu, i);
+   if (!is_present_pte(pdptr)) {
vcpu-arch.mmu.pae_root[i] = 0;
continue;
}
-   root_gfn = vcpu-arch.pdptrs[i]  PAGE_SHIFT;
+   root_gfn = pdptr  PAGE_SHIFT;
} else if (vcpu-arch.mmu.root_level == 0)
root_gfn = 0;
if (mmu_check_root(vcpu, root_gfn))
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 67785f6..4cb1dbf 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -131,7 +131,7 @@ walk:
pte = vcpu-arch.cr3;
 #if PTTYPE == 64
if (!is_long_mode(vcpu)) {
-   pte = vcpu-arch.pdptrs[(addr  30)  3];
+   pte = kvm_pdptr_read(vcpu, (addr  30)  3);
if (!is_present_pte(pte))
goto not_present;
--walker-level;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 3ac45e3..37397f6 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -779,6 +779,18 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned 
long rflags)
to_svm(vcpu)-vmcb-save.rflags = rflags;
 }
 
+static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
+{
+   switch (reg) {
+   case VCPU_EXREG_PDPTR:
+   BUG_ON(!npt_enabled);
+   load_pdptrs(vcpu, vcpu-arch.cr3);
+   break;
+   default:
+   BUG();
+   }
+}
+
 static void svm_set_vintr(struct vcpu_svm *svm)
 {
svm-vmcb-control.intercept |= 1ULL  INTERCEPT_VINTR;
@@ -2286,12 +2298,6 @@ static int handle_exit(struct kvm_run *kvm_run, struct 
kvm_vcpu *vcpu)
}
vcpu-arch.cr0 = svm-vmcb-save.cr0;
vcpu-arch.cr3 = svm-vmcb-save.cr3;
-   if (is_paging(vcpu)  is_pae(vcpu)  !is_long_mode(vcpu)) {
-   if (!load_pdptrs(vcpu, vcpu-arch.cr3)) {
-   kvm_inject_gp(vcpu, 0);
-   return 1;
-   }
-   }
if (mmu_reload) {
kvm_mmu_reset_context(vcpu);
kvm_mmu_load(vcpu);
@@ -2642,6 +2648,11 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct 
kvm_run *kvm_run)
 
svm-next_rip = 0;
 
+   if (npt_enabled) {
+   vcpu-arch.regs_avail = ~(1  VCPU_EXREG_PDPTR);
+   vcpu-arch.regs_dirty = ~(1  VCPU_EXREG_PDPTR);
+   }
+
svm_complete_interrupts(svm);
 }
 
@@ -2750,6 +2761,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.set_gdt = svm_set_gdt,
.get_dr = svm_get_dr,
.set_dr = svm_set_dr,
+   .cache_reg = svm_cache_reg,
.get_rflags = svm_get_rflags,
.set_rflags = svm_set_rflags,
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 1783606..fd05fd2 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ 

[PATCH] provide a kvm-free pic implementation

2009-06-02 Thread Glauber Costa
This patch separates all i8259 specific kvm implementation
in separate functions. This way, qemu can have an i8259 device
free from any kvm interference. kvm can have a simpler code path
(we may even be able to compile out qemu's pic in the future if
we want to), and everybody gets happy.

Signed-off-by: Glauber Costa glom...@redhat.com
---
 hw/i8259.c |  210 
 hw/pc.c|7 ++-
 hw/pc.h|1 +
 3 files changed, 134 insertions(+), 84 deletions(-)

diff --git a/hw/i8259.c b/hw/i8259.c
index bfc7fd9..132f294 100644
--- a/hw/i8259.c
+++ b/hw/i8259.c
@@ -185,16 +185,6 @@ int64_t irq_time[16];
 static void i8259_set_irq(void *opaque, int irq, int level)
 {
 PicState2 *s = opaque;
-#ifdef KVM_CAP_IRQCHIP
-if (kvm_enabled()) {
-int pic_ret;
-if (kvm_set_irq(irq, level, pic_ret)) {
-if (pic_ret != 0)
-apic_set_irq_delivered();
-return;
-}
-}
-#endif
 #if defined(DEBUG_PIC) || defined(DEBUG_IRQ_COUNT)
 if (level != irq_level[irq]) {
 #if defined(DEBUG_PIC)
@@ -478,77 +468,10 @@ static uint32_t elcr_ioport_read(void *opaque, uint32_t 
addr1)
 return s-elcr;
 }
 
-static void kvm_kernel_pic_save_to_user(PicState *s)
-{
-#if defined(KVM_CAP_IRQCHIP)  defined(TARGET_I386)
-struct kvm_irqchip chip;
-struct kvm_pic_state *kpic;
-
-chip.chip_id = (s-pics_state-pics[0] == s) ?
-   KVM_IRQCHIP_PIC_MASTER :
-   KVM_IRQCHIP_PIC_SLAVE;
-kvm_get_irqchip(kvm_context, chip);
-kpic = chip.chip.pic;
-
-s-last_irr = kpic-last_irr;
-s-irr = kpic-irr;
-s-imr = kpic-imr;
-s-isr = kpic-isr;
-s-priority_add = kpic-priority_add;
-s-irq_base = kpic-irq_base;
-s-read_reg_select = kpic-read_reg_select;
-s-poll = kpic-poll;
-s-special_mask = kpic-special_mask;
-s-init_state = kpic-init_state;
-s-auto_eoi = kpic-auto_eoi;
-s-rotate_on_auto_eoi = kpic-rotate_on_auto_eoi;
-s-special_fully_nested_mode = kpic-special_fully_nested_mode;
-s-init4 = kpic-init4;
-s-elcr = kpic-elcr;
-s-elcr_mask = kpic-elcr_mask;
-#endif
-}
-
-static void kvm_kernel_pic_load_from_user(PicState *s)
-{
-#if defined(KVM_CAP_IRQCHIP)  defined(TARGET_I386)
-struct kvm_irqchip chip;
-struct kvm_pic_state *kpic;
-
-chip.chip_id = (s-pics_state-pics[0] == s) ?
-   KVM_IRQCHIP_PIC_MASTER :
-   KVM_IRQCHIP_PIC_SLAVE;
-kpic = chip.chip.pic;
-
-kpic-last_irr = s-last_irr;
-kpic-irr = s-irr;
-kpic-imr = s-imr;
-kpic-isr = s-isr;
-kpic-priority_add = s-priority_add;
-kpic-irq_base = s-irq_base;
-kpic-read_reg_select = s-read_reg_select;
-kpic-poll = s-poll;
-kpic-special_mask = s-special_mask;
-kpic-init_state = s-init_state;
-kpic-auto_eoi = s-auto_eoi;
-kpic-rotate_on_auto_eoi = s-rotate_on_auto_eoi;
-kpic-special_fully_nested_mode = s-special_fully_nested_mode;
-kpic-init4 = s-init4;
-kpic-elcr = s-elcr;
-kpic-elcr_mask = s-elcr_mask;
-
-kvm_set_irqchip(kvm_context, chip);
-#endif
-}
-
-static void pic_save(QEMUFile *f, void *opaque)
+static void pic_save_common(QEMUFile *f, void *opaque)
 {
 PicState *s = opaque;
 
-if (kvm_enabled()  qemu_kvm_irqchip_in_kernel()) {
-kvm_kernel_pic_save_to_user(s);
-}
-
 qemu_put_8s(f, s-last_irr);
 qemu_put_8s(f, s-irr);
 qemu_put_8s(f, s-imr);
@@ -567,7 +490,12 @@ static void pic_save(QEMUFile *f, void *opaque)
 qemu_put_8s(f, s-elcr);
 }
 
-static int pic_load(QEMUFile *f, void *opaque, int version_id)
+static void pic_save(QEMUFile *f, void *opaque)
+{
+pic_save_common(f, opaque);
+}
+
+static int pic_load_common(QEMUFile *f, void *opaque, int version_id)
 {
 PicState *s = opaque;
 
@@ -591,13 +519,14 @@ static int pic_load(QEMUFile *f, void *opaque, int 
version_id)
 qemu_get_8s(f, s-single_mode);
 qemu_get_8s(f, s-elcr);
 
-if (kvm_enabled()  qemu_kvm_irqchip_in_kernel()) {
-kvm_kernel_pic_load_from_user(s);
-}
-
 return 0;
 }
 
+static int pic_load(QEMUFile *f, void *opaque, int version_id)
+{
+return pic_load_common(f, opaque, version_id);
+}
+
 /* XXX: add generic master/slave system */
 static void pic_init1(int io_addr, int elcr_addr, PicState *s)
 {
@@ -668,3 +597,118 @@ void pic_set_alt_irq_func(PicState2 *s, SetIRQFunc 
*alt_irq_func,
 s-alt_irq_func = alt_irq_func;
 s-alt_irq_opaque = alt_irq_opaque;
 }
+
+#ifdef KVM_CAP_IRQCHIP
+static void kvm_kernel_pic_save_to_user(PicState *s)
+{
+#if defined(TARGET_I386)
+struct kvm_irqchip chip;
+struct kvm_pic_state *kpic;
+
+chip.chip_id = (s-pics_state-pics[0] == s) ?
+   KVM_IRQCHIP_PIC_MASTER :
+   KVM_IRQCHIP_PIC_SLAVE;
+kvm_get_irqchip(kvm_context, chip);
+kpic = chip.chip.pic;
+
+s-last_irr = kpic-last_irr;
+s-irr = kpic-irr;
+s-imr = kpic-imr;
+s-isr = 

Re: [PATCH][KVM-AUTOTEST] TAP network support in kvm-autotest

2009-06-02 Thread sudhir kumar
On Thu, May 14, 2009 at 4:10 PM, Michael Goldish mgold...@redhat.com wrote:
 Hi Jason,

 We already have patches that implement similar functionality here in
 TLV, as mentioned in the to-do list (item #4 under 'Framework').
 They're not yet committed upstream because they're still quite fresh.

 Still, your patch looks good and is quite similar to mine. The main
 difference is that I use MAC/IP address pools specified by the user,
 instead of random MACs with arp/nmap to detect the matching IP
 addresses.

 I will post my patch to the mailing list soon, but it will come
 together with quite a few other patches that I haven't posted yet, so
 please be patient.

 Comments/questions:

 Why do you use nmap in addition to arp? In what cases will arp not
 suffice? I'm a little put off by the fact that nmap imposes an
 additional requirement on the host. Three hosts I've tried don't come
 with nmap installed by default.
Using nmap is very slow. It puts lot of delay in the complete test execution.
Michael,
   How do your patches implement ip search of the guest? system arp
might not be sufficient as it can have only certain fix number of
entries and might not have entry for the newly assigned guest ip. I
would like to have a quick look on your patches.
thanks in advance.

 Please see additional comments below.

 - Jason Wang jasow...@redhat.com wrote:

 Hi All:
 This patch tries to add tap network support in kvm-autotest. Multiple
 nics connected to different bridges could be achieved through this
 script. Public bridge is important for testing real network traffic
 and migration. The patch gives each nic with randomly generated mac
 address. The ip address required in the test could be dynamically
 probed through nmap/arp. Only the ip address of first NIC is used
 through the test.

 Example:
 nics = nic1 nic2
 network = bridge
 bridge = switch
 ifup =/etc/qemu-ifup-switch
 ifdown =/etc/qemu-ifdown-switch

 This would make the virtual machine have two nics both of which are
 connected to a bridge with the name of 'switch'. Ifup/ifdown scripts
 are also specified.

 Another Example:
 nics = nic1 nic2
 network = bridge
 bridge = switch
 bridge_nic2 = virbr0
 ifup =/etc/qemu-ifup-switch
 ifup_nic2 = /etc/qemu-ifup-virbr0

 This would makes the virtual machine have two nics: nic1 are connected
 to bridge 'switch' and nci2 are connected to bridge 'virbr0'.

 Public mode and user mode nic could also be mixed:
 nics = nic1 nic2
 network = bridge
 network_nic2 = user

 Looking forward for comments and suggestions.

 From: jason jasow...@redhat.com
 Date: Wed, 13 May 2009 16:15:28 +0800
 Subject: [PATCH] Add tap networking support.

 ---
  client/tests/kvm_runtest_2/kvm_utils.py |    7 +++
  client/tests/kvm_runtest_2/kvm_vm.py    |   74
 ++-
  2 files changed, 69 insertions(+), 12 deletions(-)

 diff --git a/client/tests/kvm_runtest_2/kvm_utils.py
 b/client/tests/kvm_runtest_2/kvm_utils.py
 index be8ad95..0d1f7f8 100644
 --- a/client/tests/kvm_runtest_2/kvm_utils.py
 +++ b/client/tests/kvm_runtest_2/kvm_utils.py
 @@ -773,3 +773,10 @@ def md5sum_file(filename, size=None):
          size -= len(data)
      f.close()
      return o.hexdigest()
 +
 +def random_mac():
 +    mac=[0x00,0x16,0x30,
 +         random.randint(0x00,0x09),
 +         random.randint(0x00,0x09),
 +         random.randint(0x00,0x09)]
 +    return ':'.join(map(lambda x: %02x %x,mac))

 Random MAC addresses will not necessarily work everywhere, as far as
 I know. That's why I prefer user specified MAC/IP address ranges.

 diff --git a/client/tests/kvm_runtest_2/kvm_vm.py
 b/client/tests/kvm_runtest_2/kvm_vm.py
 index fab839f..ea7dab6 100644
 --- a/client/tests/kvm_runtest_2/kvm_vm.py
 +++ b/client/tests/kvm_runtest_2/kvm_vm.py
 @@ -105,6 +105,10 @@ class VM:
          self.qemu_path = qemu_path
          self.image_dir = image_dir
          self.iso_dir = iso_dir
 +        self.macaddr = []
 +        for nic_name in kvm_utils.get_sub_dict_names(params,nics):
 +            macaddr = kvm_utils.random_mac()
 +            self.macaddr.append(macaddr)

      def verify_process_identity(self):
          Make sure .pid really points to the original qemu
 process.
 @@ -189,9 +193,25 @@ class VM:
          for nic_name in kvm_utils.get_sub_dict_names(params,
 nics):
              nic_params = kvm_utils.get_sub_dict(params, nic_name)
              qemu_cmd +=  -net nic,vlan=%d % vlan
 +            net = nic_params.get(network)
 +            if net == bridge:
 +                qemu_cmd += ,macaddr=%s % self.macaddr[vlan]
              if nic_params.get(nic_model):
                  qemu_cmd += ,model=%s % nic_params.get(nic_model)
 -            qemu_cmd +=  -net user,vlan=%d % vlan
 +            if net == bridge:
 +                qemu_cmd +=  -net tap,vlan=%d % vlan
 +                ifup = nic_params.get(ifup)
 +                if ifup:
 +                    qemu_cmd += ,script=%s % ifup
 +                else:
 +                   

Re: Fwd: kvm-autotest: False PASS results

2009-06-02 Thread sudhir kumar
On Mon, Jun 1, 2009 at 8:33 PM, Uri Lublin u...@redhat.com wrote:
 On 05/10/2009 08:15 PM, sudhir kumar wrote:

 Hi Uri,
 Any comments?


 -- Forwarded message --
 From: sudhir kumarsmalik...@gmail.com

 The kvm-autotest shows the following PASS results for migration,
 while the VM was crashed and test should have failed.

 Here is the sequence of test commands and results grepped from
 kvm-autotest output.


 /root/sudhir/regression/test/kvm-autotest-phx/client/tests/kvm_runtest_2/qemu
 -name 'vm1' -monitor
 unix:/tmp/monitor-20090508-055624-QSuS,server,nowait -drive

 file=/root/sudhir/regression/test/kvm-autotest-phx/client/tests/kvm_runtest_2/images/rhel5-32.raw,if=ide,boot=on
 -net nic,vlan=0 -net user,vlan=0 -m 8192
 -smp 4 -redir tcp:5000::22 -vnc :1



 /root/sudhir/regression/test/kvm-autotest-phx/client/tests/kvm_runtest_2/qemu
 -name 'dst' -monitor
 unix:/tmp/monitor-20090508-055625-iamW,server,nowait -drive

 file=/root/sudhir/regression/test/kvm-autotest-phx/client/tests/kvm_runtest_2/images/rhel5-32.raw,if=ide,boot=on
 -net nic,vlan=0 -net user,vlan=0 -m 8192
 -smp 4 -redir tcp:5001::22 -vnc :2 -incoming tcp:0:5200



 2009-05-08 05:58:43,471 Configuring logger for client level
                GOOD
 kvm_runtest_2.raw.8gb_mem.smp4.RHEL.5.3.i386.migrate.1
        END GOOD
 kvm_runtest_2.raw.8gb_mem.smp4.RHEL.5.3.i386.migrate.1

                GOOD
 kvm_runtest_2.raw.8gb_mem.smp4.RHEL.5.3.i386.migrate.2
 kvm_runtest_2.raw.8gb_mem.smp4.RHEL.5.3.i386.migrate.2
 timestamp=1241762371
 localtime=May 08 05:59:31       completed successfully
 Persistent state variable __group_level now set to 1
        END GOOD
 kvm_runtest_2.raw.8gb_mem.smp4.RHEL.5.3.i386.migrate.2
 kvm_runtest_2.raw.8gb_mem.smp4.RHEL.5.3.i386.migrate.2
 timestamp=1241762371
 localtime=May 08 05:59:31

  From the test output it looks that the test was succesful to
 log into the guest after migration:

 20090508-055926 raw.8gb_mem.smp4.RHEL.5.3.i386.migrate.2: Migration
 finished successfully
 20090508-055926 raw.8gb_mem.smp4.RHEL.5.3.i386.migrate.2: DEBUG:
 send_monitor_cmd: Sending monitor command: screendump

 /root/sudhir/regression/test/kvm-autotest-phx/client/results/default/kvm_runtest_2.raw.8gb_mem.smp4.RHEL.5.3.i386.migrate.2/debug/migration_post.ppm
 20090508-055926 raw.8gb_mem.smp4.RHEL.5.3.i386.migrate.2: DEBUG:
 send_monitor_cmd: Sending monitor command: screendump

 /root/sudhir/regression/test/kvm-autotest-phx/client/results/default/kvm_runtest_2.raw.8gb_mem.smp4.RHEL.5.3.i386.migrate.2/debug/migration_pre.ppm
 20090508-055926 raw.8gb_mem.smp4.RHEL.5.3.i386.migrate.2: DEBUG:
 send_monitor_cmd: Sending monitor command: quit
 20090508-055926 raw.8gb_mem.smp4.RHEL.5.3.i386.migrate.2: DEBUG:
 is_sshd_running: Timeout
 20090508-055926 raw.8gb_mem.smp4.RHEL.5.3.i386.migrate.2: Logging into
 guest after migration...
 20090508-055926 raw.8gb_mem.smp4.RHEL.5.3.i386.migrate.2: DEBUG:
 remote_login: Trying to login...
 20090508-055927 raw.8gb_mem.smp4.RHEL.5.3.i386.migrate.2: DEBUG:
 remote_login: Got 'Are you sure...'; sending 'yes'
 20090508-055927 raw.8gb_mem.smp4.RHEL.5.3.i386.migrate.2: DEBUG:
 remote_login: Got password prompt; sending '123456'
 20090508-055928 raw.8gb_mem.smp4.RHEL.5.3.i386.migrate.2: DEBUG:
 remote_login: Got shell prompt -- logged in
 20090508-055928 raw.8gb_mem.smp4.RHEL.5.3.i386.migrate.2: Logged in
 after migration
 20090508-055928 raw.8gb_mem.smp4.RHEL.5.3.i386.migrate.2: DEBUG:
 get_command_status_output: Sending command: help
 20090508-055930 raw.8gb_mem.smp4.RHEL.5.3.i386.migrate.2: DEBUG:
 postprocess_vm: Postprocessing VM 'vm1'...
 20090508-055930 raw.8gb_mem.smp4.RHEL.5.3.i386.migrate.2: DEBUG:
 postprocess_vm: VM object found in environment

 When I did vnc to the final migrated VM was crashed with a call trace
 as shown in the attachment.
 Quite less possible that the call trace appeared after the test
 finished as migration with memory
 more than 4GB is already broken [BUG 52527]. This looks a false PASS
 to me. Any idea how can we handle
 such falso positive results? Shall we wait for sometime after
 migration, log into the vm, do some work or run some good test,
 get output and report that the vm is alive?



 I don't think it's a False PASS.
 It seems the test was able to ssh into the guest, and run a command on the
 guest.

 Currently we only run migration once (round-trip). I think we should run
 migration more than once (using iterations). If the guest crashes due to
 migration, it would fail following rounds of migration.
Also I would like to have some scripts like basic_test.py which will
be executed inside the guest to check it's health to a more extent.
Though this will again need different scripts for windows/linux/mac
etc. Do you agree on it?


 Sorry for the late reply,
Its OK. Thanks for the response.
    Uri.




-- 
Sudhir Kumar
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo 

Re: [PATCH 3/3] KVM: Cache pdptrs

2009-06-02 Thread Joerg Roedel
On Mon, Jun 01, 2009 at 04:22:03PM +0300, Avi Kivity wrote:
 +static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
 +{
 + switch (reg) {
 + case VCPU_EXREG_PDPTR:
 + BUG_ON(!npt_enabled);
 + load_pdptrs(vcpu, vcpu-arch.cr3);
 + break;
 + default:
 + BUG();
 + }
 +}

Don't we need to check for the return value of load_pdptrs() here and inject
a #GP it it fails?

 +
  static void svm_set_vintr(struct vcpu_svm *svm)
  {
   svm-vmcb-control.intercept |= 1ULL  INTERCEPT_VINTR;
 @@ -2286,12 +2298,6 @@ static int handle_exit(struct kvm_run *kvm_run, struct 
 kvm_vcpu *vcpu)
   }
   vcpu-arch.cr0 = svm-vmcb-save.cr0;
   vcpu-arch.cr3 = svm-vmcb-save.cr3;
 - if (is_paging(vcpu)  is_pae(vcpu)  !is_long_mode(vcpu)) {
 - if (!load_pdptrs(vcpu, vcpu-arch.cr3)) {
 - kvm_inject_gp(vcpu, 0);
 - return 1;
 - }
 - }

... as done here.

Joerg

-- 
   | Advanced Micro Devices GmbH
 Operating | Karl-Hammerschmidt-Str. 34, 85609 Dornach bei München
 System| 
 Research  | Geschäftsführer: Thomas M. McCoy, Giuliano Meroni
 Center| Sitz: Dornach, Gemeinde Aschheim, Landkreis München
   | Registergericht München, HRB Nr. 43632

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/3] KVM: Cache pdptrs

2009-06-02 Thread Avi Kivity

Joerg Roedel wrote:

On Mon, Jun 01, 2009 at 04:22:03PM +0300, Avi Kivity wrote:
  

+static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
+{
+   switch (reg) {
+   case VCPU_EXREG_PDPTR:
+   BUG_ON(!npt_enabled);
+   load_pdptrs(vcpu, vcpu-arch.cr3);
+   break;
+   default:
+   BUG();
+   }
+}



Don't we need to check for the return value of load_pdptrs() here and inject
a #GP it it fails?
  


We're after some random exit, the guest won't be expecting a #GP in some 
random instruction.


The only options are ignore and triple fault.


+
 static void svm_set_vintr(struct vcpu_svm *svm)
 {
svm-vmcb-control.intercept |= 1ULL  INTERCEPT_VINTR;
@@ -2286,12 +2298,6 @@ static int handle_exit(struct kvm_run *kvm_run, struct 
kvm_vcpu *vcpu)
}
vcpu-arch.cr0 = svm-vmcb-save.cr0;
vcpu-arch.cr3 = svm-vmcb-save.cr3;
-   if (is_paging(vcpu)  is_pae(vcpu)  !is_long_mode(vcpu)) {
-   if (!load_pdptrs(vcpu, vcpu-arch.cr3)) {
-   kvm_inject_gp(vcpu, 0);
-   return 1;
-   }
-   }



... as done here.


That's a bug... luckily no guests trash their PDPTs after loading CR3.

I guess I should fix in a separate patch to avoid mixing a bugfix with a 
feature.


--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] KVM: VMX: Simplify pdptr and cr3 management

2009-06-02 Thread Sheng Yang
On Monday 01 June 2009 21:22:02 Avi Kivity wrote:
 Instead of reading the PDPTRs from memory after every exit (which is slow
 and wrong, as the PDPTRs are stored on the cpu), sync the PDPTRs from
 memory to the VMCS before entry, and from the VMCS to memory after exit.
 Do the same for cr3.


Thanks for fixing!

After review my original code, I found a potential bug. For SDM 3B have this:

23.3.4 Saving Non-Register State
...
If the logical processor supports the 1-setting of the “enable EPT” VM-
execution control, values are saved into the four (4) PDPTE fields as follows:
— If the “enable EPT” VM-execution control is 1 and the logical processor was
using PAE paging at the time of the VM exit, the PDPTE values currently in use
are saved:
• The values saved into bits 11:9 of each of the fields is undefined.
• If the value saved into one of the fields has bit 0 (present) clear, the 
value saved into bits 63:1 of that field is undefined. That value need not
correspond to the value that was loaded by VM entry or to any value that
might have been loaded in VMX non-root operation.
• If the value saved into one of the fields has bit 0 (present) set, the value
saved into bits 63:12 of the field is a guest-physical address.
— If the “enable EPT” VM-execution control is 0 or the logical processor was 
not using PAE paging at the time of the VM exit, the values saved are 
undefined.

But drop the ept_load_pdptrs() when exit and add it in cr0 handling result in 
Windows PAE guest hang on boot. I am checking it now. Any thoughts?...

-- 
regards
Yang, Sheng

 Signed-off-by: Avi Kivity a...@redhat.com
 ---
  arch/x86/kvm/vmx.c |   21 +++--
  1 files changed, 15 insertions(+), 6 deletions(-)

 diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
 index 5607de8..1783606 100644
 --- a/arch/x86/kvm/vmx.c
 +++ b/arch/x86/kvm/vmx.c
 @@ -1538,10 +1538,6 @@ static void vmx_decache_cr4_guest_bits(struct
 kvm_vcpu *vcpu) static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
  {
   if (is_paging(vcpu)  is_pae(vcpu)  !is_long_mode(vcpu)) {
 - if (!load_pdptrs(vcpu, vcpu-arch.cr3)) {
 - printk(KERN_ERR EPT: Fail to load pdptrs!\n);
 - return;
 - }
   vmcs_write64(GUEST_PDPTR0, vcpu-arch.pdptrs[0]);
   vmcs_write64(GUEST_PDPTR1, vcpu-arch.pdptrs[1]);
   vmcs_write64(GUEST_PDPTR2, vcpu-arch.pdptrs[2]);
 @@ -1549,6 +1545,16 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
   }
  }

 +static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
 +{
 + if (is_paging(vcpu)  is_pae(vcpu)  !is_long_mode(vcpu)) {
 + vcpu-arch.pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
 + vcpu-arch.pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
 + vcpu-arch.pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
 + vcpu-arch.pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
 + }
 +}
 +
  static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);

  static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
 @@ -1642,7 +1648,6 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu,
 unsigned long cr3) if (enable_ept) {
   eptp = construct_eptp(cr3);
   vmcs_write64(EPT_POINTER, eptp);
 - ept_load_pdptrs(vcpu);
   guest_cr3 = is_paging(vcpu) ? vcpu-arch.cr3 :
   VMX_EPT_IDENTITY_PAGETABLE_ADDR;
   }
 @@ -3199,7 +3204,7 @@ static int vmx_handle_exit(struct kvm_run *kvm_run,
 struct kvm_vcpu *vcpu) * to sync with guest real CR3. */
   if (enable_ept  is_paging(vcpu)) {
   vcpu-arch.cr3 = vmcs_readl(GUEST_CR3);
 - ept_load_pdptrs(vcpu);
 + ept_save_pdptrs(vcpu);
   }

   if (unlikely(vmx-fail)) {
 @@ -3376,6 +3381,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu,
 struct kvm_run *kvm_run) {
   struct vcpu_vmx *vmx = to_vmx(vcpu);

 + if (enable_ept  is_paging(vcpu)) {
 + vmcs_writel(GUEST_CR3, vcpu-arch.cr3);
 + ept_load_pdptrs(vcpu);
 + }
   /* Record the guest's net vcpu time for enforced NMI injections. */
   if (unlikely(!cpu_has_virtual_nmis()  vmx-soft_vnmi_blocked))
   vmx-entry_time = ktime_get();



--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] KVM: VMX: Simplify pdptr and cr3 management

2009-06-02 Thread Avi Kivity

Sheng Yang wrote:

On Monday 01 June 2009 21:22:02 Avi Kivity wrote:
  

Instead of reading the PDPTRs from memory after every exit (which is slow
and wrong, as the PDPTRs are stored on the cpu), sync the PDPTRs from
memory to the VMCS before entry, and from the VMCS to memory after exit.
Do the same for cr3.




Thanks for fixing!

After review my original code, I found a potential bug. For SDM 3B have this:

23.3.4 Saving Non-Register State
...
If the logical processor supports the 1-setting of the “enable EPT” VM-
execution control, values are saved into the four (4) PDPTE fields as follows:
— If the “enable EPT” VM-execution control is 1 and the logical processor was
using PAE paging at the time of the VM exit, the PDPTE values currently in use
are saved:
• The values saved into bits 11:9 of each of the fields is undefined.
• If the value saved into one of the fields has bit 0 (present) clear, the 
value saved into bits 63:1 of that field is undefined. That value need not

correspond to the value that was loaded by VM entry or to any value that
might have been loaded in VMX non-root operation.
• If the value saved into one of the fields has bit 0 (present) set, the value
saved into bits 63:12 of the field is a guest-physical address.
— If the “enable EPT” VM-execution control is 0 or the logical processor was 
not using PAE paging at the time of the VM exit, the values saved are 
undefined.


But drop the ept_load_pdptrs() when exit and add it in cr0 handling result in 
Windows PAE guest hang on boot. I am checking it now. Any thoughts?...
  


You mean with the new code?  What version of Windows exactly?

I'll check it out, though EPTs are a little hard to find here.

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] KVM: VMX: Simplify pdptr and cr3 management

2009-06-02 Thread Sheng Yang
On Tuesday 02 June 2009 17:26:27 Avi Kivity wrote:
 Sheng Yang wrote:
  On Monday 01 June 2009 21:22:02 Avi Kivity wrote:
  Instead of reading the PDPTRs from memory after every exit (which is
  slow and wrong, as the PDPTRs are stored on the cpu), sync the PDPTRs
  from memory to the VMCS before entry, and from the VMCS to memory after
  exit. Do the same for cr3.
 
  Thanks for fixing!
 
  After review my original code, I found a potential bug. For SDM 3B have
  this:
 
  23.3.4 Saving Non-Register State
  ...
  If the logical processor supports the 1-setting of the “enable EPT” VM-
  execution control, values are saved into the four (4) PDPTE fields as
  follows: — If the “enable EPT” VM-execution control is 1 and the logical
  processor was using PAE paging at the time of the VM exit, the PDPTE
  values currently in use are saved:
  • The values saved into bits 11:9 of each of the fields is undefined.
  • If the value saved into one of the fields has bit 0 (present) clear,
  the value saved into bits 63:1 of that field is undefined. That value
  need not correspond to the value that was loaded by VM entry or to any
  value that might have been loaded in VMX non-root operation.
  • If the value saved into one of the fields has bit 0 (present) set, the
  value saved into bits 63:12 of the field is a guest-physical address.
  — If the “enable EPT” VM-execution control is 0 or the logical processor
  was not using PAE paging at the time of the VM exit, the values saved are
  undefined.
 
  But drop the ept_load_pdptrs() when exit and add it in cr0 handling
  result in Windows PAE guest hang on boot. I am checking it now. Any
  thoughts?...

 You mean with the new code?  What version of Windows exactly?

 I'll check it out, though EPTs are a little hard to find here.

No, no, not with the new code. For CPU can load pdptrs if EPT enabled with PAE 
from VM exit, there should not be necessary load it explicitly. So I estimate 
the ept_load_pdptr() in exit handler, and put it in CR0 handling. Just tried 
to optimize load-pdptr according to the spec, but not got the desired 
result...

So I am trying to find the failure reason...

-- 
regards
Yang, Sheng
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/3] KVM: Cache pdptrs

2009-06-02 Thread Joerg Roedel
On Tue, Jun 02, 2009 at 12:09:17PM +0300, Avi Kivity wrote:
 Joerg Roedel wrote:
 On Mon, Jun 01, 2009 at 04:22:03PM +0300, Avi Kivity wrote:
   
 +static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
 +{
 +   switch (reg) {
 +   case VCPU_EXREG_PDPTR:
 +   BUG_ON(!npt_enabled);
 +   load_pdptrs(vcpu, vcpu-arch.cr3);
 +   break;
 +   default:
 +   BUG();
 +   }
 +}
 

 Don't we need to check for the return value of load_pdptrs() here and inject
 a #GP it it fails?
   

 We're after some random exit, the guest won't be expecting a #GP in some  
 random instruction.

 The only options are ignore and triple fault.

Thats not different from PAE with NPT anyways. With NPT the hardware
does not load all four pdptrs on cr3 switch time, only when they
are used.

Joerg

-- 
   | Advanced Micro Devices GmbH
 Operating | Karl-Hammerschmidt-Str. 34, 85609 Dornach bei München
 System| 
 Research  | Geschäftsführer: Thomas M. McCoy, Giuliano Meroni
 Center| Sitz: Dornach, Gemeinde Aschheim, Landkreis München
   | Registergericht München, HRB Nr. 43632

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/3] KVM: Cache pdptrs

2009-06-02 Thread Avi Kivity

Joerg Roedel wrote:

On Tue, Jun 02, 2009 at 12:09:17PM +0300, Avi Kivity wrote:
  

Joerg Roedel wrote:


On Mon, Jun 01, 2009 at 04:22:03PM +0300, Avi Kivity wrote:
  
  

+static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
+{
+   switch (reg) {
+   case VCPU_EXREG_PDPTR:
+   BUG_ON(!npt_enabled);
+   load_pdptrs(vcpu, vcpu-arch.cr3);
+   break;
+   default:
+   BUG();
+   }
+}



Don't we need to check for the return value of load_pdptrs() here and inject
a #GP it it fails?
  
  
We're after some random exit, the guest won't be expecting a #GP in some  
random instruction.


The only options are ignore and triple fault.



Thats not different from PAE with NPT anyways. With NPT the hardware
does not load all four pdptrs on cr3 switch time, only when they
are used.
  


That will at least cause a page fault on a related instruction.  But we 
can't #GP on a random exit.


--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] KVM: VMX: Simplify pdptr and cr3 management

2009-06-02 Thread Avi Kivity

Sheng Yang wrote:
No, no, not with the new code. For CPU can load pdptrs if EPT enabled with PAE 
from VM exit, there should not be necessary load it explicitly. So I estimate 
the ept_load_pdptr() in exit handler, and put it in CR0 handling. Just tried 
to optimize load-pdptr according to the spec, but not got the desired 
result...


So I am trying to find the failure reason...
  


so from your point of view, the patches are okay?

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: physical memory dump problem

2009-06-02 Thread Avi Kivity

Matteo Signorini wrote:

Hi!

I have a problem with both kvm pmemsave and memsave functions.
(I know that both invoke the cpu_physical_memory_rw function)

I'm trying to dump a guest physical memory region (sys_call_table for example).
I know that , for example, the sys_call_table is stored at
0x8133c620 phys addr (found in System.map)
  


System.map lists virtual addresses, not physical addresses.  Try memsave 
instead.



--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] KVM: VMX: Simplify pdptr and cr3 management

2009-06-02 Thread Sheng Yang
On Tuesday 02 June 2009 17:46:39 Avi Kivity wrote:
 Sheng Yang wrote:
  No, no, not with the new code. For CPU can load pdptrs if EPT enabled
  with PAE from VM exit, there should not be necessary load it explicitly.
  So I estimate the ept_load_pdptr() in exit handler, and put it in CR0
  handling. Just tried to optimize load-pdptr according to the spec, but
  not got the desired result...
 
  So I am trying to find the failure reason...

 so from your point of view, the patches are okay?

Yes. But we should can optimize it more by avoiding most of PDPTR loading, 
according to the spec.

-- 
regards
Yang, Sheng
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] powerpc/kvm: fix some init/exit annotations

2009-06-02 Thread Avi Kivity

Stephen Rothwell wrote:

Fixes a couple of warnings like this one:

WARNING: arch/powerpc/kvm/kvm-440.o(.text+0x1e8c): Section mismatch in 
reference from the function kvmppc_44x_exit() to the function 
.exit.text:kvmppc_booke_exit()
The function kvmppc_44x_exit() references a function in an exit section.
Often the function kvmppc_booke_exit() has valid usage outside the exit section
and the fix is to remove the __exit annotation of kvmppc_booke_exit.
  


Applied, thanks.

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [KVM PATCH v3 0/3] mmio/pio cleanup

2009-06-02 Thread Avi Kivity

Gregory Haskins wrote:

(This is v3 of the series, applies to kvm.git/master:7ff90748)

This is in prep for some more substantial mmio/pio work for iosignalfd,
coming later.

[ Changelog:

  v3:
  *) Addressed feedback from Chris Wright w.r.t. patch 2/3
  *) Rebased to kvm.git/master:7ff90748

]
  


All in, thanks.

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] KVM: VMX: Simplify pdptr and cr3 management

2009-06-02 Thread Avi Kivity

Sheng Yang wrote:

On Tuesday 02 June 2009 17:46:39 Avi Kivity wrote:
  

Sheng Yang wrote:


No, no, not with the new code. For CPU can load pdptrs if EPT enabled
with PAE from VM exit, there should not be necessary load it explicitly.
So I estimate the ept_load_pdptr() in exit handler, and put it in CR0
handling. Just tried to optimize load-pdptr according to the spec, but
not got the desired result...

So I am trying to find the failure reason...
  

so from your point of view, the patches are okay?



Yes. But we should can optimize it more by avoiding most of PDPTR loading, 
according to the spec.
  


Patch 3 makes reloading a very rare event.  It only happens on pae mode 
changes or if userspace sets cr3.


--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Read MADT entries from memory in processors _MAT method.

2009-06-02 Thread Avi Kivity

Gleb Natapov wrote:

Also use enable/disable bit from ACPI MADT entries to track CPU hot
plug. This removes assumptions about APIC ids from AML code and simplify
cup hotplug handling.

  


Applied, thanks.

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] qemu-kvm: Flush icache after dma operations for ia64

2009-06-02 Thread Jes Sorensen

Avi Kivity wrote:

Zhang, Xiantao wrote:
Fine to me.  But seems the change in qemu_iovec_from_buffer is lost in 
this patch or that change is also not unnecessary ?


I think the fixed unmap handles that case.  Can you test to make sure?



Avi and I went through the code and verified that it was all covered by
the unmap case. It was pretty messy to come to the conclusion, which is
why I tried to document it in the code and the patch description.

Cheers,
Jes
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] KVM: VMX: Simplify pdptr and cr3 management

2009-06-02 Thread Sheng Yang
On Tuesday 02 June 2009 18:16:14 Avi Kivity wrote:
 Sheng Yang wrote:
  On Tuesday 02 June 2009 17:46:39 Avi Kivity wrote:
  Sheng Yang wrote:
  No, no, not with the new code. For CPU can load pdptrs if EPT enabled
  with PAE from VM exit, there should not be necessary load it
  explicitly. So I estimate the ept_load_pdptr() in exit handler, and put
  it in CR0 handling. Just tried to optimize load-pdptr according to the
  spec, but not got the desired result...
 
  So I am trying to find the failure reason...
 
  so from your point of view, the patches are okay?
 
  Yes. But we should can optimize it more by avoiding most of PDPTR
  loading, according to the spec.

 Patch 3 makes reloading a very rare event.  It only happens on pae mode
 changes or if userspace sets cr3.

Yes, you are right. Quite beautiful. :)

Sorry for misunderstood the meaning of on demand without looking at the 
patch carefully...

Looks fine now. Thanks!

BTW: one extra blank line above function kvm_pdptr_read() in patch 3. ;)

-- 
regards
Yang, Sheng
 



--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] KVM: VMX: Simplify pdptr and cr3 management

2009-06-02 Thread Avi Kivity

Sheng Yang wrote:

Yes, you are right. Quite beautiful. :)
  


Thanks :)

It's basically a continuation of RIP/RSP caching.  I plan to continue it 
for cr0/cr3.


Sorry for misunderstood the meaning of on demand without looking at the 
patch carefully...


Looks fine now. Thanks!

BTW: one extra blank line above function kvm_pdptr_read() in patch 3. ;)
  


Will fix.

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/3] KVM: Cache pdptrs

2009-06-02 Thread Avi Kivity

Avi Kivity wrote:

+
 static void svm_set_vintr(struct vcpu_svm *svm)
 {
 svm-vmcb-control.intercept |= 1ULL  INTERCEPT_VINTR;
@@ -2286,12 +2298,6 @@ static int handle_exit(struct kvm_run 
*kvm_run, struct kvm_vcpu *vcpu)

 }
 vcpu-arch.cr0 = svm-vmcb-save.cr0;
 vcpu-arch.cr3 = svm-vmcb-save.cr3;
-if (is_paging(vcpu)  is_pae(vcpu)  !is_long_mode(vcpu)) {
-if (!load_pdptrs(vcpu, vcpu-arch.cr3)) {
-kvm_inject_gp(vcpu, 0);
-return 1;
-}
-}



... as done here.


That's a bug... luckily no guests trash their PDPTs after loading CR3.

I guess I should fix in a separate patch to avoid mixing a bugfix with 
a feature.




That could cause a regression if we catch a guest preparing a new 
pdpte.  So I'll leave the code as is.


--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] Fix tpr patching to get cpu index from Windows per cpu area.

2009-06-02 Thread Gleb Natapov
Using cpu_index for this purpose work only by luck.

Signed-off-by: Gleb Natapov g...@redhat.com
diff --git a/kvm-tpr-opt.c b/kvm-tpr-opt.c
index 246e08d..bdbc742 100644
--- a/kvm-tpr-opt.c
+++ b/kvm-tpr-opt.c
@@ -220,13 +220,29 @@ static int bios_is_mapped(CPUState *env, uint64_t rip)
 return 1;
 }
 
+static int get_pcr_cpu(CPUState *env)
+{
+struct kvm_sregs sregs;
+uint8_t b;
+
+kvm_save_registers(env);
+
+if (cpu_memory_rw_debug(env, env-segs[R_FS].base + 0x51, b, 1, 0)  0)
+   return -1;
+
+return (int)b;
+}
+
 static int enable_vapic(CPUState *env)
 {
 static uint8_t one = 1;
+int pcr_cpu = get_pcr_cpu(env);
+
+if (pcr_cpu  0)
+   return 0;
 
-kvm_enable_vapic(kvm_context, env-cpu_index,
-vapic_phys + (env-cpu_index  7));
-cpu_physical_memory_rw(vapic_phys + (env-cpu_index  7) + 4, one, 1, 1);
+kvm_enable_vapic(kvm_context, env-cpu_index, vapic_phys + (pcr_cpu  7));
+cpu_physical_memory_rw(vapic_phys + (pcr_cpu  7) + 4, one, 1, 1);
 bios_enabled = 1;
 
 return 1;
--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Fix tpr patching to get cpu index from Windows per cpu area.

2009-06-02 Thread Avi Kivity

Gleb Natapov wrote:

Using cpu_index for this purpose work only by luck.

  


Applied, thanks.

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] clean up cpu hotplug code

2009-06-02 Thread Avi Kivity

Glauber Costa wrote:

There's nothing kvm specific in get_cpu function. Remove it from
kvm ifdef. Buy us a cleaner code, and may help us with any attempt
of integrating this on the future.

Signed-off-by: Glauber Costa glom...@redhat.com
---
 hw/acpi.c |   12 +++-
 1 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/hw/acpi.c b/hw/acpi.c
index f4062ac..7f23e4e 100644
--- a/hw/acpi.c
+++ b/hw/acpi.c
@@ -770,9 +770,7 @@ static void disable_processor(struct gpe_regs *g, int cpu)
 g-cpus_sts[cpu/8] = ~(1  (cpu%8));
 }
 
-#if defined(TARGET_I386) || defined(TARGET_X86_64)

-#ifdef USE_KVM
-static CPUState *qemu_kvm_cpu_env(int index)
+static CPUState *qemu_get_cpu_env(int index)
 {
 CPUState *penv;
 
@@ -786,18 +784,14 @@ static CPUState *qemu_kvm_cpu_env(int index)
 
 return NULL;

 }
-#endif
  


Want this cause an undefined function warning on non-x86 targets?

If you send two identical patches, please version them and note the changes.

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] provide a kvm-free pic implementation

2009-06-02 Thread Avi Kivity

Glauber Costa wrote:

This patch separates all i8259 specific kvm implementation
in separate functions. This way, qemu can have an i8259 device
free from any kvm interference. kvm can have a simpler code path
(we may even be able to compile out qemu's pic in the future if
we want to), and everybody gets happy.
  


Applied, thanks.

--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC v3 3/4] Break dependency between vcpu index in vcpus array and vcpu_id.

2009-06-02 Thread Marcelo Tosatti
On Thu, May 28, 2009 at 12:54:46PM +0300, Gleb Natapov wrote:
 Archs are free to use vcpu_id as they see fit. For x86 it is used as
 vcpu's apic id. New ioctl is added to configure boot vcpu id that was
 assumed to be 0 till now.
 
 Signed-off-by: Gleb Natapov g...@redhat.com
 ---
  include/linux/kvm.h  |2 +
  include/linux/kvm_host.h |2 +
  virt/kvm/kvm_main.c  |   50 ++---
  3 files changed, 33 insertions(+), 21 deletions(-)
 
 diff --git a/include/linux/kvm.h b/include/linux/kvm.h
 index 632a856..d10ab5d 100644
 --- a/include/linux/kvm.h
 +++ b/include/linux/kvm.h
 @@ -430,6 +430,7 @@ struct kvm_trace_rec {
  #ifdef __KVM_HAVE_PIT
  #define KVM_CAP_PIT2 33
  #endif
 +#define KVM_CAP_SET_BOOT_CPU_ID 34
  
  #ifdef KVM_CAP_IRQ_ROUTING
  
 @@ -537,6 +538,7 @@ struct kvm_irqfd {
  #define KVM_DEASSIGN_DEV_IRQ   _IOW(KVMIO, 0x75, struct kvm_assigned_irq)
  #define KVM_IRQFD  _IOW(KVMIO, 0x76, struct kvm_irqfd)
  #define KVM_CREATE_PIT2 _IOW(KVMIO, 0x77, struct 
 kvm_pit_config)
 +#define KVM_SET_BOOT_CPU_ID_IO(KVMIO, 0x78)
  
  /*
   * ioctls for vcpu fds
 diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
 index e9e0cd8..e368a14 100644
 --- a/include/linux/kvm_host.h
 +++ b/include/linux/kvm_host.h
 @@ -130,8 +130,10 @@ struct kvm {
   int nmemslots;
   struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS +
   KVM_PRIVATE_MEM_SLOTS];
 + u32 bsp_vcpu_id;
   struct kvm_vcpu *bsp_vcpu;
   struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
 + atomic_t online_vcpus;
   struct list_head vm_list;
   struct kvm_io_bus mmio_bus;
   struct kvm_io_bus pio_bus;
 diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
 index 5a55fe0..d65c637 100644
 --- a/virt/kvm/kvm_main.c
 +++ b/virt/kvm/kvm_main.c
 @@ -693,11 +693,6 @@ out:
  }
  #endif
  
 -static inline int valid_vcpu(int n)
 -{
 - return likely(n = 0  n  KVM_MAX_VCPUS);
 -}
 -
  inline int kvm_is_mmio_pfn(pfn_t pfn)
  {
   if (pfn_valid(pfn)) {
 @@ -1713,15 +1708,12 @@ static int create_vcpu_fd(struct kvm_vcpu *vcpu)
  /*
   * Creates some virtual cpus.  Good luck creating more than one.
   */
 -static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
 +static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
  {
   int r;
   struct kvm_vcpu *vcpu;
  
 - if (!valid_vcpu(n))
 - return -EINVAL;
 -
 - vcpu = kvm_arch_vcpu_create(kvm, n);
 + vcpu = kvm_arch_vcpu_create(kvm, id);
   if (IS_ERR(vcpu))
   return PTR_ERR(vcpu);
  
 @@ -1732,25 +1724,36 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, 
 int n)
   return r;
  
   mutex_lock(kvm-lock);
 - if (kvm-vcpus[n]) {
 - r = -EEXIST;
 + if (atomic_read(kvm-online_vcpus) == KVM_MAX_VCPUS) {
 + r = -EINVAL;
   goto vcpu_destroy;
   }
 - kvm-vcpus[n] = vcpu;
 - if (n == 0)
 - kvm-bsp_vcpu = vcpu;
 - mutex_unlock(kvm-lock);
 +
 + for (r = 0; r  atomic_read(kvm-online_vcpus); r++)
 + if (kvm-vcpus[r]-vcpu_id == id) {
 + r = -EEXIST;
 + goto vcpu_destroy;
 + }
 +
 + BUG_ON(kvm-vcpus[atomic_read(kvm-online_vcpus)]);
  
   /* Now it's all set up, let userspace reach it */
   kvm_get_kvm(kvm);
   r = create_vcpu_fd(vcpu);
 - if (r  0)
 - goto unlink;
 + if (r  0) {
 + kvm_put_kvm(kvm);
 + goto vcpu_destroy;
 + }
 +
 + kvm-vcpus[atomic_read(kvm-online_vcpus)] = vcpu;
 + smp_wmb();
 + atomic_inc(kvm-online_vcpus);
 +
 + if (kvm-bsp_vcpu_id == id)
 + kvm-bsp_vcpu = vcpu;
 + mutex_unlock(kvm-lock);
   return r;
  
 -unlink:
 - mutex_lock(kvm-lock);
 - kvm-vcpus[n] = NULL;
  vcpu_destroy:
   mutex_unlock(kvm-lock);
   kvm_arch_vcpu_destroy(vcpu);
 @@ -2223,6 +2226,10 @@ static long kvm_vm_ioctl(struct file *filp,
   r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags);
   break;
   }
 + case KVM_SET_BOOT_CPU_ID:
 + kvm-bsp_vcpu_id = arg;
 + r = 0;
 + break;

Don't you also need to update the bsp_vcpu pointer? And aren't these two
(bsp_vcpu pointer and bsp_vcpu_id) somewhat redundant? 

Other than looks fine.

   default:
   r = kvm_arch_vm_ioctl(filp, ioctl, arg);
   }
 @@ -2289,6 +2296,7 @@ static long kvm_dev_ioctl_check_extension_generic(long 
 arg)
   case KVM_CAP_USER_MEMORY:
   case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
   case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
 + case KVM_CAP_SET_BOOT_CPU_ID:
   return 1;
  #ifdef CONFIG_HAVE_KVM_IRQCHIP
   case KVM_CAP_IRQ_ROUTING:
 -- 
 1.6.2.1
 
 --
 To unsubscribe from this list: send the line unsubscribe kvm in
 the body of a message to majord...@vger.kernel.org
 More 

Re: [PATCH 4/4] kvm-s390: streamline memslot handling - v6

2009-06-02 Thread Marcelo Tosatti
On Sun, May 31, 2009 at 11:22:58AM +0300, Avi Kivity wrote:
 ehrha...@linux.vnet.ibm.com wrote:
 From: Christian Ehrhardt ehrha...@linux.vnet.ibm.com

 *updates in v6*
 - ensure the wait_on_bit waiter is notified
 - move the reset of requests to kvm_vcpu_release to drop them early

 *updates in v5*
 - ensure dropping vcpu all requests while freeing a vcpu

 *updates in v4*
 - kickout only scheduled vcpus (its superfluous and wait might hang forever 
 on
   not running vcpus)

   

 v3 is already in (and pushed so I can't unapply), so please rebase on  
 top of current git.

Christian, 

Seems a good step toward further unification. Can you please rebase as
requested? 

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC v3 3/4] Break dependency between vcpu index in vcpus array and vcpu_id.

2009-06-02 Thread Gleb Natapov
On Tue, Jun 02, 2009 at 09:38:08AM -0300, Marcelo Tosatti wrote:
  @@ -2223,6 +2226,10 @@ static long kvm_vm_ioctl(struct file *filp,
  r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags);
  break;
  }
  +   case KVM_SET_BOOT_CPU_ID:
  +   kvm-bsp_vcpu_id = arg;
  +   r = 0;
  +   break;
 
 Don't you also need to update the bsp_vcpu pointer? And aren't these two
Actually I think I need to return an error if userspace tries to change BSP
after BSP cpu is already created.

 (bsp_vcpu pointer and bsp_vcpu_id) somewhat redundant? 
 
Sometimes pointer to bsp vcpu is needed. It can be found looping over all
vcpus, but I prefer to have the pointer handy.

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH RFC] Do not use cpu_index in interface between libkvm and qemu

2009-06-02 Thread Gleb Natapov
On vcpu creation cookie is returned which is used in future communication.

Signed-off-by: Gleb Natapov g...@redhat.com
diff --git a/cpu-defs.h b/cpu-defs.h
index 1e071e7..5f541e0 100644
--- a/cpu-defs.h
+++ b/cpu-defs.h
@@ -147,6 +147,7 @@ struct KVMCPUState {
 int stop;
 int stopped;
 int created;
+void *vcpu_ctx;
 struct qemu_work_item *queued_work_first, *queued_work_last;
 };
 
diff --git a/hw/apic.c b/hw/apic.c
index 86aa6b6..c5d97b2 100644
--- a/hw/apic.c
+++ b/hw/apic.c
@@ -833,7 +833,7 @@ static void kvm_kernel_lapic_save_to_user(APICState *s)
 struct kvm_lapic_state *kapic = apic;
 int i, v;
 
-kvm_get_lapic(kvm_context, s-cpu_env-cpu_index, kapic);
+kvm_get_lapic(s-cpu_env-kvm_cpu_state.vcpu_ctx, kapic);
 
 s-id = kapic_reg(kapic, 0x2)  24;
 s-tpr = kapic_reg(kapic, 0x8);
@@ -886,7 +886,7 @@ static void kvm_kernel_lapic_load_from_user(APICState *s)
 kapic_set_reg(klapic, 0x38, s-initial_count);
 kapic_set_reg(klapic, 0x3e, s-divide_conf);
 
-kvm_set_lapic(kvm_context, s-cpu_env-cpu_index, klapic);
+kvm_set_lapic(s-cpu_env-kvm_cpu_state.vcpu_ctx, klapic);
 }
 
 #endif
diff --git a/kvm-tpr-opt.c b/kvm-tpr-opt.c
index bdbc742..3f388ef 100644
--- a/kvm-tpr-opt.c
+++ b/kvm-tpr-opt.c
@@ -70,7 +70,7 @@ static uint8_t read_byte_virt(CPUState *env, target_ulong 
virt)
 {
 struct kvm_sregs sregs;
 
-kvm_get_sregs(kvm_context, env-cpu_index, sregs);
+kvm_get_sregs(env-kvm_cpu_state.vcpu_ctx, sregs);
 return ldub_phys(map_addr(sregs, virt, NULL));
 }
 
@@ -78,7 +78,7 @@ static void write_byte_virt(CPUState *env, target_ulong virt, 
uint8_t b)
 {
 struct kvm_sregs sregs;
 
-kvm_get_sregs(kvm_context, env-cpu_index, sregs);
+kvm_get_sregs(env-kvm_cpu_state.vcpu_ctx, sregs);
 stb_phys(map_addr(sregs, virt, NULL), b);
 }
 
@@ -86,7 +86,7 @@ static __u64 kvm_rsp_read(CPUState *env)
 {
 struct kvm_regs regs;
 
-kvm_get_regs(kvm_context, env-cpu_index, regs);
+kvm_get_regs(env-kvm_cpu_state.vcpu_ctx, regs);
 return regs.rsp;
 }
 
@@ -192,7 +192,7 @@ static int bios_is_mapped(CPUState *env, uint64_t rip)
 if (bios_enabled)
return 1;
 
-kvm_get_sregs(kvm_context, env-cpu_index, sregs);
+kvm_get_sregs(env-kvm_cpu_state.vcpu_ctx, sregs);
 
 probe = (rip  0xf000) + 0xe;
 phys = map_addr(sregs, probe, perms);
@@ -241,7 +241,7 @@ static int enable_vapic(CPUState *env)
 if (pcr_cpu  0)
return 0;
 
-kvm_enable_vapic(kvm_context, env-cpu_index, vapic_phys + (pcr_cpu  7));
+kvm_enable_vapic(env-kvm_cpu_state.vcpu_ctx, vapic_phys + (pcr_cpu  7));
 cpu_physical_memory_rw(vapic_phys + (pcr_cpu  7) + 4, one, 1, 1);
 bios_enabled = 1;
 
@@ -314,7 +314,7 @@ void kvm_tpr_access_report(CPUState *env, uint64_t rip, int 
is_write)
 
 void kvm_tpr_vcpu_start(CPUState *env)
 {
-kvm_enable_tpr_access_reporting(kvm_context, env-cpu_index);
+kvm_enable_tpr_access_reporting(env-kvm_cpu_state.vcpu_ctx);
 if (bios_enabled)
enable_vapic(env);
 }
@@ -364,7 +364,7 @@ static void vtpr_ioport_write(void *opaque, uint32_t addr, 
uint32_t val)
 struct kvm_sregs sregs;
 uint32_t rip;
 
-kvm_get_regs(kvm_context, env-cpu_index, regs);
+kvm_get_regs(env-kvm_cpu_state.vcpu_ctx, regs);
 rip = regs.rip - 2;
 write_byte_virt(env, rip, 0x66);
 write_byte_virt(env, rip + 1, 0x90);
@@ -372,7 +372,7 @@ static void vtpr_ioport_write(void *opaque, uint32_t addr, 
uint32_t val)
return;
 if (!bios_is_mapped(env, rip))
printf(bios not mapped?\n);
-kvm_get_sregs(kvm_context, env-cpu_index, sregs);
+kvm_get_sregs(env-kvm_cpu_state.vcpu_ctx, sregs);
 for (addr = 0xf000u; addr = 0x8000u; addr -= 4096)
if (map_addr(sregs, addr, NULL) == 0xfee0u) {
real_tpr = addr + 0x80;
diff --git a/libkvm-all.c b/libkvm-all.c
index 1668e32..a826341 100644
--- a/libkvm-all.c
+++ b/libkvm-all.c
@@ -356,10 +356,12 @@ kvm_context_t kvm_init(struct kvm_callbacks *callbacks,
 
 void kvm_finalize(kvm_context_t kvm)
 {
+   /* FIXME
if (kvm-vcpu_fd[0] != -1)
close(kvm-vcpu_fd[0]);
if (kvm-vm_fd != -1)
close(kvm-vm_fd);
+   */
close(kvm-fd);
free(kvm);
 }
@@ -374,32 +376,43 @@ void kvm_disable_pit_creation(kvm_context_t kvm)
kvm-no_pit_creation = 1;
 }
 
-int kvm_create_vcpu(kvm_context_t kvm, int slot)
+kvm_vcpu_context_t kvm_create_vcpu(kvm_context_t kvm, int id)
 {
long mmap_size;
int r;
+   kvm_vcpu_context_t vcpu_ctx = malloc(sizeof(struct kvm_vcpu_context));
 
-   r = ioctl(kvm-vm_fd, KVM_CREATE_VCPU, slot);
+   if (!vcpu_ctx) {
+   errno = ENOMEM;
+   return NULL;
+   }
+
+   vcpu_ctx-kvm = kvm;
+   vcpu_ctx-id = id;
+
+   r = ioctl(kvm-vm_fd, KVM_CREATE_VCPU, id);
if (r == -1) {
-   r = -errno;
fprintf(stderr, 

Re: [PATCH 4/4] kvm-s390: streamline memslot handling - v6

2009-06-02 Thread Christian Ehrhardt

Marcelo Tosatti wrote:

On Sun, May 31, 2009 at 11:22:58AM +0300, Avi Kivity wrote:
  

ehrha...@linux.vnet.ibm.com wrote:


From: Christian Ehrhardt ehrha...@linux.vnet.ibm.com

*updates in v6*
- ensure the wait_on_bit waiter is notified
- move the reset of requests to kvm_vcpu_release to drop them early

*updates in v5*
- ensure dropping vcpu all requests while freeing a vcpu

*updates in v4*
- kickout only scheduled vcpus (its superfluous and wait might hang forever on
  not running vcpus)

  
  
v3 is already in (and pushed so I can't unapply), so please rebase on  
top of current git.



Christian, 


Seems a good step toward further unification. Can you please rebase as
requested? 


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
  
The missing updates are mostly fixes or code merges due to our 
discussions and should be on the list in a few minutes.


--

Grüsse / regards, Christian Ehrhardt
IBM Linux Technology Center, Open Virtualization 


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/3] kvm-s390: infrastructure to kick vcpus out of guest state - rebased

2009-06-02 Thread ehrhardt
From: Christian Ehrhardt ehrha...@linux.vnet.ibm.com

As requested this is a rebased patch on top of the already applied v3
of the patch series.

*updates to already applied version*
- ensure allocations (might_sleep) are out of atomic context
- centralize consumption of vcpu-request bits

To ensure vcpu's come out of guest context in certain cases this patch adds a
s390 specific way to kick them out of guest context. Currently it kicks them
out to rerun the vcpu_run path in the s390 code, but the mechanism itself is
expandable and with a new flag we could also add e.g. kicks to userspace etc.

Signed-off-by: Christian Ehrhardt ehrha...@linux.vnet.ibm.com
---

[diffstat]
 include/asm/kvm_host.h |2 +-
 kvm/intercept.c|   10 ++
 kvm/kvm-s390.c |7 +++
 kvm/kvm-s390.h |   16 +++-
 kvm/sigp.c |   31 +--
 5 files changed, 46 insertions(+), 20 deletions(-)

Index: kvm/arch/s390/kvm/intercept.c
===
--- kvm.orig/arch/s390/kvm/intercept.c
+++ kvm/arch/s390/kvm/intercept.c
@@ -141,10 +141,12 @@ static int handle_stop(struct kvm_vcpu *
rc = -ENOTSUPP;
}
 
-   if (vcpu-arch.local_int.action_bits  ACTION_RELOADVCPU_ON_STOP) {
-   vcpu-arch.local_int.action_bits = ~ACTION_RELOADVCPU_ON_STOP;
-   rc = SIE_INTERCEPT_RERUNVCPU;
-   vcpu-run-exit_reason = KVM_EXIT_INTR;
+   if (vcpu-arch.local_int.action_bits  ACTION_VCPUREQUEST_ON_STOP) {
+   vcpu-arch.local_int.action_bits = ~ACTION_VCPUREQUEST_ON_STOP;
+   if (kvm_s390_handle_vcpu_requests(vcpu, VCPUREQUESTLVL_SIGP)) {
+   rc = SIE_INTERCEPT_CHECKREQUESTS;
+   vcpu-run-exit_reason = KVM_EXIT_INTR;
+   }
}
 
if (vcpu-arch.local_int.action_bits  ACTION_STOP_ON_STOP) {
Index: kvm/arch/s390/include/asm/kvm_host.h
===
--- kvm.orig/arch/s390/include/asm/kvm_host.h
+++ kvm/arch/s390/include/asm/kvm_host.h
@@ -182,7 +182,7 @@ struct kvm_s390_interrupt_info {
 /* for local_interrupt.action_flags */
 #define ACTION_STORE_ON_STOP   (10)
 #define ACTION_STOP_ON_STOP(11)
-#define ACTION_RELOADVCPU_ON_STOP  (12)
+#define ACTION_VCPUREQUEST_ON_STOP (12)
 
 struct kvm_s390_local_interrupt {
spinlock_t lock;
Index: kvm/arch/s390/kvm/kvm-s390.c
===
--- kvm.orig/arch/s390/kvm/kvm-s390.c
+++ kvm/arch/s390/kvm/kvm-s390.c
@@ -484,8 +484,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_v
 
 rerun_vcpu:
if (vcpu-requests)
-   if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, vcpu-requests))
-   kvm_s390_vcpu_set_mem(vcpu);
+   kvm_s390_handle_vcpu_requests(vcpu, VCPUREQUESTLVL_VCPURUN);
 
/* verify, that memory has been registered */
if (!vcpu-arch.sie_block-gmslm) {
@@ -521,7 +520,7 @@ rerun_vcpu:
rc = kvm_handle_sie_intercept(vcpu);
} while (!signal_pending(current)  !rc);
 
-   if (rc == SIE_INTERCEPT_RERUNVCPU)
+   if (rc == SIE_INTERCEPT_CHECKREQUESTS)
goto rerun_vcpu;
 
if (signal_pending(current)  !rc) {
@@ -710,7 +709,7 @@ int kvm_arch_set_memory_region(struct kv
kvm-vcpus[i]-requests))
continue;
kvm_s390_inject_sigp_stop(kvm-vcpus[i],
- ACTION_RELOADVCPU_ON_STOP);
+ ACTION_VCPUREQUEST_ON_STOP);
}
}
 
Index: kvm/arch/s390/kvm/kvm-s390.h
===
--- kvm.orig/arch/s390/kvm/kvm-s390.h
+++ kvm/arch/s390/kvm/kvm-s390.h
@@ -25,7 +25,7 @@
 typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu);
 
 /* negativ values are error codes, positive values for internal conditions */
-#define SIE_INTERCEPT_RERUNVCPU(10)
+#define SIE_INTERCEPT_CHECKREQUESTS(10)
 int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu);
 
 #define VM_EVENT(d_kvm, d_loglevel, d_string, d_args...)\
@@ -81,6 +81,20 @@ static inline void kvm_s390_vcpu_set_mem
up_read(vcpu-kvm-slots_lock);
 }
 
+/* interception levels from which handle vcpu requests can be called */
+#define VCPUREQUESTLVL_SIGP1
+#define VCPUREQUESTLVL_VCPURUN 2
+static inline unsigned long kvm_s390_handle_vcpu_requests(struct kvm_vcpu 
*vcpu,
+  int level)
+{
+   BUG_ON(!level);
+
+   if (!vcpu-requests)
+   return 0;
+
+   return vcpu-requests;
+}
+
 /* implemented in priv.c */
 int kvm_s390_handle_b2(struct kvm_vcpu *vcpu);
 
Index: kvm/arch/s390/kvm/sigp.c

[PATCH 0/3] kvm-s390: revised version of kvm-s390 guest memory handling - rebased

2009-06-02 Thread ehrhardt
From: Christian Ehrhardt ehrha...@de.ibm.com

As requested this is a rebased patch on top of the already applied v3
of the patch series.

*updates to already applied version*
- ensure the wait_on_bit waiter is notified
- move the reset of requests to kvm_vcpu_release to drop them early
- ensure dropping all vcpu requests while freeing a vcpu
- ensure kick allocations (might_sleep) are out of atomic context
- update vcpu-cpu in kvm-s390 arch handler for load/put
- centralize consumption of vcpu-request bits
- updates on running vcpus can now be handled without need to rerun the vcpu
- kvm_arch_set_memory_region waits until the bit is consumed by the vcpu
- kickout only scheduled vcpus (wait might hang forever on non-scheduled vcpus)

Note: further unification of make_all_cpu_request and the kick mechanism is
planned, but it might be good to split it from this step towards commonality.

Patches included:
Subject: [PATCH 1/3] kvm-s390: infrastructure to kick vcpus out of guest state 
- rebased
Subject: [PATCH 2/3] kvm-s390: update vcpu-cpu - rebased
Subject: [PATCH 3/3] kvm-s390: streamline memslot handling - rebased

Overall-Diffstat:
 arch/s390/include/asm/kvm_host.h |2 +-
 arch/s390/kvm/intercept.c|   10 ++
 arch/s390/kvm/kvm-s390.c |   36 +---
 arch/s390/kvm/kvm-s390.h |   23 ++-
 arch/s390/kvm/sigp.c |   31 +--
 virt/kvm/kvm_main.c  |4 
 6 files changed, 79 insertions(+), 27 deletions(-)
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/3] kvm-s390: streamline memslot handling - rebased

2009-06-02 Thread ehrhardt
From: Christian Ehrhardt ehrha...@linux.vnet.ibm.com

As requested this is a rebased patch on top of the already applied v3
of the patch series.

*updates to applied version*
- ensure the wait_on_bit waiter is notified
- ensure dropping vcpu all requests while freeing a vcpu
- kickout only scheduled vcpus (its superfluous and wait might hang forever on
  not running vcpus)
- kvm_arch_set_memory_region waits until the bit is consumed by the vcpu

This patch relocates the variables kvm-s390 uses to track guest mem addr/size.
As discussed dropping the variables at struct kvm_arch level allows to use the
common vcpu-request based mechanism to reload guest memory if e.g. changes
via set_memory_region.
The kick mechanism introduced in this series is used to ensure running vcpus
leave guest state to catch the update.


Signed-off-by: Christian Ehrhardt ehrha...@linux.vnet.ibm.com
---

[diffstat]
 arch/s390/kvm/kvm-s390.c |   27 ---
 arch/s390/kvm/kvm-s390.h |7 +++
 virt/kvm/kvm_main.c  |4 
 3 files changed, 31 insertions(+), 7 deletions(-)

Index: kvm/arch/s390/kvm/kvm-s390.c
===
--- kvm.orig/arch/s390/kvm/kvm-s390.c
+++ kvm/arch/s390/kvm/kvm-s390.c
@@ -674,6 +674,12 @@ long kvm_arch_vcpu_ioctl(struct file *fi
return -EINVAL;
 }
 
+static int wait_bit_schedule(void *word)
+{
+   schedule();
+   return 0;
+}
+
 /* Section: memory related */
 int kvm_arch_set_memory_region(struct kvm *kvm,
struct kvm_userspace_memory_region *mem,
@@ -681,6 +687,7 @@ int kvm_arch_set_memory_region(struct kv
int user_alloc)
 {
int i;
+   struct kvm_vcpu *vcpu;
 
/* A few sanity checks. We can have exactly one memory slot which has
   to start at guest virtual zero and which has to be located at a
@@ -706,13 +713,19 @@ int kvm_arch_set_memory_region(struct kv
 
/* request update of sie control block for all available vcpus */
for (i = 0; i  KVM_MAX_VCPUS; ++i) {
-   if (kvm-vcpus[i]) {
-   if (test_and_set_bit(KVM_REQ_MMU_RELOAD,
-   kvm-vcpus[i]-requests))
-   continue;
-   kvm_s390_inject_sigp_stop(kvm-vcpus[i],
- ACTION_VCPUREQUEST_ON_STOP);
-   }
+   vcpu = kvm-vcpus[i];
+   if (!vcpu)
+   continue;
+
+   if (!test_and_set_bit(KVM_REQ_MMU_RELOAD, vcpu-requests))
+   continue;
+
+   if (vcpu-cpu == -1)
+   continue;
+
+   kvm_s390_inject_sigp_stop(vcpu, ACTION_VCPUREQUEST_ON_STOP);
+   wait_on_bit(vcpu-requests, KVM_REQ_MMU_RELOAD,
+   wait_bit_schedule, TASK_UNINTERRUPTIBLE);
}
 
return 0;
Index: kvm/arch/s390/kvm/kvm-s390.h
===
--- kvm.orig/arch/s390/kvm/kvm-s390.h
+++ kvm/arch/s390/kvm/kvm-s390.h
@@ -92,6 +92,13 @@ static inline unsigned long kvm_s390_han
if (!vcpu-requests)
return 0;
 
+   /* requests that can be handled at all levels */
+   if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, vcpu-requests)) {
+   smp_mb__after_clear_bit();
+   wake_up_bit(vcpu-requests, KVM_REQ_MMU_RELOAD);
+   kvm_s390_vcpu_set_mem(vcpu);
+   }
+
return vcpu-requests;
 }
 
Index: kvm/virt/kvm/kvm_main.c
===
--- kvm.orig/virt/kvm/kvm_main.c
+++ kvm/virt/kvm/kvm_main.c
@@ -1682,6 +1682,10 @@ static int kvm_vcpu_release(struct inode
 {
struct kvm_vcpu *vcpu = filp-private_data;
 
+   clear_bit(KVM_REQ_MMU_RELOAD, vcpu-requests);
+   smp_mb__after_clear_bit();
+   wake_up_bit(vcpu-requests, KVM_REQ_MMU_RELOAD);
+
kvm_put_kvm(vcpu-kvm);
return 0;
 }
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/3] kvm-s390: update vcpu-cpu - rebased

2009-06-02 Thread ehrhardt
From: Christian Ehrhardt ehrha...@linux.vnet.ibm.com

As requested this is a rebased patch on top of the already applied v3
of the patch series.

kvm on s390 formerly ignored vcpu-cpu.
This patch adds set/unset vcpu-cpu in kvm_arch_vcpu_load/put to allow
further architecture unification e.g. let generic code not find -1 on
currently scheduled vcpus.

Signed-off-by: Christian Ehrhardt ehrha...@linux.vnet.ibm.com
---

[diffstat]
 kvm-s390.c |2 ++
 1 file changed, 2 insertions(+)

[diff]
Index: kvm/arch/s390/kvm/kvm-s390.c
===
--- kvm.orig/arch/s390/kvm/kvm-s390.c
+++ kvm/arch/s390/kvm/kvm-s390.c
@@ -244,6 +244,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcp
 
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
+   vcpu-cpu = cpu;
save_fp_regs(vcpu-arch.host_fpregs);
save_access_regs(vcpu-arch.host_acrs);
vcpu-arch.guest_fpregs.fpc = FPC_VALID_MASK;
@@ -253,6 +254,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu 
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
+   vcpu-cpu = -1;
save_fp_regs(vcpu-arch.guest_fpregs);
save_access_regs(vcpu-arch.guest_acrs);
restore_fp_regs(vcpu-arch.host_fpregs);
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v7 1/2] qemu-kvm: add irqfd support

2009-06-02 Thread Gregory Haskins
Avi Kivity wrote:
 Gregory Haskins wrote:
 irqfd lets you create an eventfd based file-desriptor to inject
 interrupts
 to a kvm guest.  We associate one gsi per fd for fine-grained routing.

   

 Sorry, getting to userspace a little late in the game.
Np.. I need to re-work this for the POLLHUP patch I am about to push anyway.


 @@ -34,6 +34,7 @@
  #include string.h
  #include errno.h
  #include sys/ioctl.h
 +#include sys/eventfd.h
   

 This only made it in 2.6.22.  So it needs a ./configure -time check
 for existence.

I could also just put this within the #ifdef KVM_CAP_IRQFD?



 +int kvm_create_irqfd(kvm_context_t kvm, int gsi, int flags)
 +{
 +return -ENOENT;
 +}
 +
 +int kvm_destroy_irqfd(kvm_context_t kvm, int fd, int flags)
 +{
 +return -ENOENT;
 +}
   

 -ENOSYS is more traditional.

Ack.

-Greg





signature.asc
Description: OpenPGP digital signature


[PATCHv2 00/13] qemu: MSI-X support

2009-06-02 Thread Michael S. Tsirkin
Resending. Incorporated a minor fix pointed out by Isaku Yamahata.

Here is the port of MSI-X support patches to upstream qemu.
Please comment or commit.

This patchset adds generic support for MSI-X, adds implementation in
APIC, and uses MSI-X in virtio-net. At Paul's suggestion, I use stl_phy
to decouple APIC and MSI-X implementation.

This uses the mask table patch that I posted previously, and which is
now included in the series.

-- 
MST

Michael S. Tsirkin (13):
  qemu: make default_write_config use mask table
  qemu: capability bits in pci save/restore
  qemu: add routines to manage PCI capabilities
  qemu: helper routines for pci access.
  qemu: MSI-X support functions
  qemu: add flag to disable MSI-X by default
  qemu: minimal MSI/MSI-X implementation for PC
  qemu: add support for resizing regions
  qemu: virtio support for many interrupt vectors
  qemu: MSI-X support in virtio PCI
  qemu: request 3 vectors in virtio-net
  qemu: virtio save/load bindings
  qemu: add pci_get/set_byte

 Makefile.target|2 +-
 hw/apic.c  |   50 ++-
 hw/msix.c  |  426 
 hw/msix.h  |   35 +
 hw/pci.c   |  295 +++-
 hw/pci.h   |  105 -
 hw/syborg_virtio.c |   13 ++-
 hw/virtio-net.c|1 +
 hw/virtio-pci.c|  215 +-
 hw/virtio.c|   70 ++---
 hw/virtio.h|   14 ++-
 qemu-options.hx|2 +
 vl.c   |3 +
 13 files changed, 1017 insertions(+), 214 deletions(-)
 create mode 100644 hw/msix.c
 create mode 100644 hw/msix.h
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCHv2 01/13] qemu: make default_write_config use mask table

2009-06-02 Thread Michael S. Tsirkin
Change much of hw/pci to use symbolic constants and a table-driven
design: add a mask table with writable bits set and readonly bits unset.
Detect change by comparing original and new registers.

This makes it easy to support capabilities where read-only/writeable
bit layout differs between devices, depending on capabilities present.

As a result, writing a single byte in BAR registers now works as
it should. Writing to upper limit registers in the bridge
also works as it should. Code is also shorter.

Signed-off-by: Michael S. Tsirkin m...@redhat.com
---
 hw/pci.c |  145 -
 hw/pci.h |   18 +++-
 2 files changed, 46 insertions(+), 117 deletions(-)

diff --git a/hw/pci.c b/hw/pci.c
index 0ab5b94..1e70e46 100644
--- a/hw/pci.c
+++ b/hw/pci.c
@@ -239,6 +239,17 @@ int pci_assign_devaddr(const char *addr, int *domp, int 
*busp, unsigned *slotp)
 return pci_parse_devaddr(devaddr, domp, busp, slotp);
 }
 
+static void pci_init_mask(PCIDevice *dev)
+{
+int i;
+dev-mask[PCI_CACHE_LINE_SIZE] = 0xff;
+dev-mask[PCI_INTERRUPT_LINE] = 0xff;
+dev-mask[PCI_COMMAND] = PCI_COMMAND_IO | PCI_COMMAND_MEMORY
+ | PCI_COMMAND_MASTER;
+for (i = PCI_CONFIG_HEADER_SIZE; i  PCI_CONFIG_SPACE_SIZE; ++i)
+dev-mask[i] = 0xff;
+}
+
 /* -1 for devfn means auto assign */
 static PCIDevice *do_pci_register_device(PCIDevice *pci_dev, PCIBus *bus,
  const char *name, int devfn,
@@ -261,6 +272,7 @@ static PCIDevice *do_pci_register_device(PCIDevice 
*pci_dev, PCIBus *bus,
 pstrcpy(pci_dev-name, sizeof(pci_dev-name), name);
 memset(pci_dev-irq_state, 0, sizeof(pci_dev-irq_state));
 pci_set_default_subsystem_id(pci_dev);
+pci_init_mask(pci_dev);
 
 if (!config_read)
 config_read = pci_default_read_config;
@@ -334,6 +346,7 @@ void pci_register_io_region(PCIDevice *pci_dev, int 
region_num,
 {
 PCIIORegion *r;
 uint32_t addr;
+uint32_t mask;
 
 if ((unsigned int)region_num = PCI_NUM_REGIONS)
 return;
@@ -349,12 +362,17 @@ void pci_register_io_region(PCIDevice *pci_dev, int 
region_num,
 r-size = size;
 r-type = type;
 r-map_func = map_func;
+
+mask = ~(size - 1);
 if (region_num == PCI_ROM_SLOT) {
 addr = 0x30;
+/* ROM enable bit is writeable */
+mask |= 1;
 } else {
 addr = 0x10 + region_num * 4;
 }
 *(uint32_t *)(pci_dev-config + addr) = cpu_to_le32(type);
+*(uint32_t *)(pci_dev-mask + addr) = cpu_to_le32(mask);
 }
 
 static void pci_update_mappings(PCIDevice *d)
@@ -463,118 +481,21 @@ uint32_t pci_default_read_config(PCIDevice *d,
 return val;
 }
 
-void pci_default_write_config(PCIDevice *d,
-  uint32_t address, uint32_t val, int len)
+void pci_default_write_config(PCIDevice *d, uint32_t addr, uint32_t val, int l)
 {
-int can_write, i;
-uint32_t end, addr;
-
-if (len == 4  ((address = 0x10  address  0x10 + 4 * 6) ||
- (address = 0x30  address  0x34))) {
-PCIIORegion *r;
-int reg;
+uint8_t orig[PCI_CONFIG_SPACE_SIZE];
+int i;
 
-if ( address = 0x30 ) {
-reg = PCI_ROM_SLOT;
-}else{
-reg = (address - 0x10)  2;
-}
-r = d-io_regions[reg];
-if (r-size == 0)
-goto default_config;
-/* compute the stored value */
-if (reg == PCI_ROM_SLOT) {
-/* keep ROM enable bit */
-val = (~(r-size - 1)) | 1;
-} else {
-val = ~(r-size - 1);
-val |= r-type;
-}
-*(uint32_t *)(d-config + address) = cpu_to_le32(val);
-pci_update_mappings(d);
-return;
-}
- default_config:
 /* not efficient, but simple */
-addr = address;
-for(i = 0; i  len; i++) {
-/* default read/write accesses */
-switch(d-config[0x0e]) {
-case 0x00:
-case 0x80:
-switch(addr) {
-case 0x00:
-case 0x01:
-case 0x02:
-case 0x03:
-case 0x06:
-case 0x07:
-case 0x08:
-case 0x09:
-case 0x0a:
-case 0x0b:
-case 0x0e:
-case 0x10 ... 0x27: /* base */
-case 0x2c ... 0x2f: /* read-only subsystem ID  vendor ID */
-case 0x30 ... 0x33: /* rom */
-case 0x3d:
-can_write = 0;
-break;
-default:
-can_write = 1;
-break;
-}
-break;
-default:
-case 0x01:
-switch(addr) {
-case 0x00:
-case 0x01:
-case 0x02:
-case 0x03:
-case 0x06:
-case 0x07:
-case 0x08:
-case 0x09:
-case 0x0a:
-case 0x0b:
-case 

[PATCHv2 03/13] qemu: add routines to manage PCI capabilities

2009-06-02 Thread Michael S. Tsirkin
Add routines to manage PCI capability list. First user will be MSI-X.

Signed-off-by: Michael S. Tsirkin m...@redhat.com
---
 hw/pci.c |   98 --
 hw/pci.h |   18 +++-
 2 files changed, 106 insertions(+), 10 deletions(-)

diff --git a/hw/pci.c b/hw/pci.c
index 5dcfb4e..31ba2ed 100644
--- a/hw/pci.c
+++ b/hw/pci.c
@@ -130,12 +130,13 @@ void pci_device_save(PCIDevice *s, QEMUFile *f)
 int version = s-cap_present ? 3 : 2;
 int i;
 
-qemu_put_be32(f, version); /* PCI device version */
+/* PCI device version and capabilities */
+qemu_put_be32(f, version);
+if (version = 3)
+qemu_put_be32(f, s-cap_present);
 qemu_put_buffer(f, s-config, 256);
 for (i = 0; i  4; i++)
 qemu_put_be32(f, s-irq_state[i]);
-if (version = 3)
-qemu_put_be32(f, s-cap_present);
 }
 
 int pci_device_load(PCIDevice *s, QEMUFile *f)
@@ -146,12 +147,6 @@ int pci_device_load(PCIDevice *s, QEMUFile *f)
 version_id = qemu_get_be32(f);
 if (version_id  3)
 return -EINVAL;
-qemu_get_buffer(f, s-config, 256);
-pci_update_mappings(s);
-
-if (version_id = 2)
-for (i = 0; i  4; i ++)
-s-irq_state[i] = qemu_get_be32(f);
 if (version_id = 3)
 s-cap_present = qemu_get_be32(f);
 else
@@ -160,6 +155,18 @@ int pci_device_load(PCIDevice *s, QEMUFile *f)
 if (s-cap_present  ~s-cap_supported)
 return -EINVAL;
 
+qemu_get_buffer(f, s-config, 256);
+pci_update_mappings(s);
+
+if (version_id = 2)
+for (i = 0; i  4; i ++)
+s-irq_state[i] = qemu_get_be32(f);
+/* Clear mask and used bits for capabilities.
+   Must be restored separately, since capabilities can
+   be placed anywhere in config space. */
+memset(s-used, 0, PCI_CONFIG_SPACE_SIZE);
+for (i = PCI_CONFIG_HEADER_SIZE; i  PCI_CONFIG_SPACE_SIZE; ++i)
+s-mask[i] = 0xff;
 return 0;
 }
 
@@ -870,3 +877,76 @@ PCIDevice *pci_create_simple(PCIBus *bus, int devfn, const 
char *name)
 
 return (PCIDevice *)dev;
 }
+
+static int pci_find_space(PCIDevice *pdev, uint8_t size)
+{
+int offset = PCI_CONFIG_HEADER_SIZE;
+int i;
+for (i = PCI_CONFIG_HEADER_SIZE; i  PCI_CONFIG_SPACE_SIZE; ++i)
+if (pdev-used[i])
+offset = i + 1;
+else if (i - offset + 1 == size)
+return offset;
+return 0;
+}
+
+static uint8_t pci_find_capability_list(PCIDevice *pdev, uint8_t cap_id,
+uint8_t *prev_p)
+{
+uint8_t next, prev;
+
+if (!(pdev-config[PCI_STATUS]  PCI_STATUS_CAP_LIST))
+return 0;
+
+for (prev = PCI_CAPABILITY_LIST; (next = pdev-config[prev]);
+ prev = next + PCI_CAP_LIST_NEXT)
+if (pdev-config[next + PCI_CAP_LIST_ID] == cap_id)
+break;
+
+*prev_p = prev;
+return next;
+}
+
+/* Reserve space and add capability to the linked list in pci config space */
+int pci_add_capability(PCIDevice *pdev, uint8_t cap_id, uint8_t size)
+{
+uint8_t offset = pci_find_space(pdev, size);
+uint8_t *config = pdev-config + offset;
+if (!offset)
+return -ENOSPC;
+config[PCI_CAP_LIST_ID] = cap_id;
+config[PCI_CAP_LIST_NEXT] = pdev-config[PCI_CAPABILITY_LIST];
+pdev-config[PCI_CAPABILITY_LIST] = offset;
+pdev-config[PCI_STATUS] |= PCI_STATUS_CAP_LIST;
+memset(pdev-used + offset, 0xFF, size);
+/* Make capability read-only by default */
+memset(pdev-mask + offset, 0, size);
+return offset;
+}
+
+/* Unlink capability from the pci config space. */
+void pci_del_capability(PCIDevice *pdev, uint8_t cap_id, uint8_t size)
+{
+uint8_t prev, offset = pci_find_capability_list(pdev, cap_id, prev);
+if (!offset)
+return;
+pdev-config[prev] = pdev-config[offset + PCI_CAP_LIST_NEXT];
+/* Make capability writeable again */
+memset(pdev-mask + offset, 0xff, size);
+memset(pdev-used + offset, 0, size);
+
+if (!pdev-config[PCI_CAPABILITY_LIST])
+pdev-config[PCI_STATUS] = ~PCI_STATUS_CAP_LIST;
+}
+
+/* Reserve space for capability at a known offset (to call after load). */
+void pci_reserve_capability(PCIDevice *pdev, uint8_t offset, uint8_t size)
+{
+memset(pdev-used + offset, 0xff, size);
+}
+
+uint8_t pci_find_capability(PCIDevice *pdev, uint8_t cap_id)
+{
+uint8_t prev;
+return pci_find_capability_list(pdev, cap_id, prev);
+}
diff --git a/hw/pci.h b/hw/pci.h
index 9139504..40137c6 100644
--- a/hw/pci.h
+++ b/hw/pci.h
@@ -123,6 +123,10 @@ typedef struct PCIIORegion {
 #define PCI_MIN_GNT0x3e/* 8 bits */
 #define PCI_MAX_LAT0x3f/* 8 bits */
 
+/* Capability lists */
+#define PCI_CAP_LIST_ID0   /* Capability ID */
+#define PCI_CAP_LIST_NEXT  1   /* Next capability in the list */
+
 #define PCI_REVISION0x08/* obsolete, use PCI_REVISION_ID */
 #define PCI_SUBVENDOR_ID0x2c   

[PATCHv2 04/13] qemu: helper routines for pci access.

2009-06-02 Thread Michael S. Tsirkin
Add inline routines for convenient access to pci devices
with correct (little) endianness. Will be used by MSI-X support.

Signed-off-by: Michael S. Tsirkin m...@redhat.com
---
 hw/pci.h |   30 +++---
 1 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/hw/pci.h b/hw/pci.h
index 40137c6..4e112a3 100644
--- a/hw/pci.h
+++ b/hw/pci.h
@@ -240,21 +240,45 @@ PCIBus *pci_bridge_init(PCIBus *bus, int devfn, uint16_t 
vid, uint16_t did,
 pci_map_irq_fn map_irq, const char *name);
 
 static inline void
+pci_set_word(uint8_t *config, uint16_t val)
+{
+cpu_to_le16wu((uint16_t *)config, val);
+}
+
+static inline uint16_t
+pci_get_word(uint8_t *config)
+{
+return le16_to_cpupu((uint16_t *)config);
+}
+
+static inline void
+pci_set_long(uint8_t *config, uint32_t val)
+{
+cpu_to_le32wu((uint32_t *)config, val);
+}
+
+static inline uint32_t
+pci_get_long(uint8_t *config)
+{
+return le32_to_cpupu((uint32_t *)config);
+}
+
+static inline void
 pci_config_set_vendor_id(uint8_t *pci_config, uint16_t val)
 {
-cpu_to_le16wu((uint16_t *)pci_config[PCI_VENDOR_ID], val);
+pci_set_word(pci_config[PCI_VENDOR_ID], val);
 }
 
 static inline void
 pci_config_set_device_id(uint8_t *pci_config, uint16_t val)
 {
-cpu_to_le16wu((uint16_t *)pci_config[PCI_DEVICE_ID], val);
+pci_set_word(pci_config[PCI_DEVICE_ID], val);
 }
 
 static inline void
 pci_config_set_class(uint8_t *pci_config, uint16_t val)
 {
-cpu_to_le16wu((uint16_t *)pci_config[PCI_CLASS_DEVICE], val);
+pci_set_word(pci_config[PCI_CLASS_DEVICE], val);
 }
 
 typedef void (*pci_qdev_initfn)(PCIDevice *dev);
-- 
1.6.3.1.56.g79e1.dirty

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCHv2 05/13] qemu: MSI-X support functions

2009-06-02 Thread Michael S. Tsirkin
Add functions implementing MSI-X support. First user will be virtio-pci.
Note that platform must set a flag to declare MSI supported.
For PC this will be set by APIC.

Signed-off-by: Michael S. Tsirkin m...@redhat.com
---
 Makefile.target |2 +-
 hw/msix.c   |  423 +++
 hw/msix.h   |   35 +
 hw/pci.h|   20 +++
 4 files changed, 479 insertions(+), 1 deletions(-)
 create mode 100644 hw/msix.c
 create mode 100644 hw/msix.h

diff --git a/Makefile.target b/Makefile.target
index 664a1e3..87b2859 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -486,7 +486,7 @@ endif #CONFIG_BSD_USER
 ifndef CONFIG_USER_ONLY
 
 OBJS=vl.o osdep.o monitor.o pci.o loader.o isa_mmio.o machine.o \
- gdbstub.o gdbstub-xml.o
+ gdbstub.o gdbstub-xml.o msix.o
 # virtio has to be here due to weird dependency between PCI and virtio-net.
 # need to fix this properly
 OBJS+=virtio-blk.o virtio-balloon.o virtio-net.o virtio-console.o
diff --git a/hw/msix.c b/hw/msix.c
new file mode 100644
index 000..4db2fc1
--- /dev/null
+++ b/hw/msix.c
@@ -0,0 +1,423 @@
+/*
+ * MSI-X device support
+ *
+ * This module includes support for MSI-X in pci devices.
+ *
+ * Author: Michael S. Tsirkin m...@redhat.com
+ *
+ *  Copyright (c) 2009, Red Hat Inc, Michael S. Tsirkin (m...@redhat.com)
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ */
+
+#include hw.h
+#include msix.h
+#include pci.h
+
+/* Declaration from linux/pci_regs.h */
+#define  PCI_CAP_ID_MSIX 0x11 /* MSI-X */
+#define  PCI_MSIX_FLAGS 2 /* Table at lower 11 bits */
+#define  PCI_MSIX_FLAGS_QSIZE  0x7FF
+#define  PCI_MSIX_FLAGS_ENABLE (1  15)
+#define  PCI_MSIX_FLAGS_BIRMASK(7  0)
+
+/* MSI-X capability structure */
+#define MSIX_TABLE_OFFSET 4
+#define MSIX_PBA_OFFSET 8
+#define MSIX_CAP_LENGTH 12
+
+/* MSI enable bit is in byte 1 in FLAGS register */
+#define MSIX_ENABLE_OFFSET (PCI_MSIX_FLAGS + 1)
+#define MSIX_ENABLE_MASK (PCI_MSIX_FLAGS_ENABLE  8)
+
+/* MSI-X table format */
+#define MSIX_MSG_ADDR 0
+#define MSIX_MSG_UPPER_ADDR 4
+#define MSIX_MSG_DATA 8
+#define MSIX_VECTOR_CTRL 12
+#define MSIX_ENTRY_SIZE 16
+#define MSIX_VECTOR_MASK 0x1
+
+/* How much space does an MSIX table need. */
+/* The spec requires giving the table structure
+ * a 4K aligned region all by itself. Align it to
+ * target pages so that drivers can do passthrough
+ * on the rest of the region. */
+#define MSIX_PAGE_SIZE TARGET_PAGE_ALIGN(0x1000)
+/* Reserve second half of the page for pending bits */
+#define MSIX_PAGE_PENDING (MSIX_PAGE_SIZE / 2)
+#define MSIX_MAX_ENTRIES 32
+
+
+#ifdef MSIX_DEBUG
+#define DEBUG(fmt, ...)   \
+do {  \
+  fprintf(stderr, %s:  fmt, __func__ , __VA_ARGS__);\
+} while (0)
+#else
+#define DEBUG(fmt, ...) do { } while(0)
+#endif
+
+/* Flag to globally disable MSI-X support */
+int msix_disable;
+
+/* Flag for interrupt controller to declare MSI-X support */
+int msix_supported;
+
+/* Add MSI-X capability to the config space for the device. */
+/* Given a bar and its size, add MSI-X table on top of it
+ * and fill MSI-X capability in the config space.
+ * Original bar size must be a power of 2 or 0.
+ * New bar size is returned. */
+static int msix_add_config(struct PCIDevice *pdev, unsigned short nentries,
+   unsigned bar_nr, unsigned bar_size)
+{
+int config_offset;
+uint8_t *config;
+uint32_t new_size;
+
+if (nentries  1 || nentries  PCI_MSIX_FLAGS_QSIZE + 1)
+return -EINVAL;
+if (bar_size  0x8000)
+return -ENOSPC;
+
+/* Add space for MSI-X structures */
+if (!bar_size)
+new_size = MSIX_PAGE_SIZE;
+else if (bar_size  MSIX_PAGE_SIZE) {
+bar_size = MSIX_PAGE_SIZE;
+new_size = MSIX_PAGE_SIZE * 2;
+} else
+new_size = bar_size * 2;
+
+pdev-msix_bar_size = new_size;
+config_offset = pci_add_capability(pdev, PCI_CAP_ID_MSIX, MSIX_CAP_LENGTH);
+if (config_offset  0)
+return config_offset;
+config = pdev-config + config_offset;
+
+pci_set_word(config + PCI_MSIX_FLAGS, nentries - 1);
+/* Table on top of BAR */
+pci_set_long(config + MSIX_TABLE_OFFSET, bar_size | bar_nr);
+/* Pending bits on top of that */
+pci_set_long(config + MSIX_PBA_OFFSET, (bar_size + MSIX_PAGE_PENDING) |
+ bar_nr);
+pdev-msix_cap = config_offset;
+/* Make flags bit writeable. */
+pdev-mask[config_offset + MSIX_ENABLE_OFFSET] |= MSIX_ENABLE_MASK;
+return 0;
+}
+
+static void msix_free_irq_entries(PCIDevice *dev)
+{
+int vector;
+
+for (vector = 0; vector  dev-msix_entries_nr; ++vector)
+dev-msix_entry_used[vector] = 0;
+}
+
+/* Handle MSI-X capability config write. */
+void msix_write_config(PCIDevice *dev, uint32_t addr,
+ 

[PATCHv2 02/13] qemu: capability bits in pci save/restore

2009-06-02 Thread Michael S. Tsirkin
Add support for capability bits in save/restore for pci.
These will be used for MSI, where the capability might
be present or not as requested by user, which does not
map well into a single version number.

Signed-off-by: Michael S. Tsirkin m...@redhat.com
---
 hw/pci.c |   14 --
 hw/pci.h |4 
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/hw/pci.c b/hw/pci.c
index 1e70e46..5dcfb4e 100644
--- a/hw/pci.c
+++ b/hw/pci.c
@@ -127,12 +127,15 @@ int pci_bus_num(PCIBus *s)
 
 void pci_device_save(PCIDevice *s, QEMUFile *f)
 {
+int version = s-cap_present ? 3 : 2;
 int i;
 
-qemu_put_be32(f, 2); /* PCI device version */
+qemu_put_be32(f, version); /* PCI device version */
 qemu_put_buffer(f, s-config, 256);
 for (i = 0; i  4; i++)
 qemu_put_be32(f, s-irq_state[i]);
+if (version = 3)
+qemu_put_be32(f, s-cap_present);
 }
 
 int pci_device_load(PCIDevice *s, QEMUFile *f)
@@ -141,7 +144,7 @@ int pci_device_load(PCIDevice *s, QEMUFile *f)
 int i;
 
 version_id = qemu_get_be32(f);
-if (version_id  2)
+if (version_id  3)
 return -EINVAL;
 qemu_get_buffer(f, s-config, 256);
 pci_update_mappings(s);
@@ -149,6 +152,13 @@ int pci_device_load(PCIDevice *s, QEMUFile *f)
 if (version_id = 2)
 for (i = 0; i  4; i ++)
 s-irq_state[i] = qemu_get_be32(f);
+if (version_id = 3)
+s-cap_present = qemu_get_be32(f);
+else
+s-cap_present = 0;
+
+if (s-cap_present  ~s-cap_supported)
+return -EINVAL;
 
 return 0;
 }
diff --git a/hw/pci.h b/hw/pci.h
index c0c2380..9139504 100644
--- a/hw/pci.h
+++ b/hw/pci.h
@@ -178,6 +178,10 @@ struct PCIDevice {
 
 /* Current IRQ levels.  Used internally by the generic PCI code.  */
 int irq_state[4];
+
+/* Capability bits for save/load */
+uint32_t cap_supported;
+uint32_t cap_present;
 };
 
 PCIDevice *pci_register_device(PCIBus *bus, const char *name,
-- 
1.6.3.1.56.g79e1.dirty

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCHv2 06/13] qemu: add flag to disable MSI-X by default

2009-06-02 Thread Michael S. Tsirkin
Add global flag to disable MSI-X by default.  This is useful primarily
to make images loadable by older qemu (without msix).  Even when MSI-X
is disabled by flag, you can still load images that have MSI-X enabled.

Signed-off-by: Michael S. Tsirkin m...@redhat.com
---
 hw/msix.c   |3 +++
 qemu-options.hx |2 ++
 vl.c|3 +++
 3 files changed, 8 insertions(+), 0 deletions(-)

diff --git a/hw/msix.c b/hw/msix.c
index 4db2fc1..970a8ca 100644
--- a/hw/msix.c
+++ b/hw/msix.c
@@ -122,6 +122,9 @@ void msix_write_config(PCIDevice *dev, uint32_t addr,
uint32_t val, int len)
 {
 unsigned enable_pos = dev-msix_cap + MSIX_ENABLE_OFFSET;
+if (!(dev-cap_present  QEMU_PCI_CAP_MSIX))
+return;
+
 if (addr + len = enable_pos || addr  enable_pos)
 return;
 
diff --git a/qemu-options.hx b/qemu-options.hx
index 87af798..fd041a4 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -1575,3 +1575,5 @@ DEF(semihosting, 0, QEMU_OPTION_semihosting,
 DEF(old-param, 0, QEMU_OPTION_old_param,
 -old-param  old param mode\n)
 #endif
+DEF(disable-msix, 0, QEMU_OPTION_disable_msix,
+-disable-msix disable msix support for PCI devices (enabled by 
default)\n)
diff --git a/vl.c b/vl.c
index 2c1f0e0..2757d4f 100644
--- a/vl.c
+++ b/vl.c
@@ -134,6 +134,7 @@ int main(int argc, char **argv)
 #include hw/usb.h
 #include hw/pcmcia.h
 #include hw/pc.h
+#include hw/msix.h
 #include hw/audiodev.h
 #include hw/isa.h
 #include hw/baum.h
@@ -5557,6 +5558,8 @@ int main(int argc, char **argv, char **envp)
 xen_mode = XEN_ATTACH;
 break;
 #endif
+case QEMU_OPTION_disable_msix:
+msix_disable = 1;
 }
 }
 }
-- 
1.6.3.1.56.g79e1.dirty

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCHv2 07/13] qemu: minimal MSI/MSI-X implementation for PC

2009-06-02 Thread Michael S. Tsirkin
Implement MSI support in APIC. Note that MSI and MMIO APIC registers
are at the same memory location, but actually not on the global bus: MSI
is on PCI bus, APIC is connected directly to the CPU. We map them on the
global bus at the same address which happens to work because MSI
registers are reserved in APIC MMIO and vice versa.

Signed-off-by: Michael S. Tsirkin m...@redhat.com
---
 hw/apic.c |   50 ++
 1 files changed, 46 insertions(+), 4 deletions(-)

diff --git a/hw/apic.c b/hw/apic.c
index 8c8b2de..d831709 100644
--- a/hw/apic.c
+++ b/hw/apic.c
@@ -19,6 +19,8 @@
  */
 #include hw.h
 #include pc.h
+#include pci.h
+#include msix.h
 #include qemu-timer.h
 #include host-utils.h
 
@@ -63,6 +65,19 @@
 #define MAX_APICS 255
 #define MAX_APIC_WORDS 8
 
+/* Intel APIC constants: from include/asm/msidef.h */
+#define MSI_DATA_VECTOR_SHIFT  0
+#define MSI_DATA_VECTOR_MASK   0x00ff
+#define MSI_DATA_DELIVERY_MODE_SHIFT   8
+#define MSI_DATA_TRIGGER_SHIFT 15
+#define MSI_DATA_LEVEL_SHIFT   14
+#define MSI_ADDR_DEST_MODE_SHIFT   2
+#define MSI_ADDR_DEST_ID_SHIFT 12
+#defineMSI_ADDR_DEST_ID_MASK   0x000
+
+#define MSI_ADDR_BASE   0xfee0
+#define MSI_ADDR_SIZE   0x10
+
 typedef struct APICState {
 CPUState *cpu_env;
 uint32_t apicbase;
@@ -86,6 +101,13 @@ typedef struct APICState {
 QEMUTimer *timer;
 } APICState;
 
+struct msi_state {
+uint64_t addr;
+uint32_t data;
+int mask;
+int pending;
+};
+
 static int apic_io_memory;
 static APICState *local_apics[MAX_APICS + 1];
 static int last_apic_id = 0;
@@ -712,11 +734,31 @@ static uint32_t apic_mem_readl(void *opaque, 
target_phys_addr_t addr)
 return val;
 }
 
+static void apic_send_msi(target_phys_addr_t addr, uint32 data)
+{
+uint8_t dest = (addr  MSI_ADDR_DEST_ID_MASK)  MSI_ADDR_DEST_ID_SHIFT;
+uint8_t vector = (data  MSI_DATA_VECTOR_MASK)  MSI_DATA_VECTOR_SHIFT;
+uint8_t dest_mode = (addr  MSI_ADDR_DEST_MODE_SHIFT)  0x1;
+uint8_t trigger_mode = (data  MSI_DATA_TRIGGER_SHIFT)  0x1;
+uint8_t delivery = (data  MSI_DATA_DELIVERY_MODE_SHIFT)  0x7;
+/* XXX: Ignore redirection hint. */
+apic_deliver_irq(dest, dest_mode, delivery, vector, 0, trigger_mode);
+}
+
 static void apic_mem_writel(void *opaque, target_phys_addr_t addr, uint32_t 
val)
 {
 CPUState *env;
 APICState *s;
-int index;
+int index = (addr  4)  0xff;
+if (addr  0xfff || !index) {
+/* MSI and MMIO APIC are at the same memory location,
+ * but actually not on the global bus: MSI is on PCI bus
+ * APIC is connected directly to the CPU.
+ * Mapping them on the global bus happens to work because
+ * MSI registers are reserved in APIC MMIO and vice versa. */
+apic_send_msi(addr, val);
+return;
+}
 
 env = cpu_single_env;
 if (!env)
@@ -727,7 +769,6 @@ static void apic_mem_writel(void *opaque, 
target_phys_addr_t addr, uint32_t val)
 printf(APIC write: %08x = %08x\n, (uint32_t)addr, val);
 #endif
 
-index = (addr  4)  0xff;
 switch(index) {
 case 0x02:
 s-id = (val  24);
@@ -911,6 +952,7 @@ int apic_init(CPUState *env)
 s-cpu_env = env;
 
 apic_reset(s);
+msix_supported = 1;
 
 /* XXX: mapping more APICs at the same memory location */
 if (apic_io_memory == 0) {
@@ -918,7 +960,8 @@ int apic_init(CPUState *env)
on the global memory bus. */
 apic_io_memory = cpu_register_io_memory(0, apic_mem_read,
 apic_mem_write, NULL);
-cpu_register_physical_memory(s-apicbase  ~0xfff, 0x1000,
+/* XXX: what if the base changes? */
+cpu_register_physical_memory(MSI_ADDR_BASE, MSI_ADDR_SIZE,
  apic_io_memory);
 }
 s-timer = qemu_new_timer(vm_clock, apic_timer, s);
@@ -929,4 +972,3 @@ int apic_init(CPUState *env)
 local_apics[s-id] = s;
 return 0;
 }
-
-- 
1.6.3.1.56.g79e1.dirty

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCHv2 08/13] qemu: add support for resizing regions

2009-06-02 Thread Michael S. Tsirkin
Make it possible to resize PCI regions.  This will be used by virtio
with MSI-X, where the region size depends on whether MSI-X is enabled,
and can change across load/save.

Signed-off-by: Michael S. Tsirkin m...@redhat.com
---
 hw/pci.c |   54 --
 hw/pci.h |3 +++
 2 files changed, 39 insertions(+), 18 deletions(-)

diff --git a/hw/pci.c b/hw/pci.c
index 31ba2ed..a63d988 100644
--- a/hw/pci.c
+++ b/hw/pci.c
@@ -392,6 +392,41 @@ void pci_register_io_region(PCIDevice *pci_dev, int 
region_num,
 *(uint32_t *)(pci_dev-mask + addr) = cpu_to_le32(mask);
 }
 
+static void pci_unmap_region(PCIDevice *d, PCIIORegion *r)
+{
+if (r-addr == -1)
+return;
+if (r-type  PCI_ADDRESS_SPACE_IO) {
+int class;
+/* NOTE: specific hack for IDE in PC case:
+   only one byte must be mapped. */
+class = pci_get_word(d-config + PCI_CLASS_DEVICE);
+if (class == 0x0101  r-size == 4) {
+isa_unassign_ioport(r-addr + 2, 1);
+} else {
+isa_unassign_ioport(r-addr, r-size);
+}
+} else {
+cpu_register_physical_memory(pci_to_cpu_addr(r-addr),
+ r-size,
+ IO_MEM_UNASSIGNED);
+qemu_unregister_coalesced_mmio(r-addr, r-size);
+}
+}
+
+void pci_resize_io_region(PCIDevice *pci_dev, int region_num,
+  uint32_t size)
+{
+
+PCIIORegion *r = pci_dev-io_regions[region_num];
+if (r-size == size)
+return;
+r-size = size;
+pci_unmap_region(pci_dev, r);
+r-addr = -1;
+pci_update_mappings(pci_dev);
+}
+
 static void pci_update_mappings(PCIDevice *d)
 {
 PCIIORegion *r;
@@ -445,24 +480,7 @@ static void pci_update_mappings(PCIDevice *d)
 }
 /* now do the real mapping */
 if (new_addr != r-addr) {
-if (r-addr != -1) {
-if (r-type  PCI_ADDRESS_SPACE_IO) {
-int class;
-/* NOTE: specific hack for IDE in PC case:
-   only one byte must be mapped. */
-class = d-config[0x0a] | (d-config[0x0b]  8);
-if (class == 0x0101  r-size == 4) {
-isa_unassign_ioport(r-addr + 2, 1);
-} else {
-isa_unassign_ioport(r-addr, r-size);
-}
-} else {
-cpu_register_physical_memory(pci_to_cpu_addr(r-addr),
- r-size,
- IO_MEM_UNASSIGNED);
-qemu_unregister_coalesced_mmio(r-addr, r-size);
-}
-}
+pci_unmap_region(d, r);
 r-addr = new_addr;
 if (r-addr != -1) {
 r-map_func(d, i, r-addr, r-size, r-type);
diff --git a/hw/pci.h b/hw/pci.h
index 6da626b..4072f16 100644
--- a/hw/pci.h
+++ b/hw/pci.h
@@ -221,6 +221,9 @@ void pci_register_io_region(PCIDevice *pci_dev, int 
region_num,
 uint32_t size, int type,
 PCIMapIORegionFunc *map_func);
 
+void pci_resize_io_region(PCIDevice *pci_dev, int region_num,
+  uint32_t size);
+
 int pci_add_capability(PCIDevice *pci_dev, uint8_t cap_id, uint8_t cap_size);
 
 void pci_del_capability(PCIDevice *pci_dev, uint8_t cap_id, uint8_t cap_size);
-- 
1.6.3.1.56.g79e1.dirty

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCHv2 09/13] qemu: virtio support for many interrupt vectors

2009-06-02 Thread Michael S. Tsirkin
Extend virtio to support many interrupt vectors, and rearrange code in
preparation for multi-vector support (mostly move reset out to bindings,
because we will have to reset the vectors in transport-specific code).
Actual bindings in pci, and use in net, to follow.
Load and save are not connected to bindings yet, so they are left
stubbed out for now.

Signed-off-by: Michael S. Tsirkin m...@redhat.com
---
 hw/syborg_virtio.c |   13 --
 hw/virtio-pci.c|   24 +++
 hw/virtio.c|   63 ++-
 hw/virtio.h|   10 ++-
 4 files changed, 84 insertions(+), 26 deletions(-)

diff --git a/hw/syborg_virtio.c b/hw/syborg_virtio.c
index 37c219c..d8c978a 100644
--- a/hw/syborg_virtio.c
+++ b/hw/syborg_virtio.c
@@ -134,7 +134,10 @@ static void syborg_virtio_writel(void *opaque, 
target_phys_addr_t offset,
 vdev-features = value;
 break;
 case SYBORG_VIRTIO_QUEUE_BASE:
-virtio_queue_set_addr(vdev, vdev-queue_sel, value);
+if (value == 0)
+virtio_reset(vdev);
+else
+virtio_queue_set_addr(vdev, vdev-queue_sel, value);
 break;
 case SYBORG_VIRTIO_QUEUE_SEL:
 if (value  VIRTIO_PCI_QUEUE_MAX)
@@ -228,7 +231,7 @@ static CPUWriteMemoryFunc *syborg_virtio_writefn[] = {
  syborg_virtio_writel
 };
 
-static void syborg_virtio_update_irq(void *opaque)
+static void syborg_virtio_update_irq(void *opaque, uint16_t vector)
 {
 SyborgVirtIOProxy *proxy = opaque;
 int level;
@@ -239,7 +242,7 @@ static void syborg_virtio_update_irq(void *opaque)
 }
 
 static VirtIOBindings syborg_virtio_bindings = {
-.update_irq = syborg_virtio_update_irq
+.notify = syborg_virtio_update_irq
 };
 
 static void syborg_virtio_init(SyborgVirtIOProxy *proxy, VirtIODevice *vdev)
@@ -248,6 +251,8 @@ static void syborg_virtio_init(SyborgVirtIOProxy *proxy, 
VirtIODevice *vdev)
 
 proxy-vdev = vdev;
 
+/* Don't support multiple vectors */
+proxy-vdev-nvectors = 0;
 sysbus_init_irq(proxy-busdev, proxy-irq);
 iomemtype = cpu_register_io_memory(0, syborg_virtio_readfn,
syborg_virtio_writefn, proxy);
@@ -255,6 +260,8 @@ static void syborg_virtio_init(SyborgVirtIOProxy *proxy, 
VirtIODevice *vdev)
 
 proxy-id = ((uint32_t)0x1af4  16) | vdev-device_id;
 
+qemu_register_reset(virtio_reset, 0, vdev);
+
 virtio_bind_device(vdev, syborg_virtio_bindings, proxy);
 }
 
diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c
index c072423..7dfdd80 100644
--- a/hw/virtio-pci.c
+++ b/hw/virtio-pci.c
@@ -78,13 +78,19 @@ typedef struct {
 
 /* virtio device */
 
-static void virtio_pci_update_irq(void *opaque)
+static void virtio_pci_notify(void *opaque, uint16_t vector)
 {
 VirtIOPCIProxy *proxy = opaque;
 
 qemu_set_irq(proxy-pci_dev.irq[0], proxy-vdev-isr  1);
 }
 
+static void virtio_pci_reset(void *opaque)
+{
+VirtIOPCIProxy *proxy = opaque;
+virtio_reset(proxy-vdev);
+}
+
 static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
 {
 VirtIOPCIProxy *proxy = opaque;
@@ -108,7 +114,10 @@ static void virtio_ioport_write(void *opaque, uint32_t 
addr, uint32_t val)
 break;
 case VIRTIO_PCI_QUEUE_PFN:
 pa = (target_phys_addr_t)val  VIRTIO_PCI_QUEUE_ADDR_SHIFT;
-virtio_queue_set_addr(vdev, vdev-queue_sel, pa);
+if (pa == 0)
+virtio_pci_reset(proxy);
+else
+virtio_queue_set_addr(vdev, vdev-queue_sel, pa);
 break;
 case VIRTIO_PCI_QUEUE_SEL:
 if (val  VIRTIO_PCI_QUEUE_MAX)
@@ -120,7 +129,7 @@ static void virtio_ioport_write(void *opaque, uint32_t 
addr, uint32_t val)
 case VIRTIO_PCI_STATUS:
 vdev-status = val  0xFF;
 if (vdev-status == 0)
-virtio_reset(vdev);
+virtio_pci_reset(proxy);
 break;
 }
 }
@@ -158,7 +167,7 @@ static uint32_t virtio_ioport_read(void *opaque, uint32_t 
addr)
 /* reading from the ISR also clears it. */
 ret = vdev-isr;
 vdev-isr = 0;
-virtio_update_irq(vdev);
+qemu_set_irq(proxy-pci_dev.irq[0], 0);
 break;
 default:
 break;
@@ -243,7 +252,7 @@ static void virtio_map(PCIDevice *pci_dev, int region_num,
 }
 
 static const VirtIOBindings virtio_pci_bindings = {
-.update_irq = virtio_pci_update_irq
+.notify = virtio_pci_notify
 };
 
 static void virtio_init_pci(VirtIOPCIProxy *proxy, VirtIODevice *vdev,
@@ -255,6 +264,9 @@ static void virtio_init_pci(VirtIOPCIProxy *proxy, 
VirtIODevice *vdev,
 
 proxy-vdev = vdev;
 
+/* No support for multiple vectors yet. */
+proxy-vdev-nvectors = 0;
+
 config = proxy-pci_dev.config;
 pci_config_set_vendor_id(config, vendor);
 pci_config_set_device_id(config, device);
@@ -279,6 +291,8 @@ static void virtio_init_pci(VirtIOPCIProxy *proxy, 
VirtIODevice *vdev,
 pci_register_io_region(proxy-pci_dev, 0, size, 

[PATCHv2 10/13] qemu: MSI-X support in virtio PCI

2009-06-02 Thread Michael S. Tsirkin
This enables actual support for MSI-X in virtio PCI.
First user will be virtio-net.

Signed-off-by: Michael S. Tsirkin m...@redhat.com
---
 hw/virtio-pci.c |  152 --
 1 files changed, 112 insertions(+), 40 deletions(-)

diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c
index 7dfdd80..294f4c7 100644
--- a/hw/virtio-pci.c
+++ b/hw/virtio-pci.c
@@ -18,6 +18,7 @@
 #include virtio.h
 #include pci.h
 //#include sysemu.h
+#include msix.h
 
 /* from Linux's linux/virtio_pci.h */
 
@@ -47,7 +48,24 @@
  * a read-and-acknowledge. */
 #define VIRTIO_PCI_ISR  19
 
-#define VIRTIO_PCI_CONFIG   20
+/* MSI-X registers: only enabled if MSI-X is enabled. */
+/* A 16-bit vector for configuration changes. */
+#define VIRTIO_MSI_CONFIG_VECTOR20
+/* A 16-bit vector for selected queue notifications. */
+#define VIRTIO_MSI_QUEUE_VECTOR 22
+
+/* Config space size */
+#define VIRTIO_PCI_CONFIG_NOMSI 20
+#define VIRTIO_PCI_CONFIG_MSI   24
+#define VIRTIO_PCI_REGION_SIZE(dev) (msix_present(dev) ? \
+ VIRTIO_PCI_CONFIG_MSI : \
+ VIRTIO_PCI_CONFIG_NOMSI)
+
+/* The remaining space is defined by each driver as the per-driver
+ * configuration space */
+#define VIRTIO_PCI_CONFIG(dev)  (msix_enabled(dev) ? \
+ VIRTIO_PCI_CONFIG_MSI : \
+ VIRTIO_PCI_CONFIG_NOMSI)
 
 /* Virtio ABI version, if we increment this, we break the guest driver. */
 #define VIRTIO_PCI_ABI_VERSION  0
@@ -81,14 +99,17 @@ typedef struct {
 static void virtio_pci_notify(void *opaque, uint16_t vector)
 {
 VirtIOPCIProxy *proxy = opaque;
-
-qemu_set_irq(proxy-pci_dev.irq[0], proxy-vdev-isr  1);
+if (msix_enabled(proxy-pci_dev))
+msix_notify(proxy-pci_dev, vector);
+else
+qemu_set_irq(proxy-pci_dev.irq[0], proxy-vdev-isr  1);
 }
 
 static void virtio_pci_reset(void *opaque)
 {
 VirtIOPCIProxy *proxy = opaque;
 virtio_reset(proxy-vdev);
+msix_reset(proxy-pci_dev);
 }
 
 static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
@@ -97,8 +118,6 @@ static void virtio_ioport_write(void *opaque, uint32_t addr, 
uint32_t val)
 VirtIODevice *vdev = proxy-vdev;
 target_phys_addr_t pa;
 
-addr -= proxy-addr;
-
 switch (addr) {
 case VIRTIO_PCI_GUEST_FEATURES:
/* Guest does not negotiate properly?  We have to assume nothing. */
@@ -131,17 +150,33 @@ static void virtio_ioport_write(void *opaque, uint32_t 
addr, uint32_t val)
 if (vdev-status == 0)
 virtio_pci_reset(proxy);
 break;
+case VIRTIO_MSI_CONFIG_VECTOR:
+msix_vector_unuse(proxy-pci_dev, vdev-config_vector);
+/* Make it possible for guest to discover an error took place. */
+if (msix_vector_use(proxy-pci_dev, val)  0)
+val = VIRTIO_NO_VECTOR;
+vdev-config_vector = val;
+break;
+case VIRTIO_MSI_QUEUE_VECTOR:
+msix_vector_unuse(proxy-pci_dev,
+  virtio_queue_vector(vdev, vdev-queue_sel));
+/* Make it possible for guest to discover an error took place. */
+if (msix_vector_use(proxy-pci_dev, val)  0)
+val = VIRTIO_NO_VECTOR;
+virtio_queue_set_vector(vdev, vdev-queue_sel, val);
+break;
+default:
+fprintf(stderr, %s: unexpected address 0x%x value 0x%x\n,
+__func__, addr, val);
+break;
 }
 }
 
-static uint32_t virtio_ioport_read(void *opaque, uint32_t addr)
+static uint32_t virtio_ioport_read(VirtIOPCIProxy *proxy, uint32_t addr)
 {
-VirtIOPCIProxy *proxy = opaque;
 VirtIODevice *vdev = proxy-vdev;
 uint32_t ret = 0x;
 
-addr -= proxy-addr;
-
 switch (addr) {
 case VIRTIO_PCI_HOST_FEATURES:
 ret = vdev-get_features(vdev);
@@ -169,6 +204,12 @@ static uint32_t virtio_ioport_read(void *opaque, uint32_t 
addr)
 vdev-isr = 0;
 qemu_set_irq(proxy-pci_dev.irq[0], 0);
 break;
+case VIRTIO_MSI_CONFIG_VECTOR:
+ret = vdev-config_vector;
+break;
+case VIRTIO_MSI_QUEUE_VECTOR:
+ret = virtio_queue_vector(vdev, vdev-queue_sel);
+break;
 default:
 break;
 }
@@ -179,42 +220,72 @@ static uint32_t virtio_ioport_read(void *opaque, uint32_t 
addr)
 static uint32_t virtio_pci_config_readb(void *opaque, uint32_t addr)
 {
 VirtIOPCIProxy *proxy = opaque;
-addr -= proxy-addr + VIRTIO_PCI_CONFIG;
+uint32_t config = VIRTIO_PCI_CONFIG(proxy-pci_dev);
+addr -= proxy-addr;
+if (addr  config)
+return virtio_ioport_read(proxy, addr);
+addr -= config;
 return virtio_config_readb(proxy-vdev, addr);
 }
 
 static uint32_t virtio_pci_config_readw(void *opaque, uint32_t addr)
 {
 VirtIOPCIProxy *proxy = opaque;
-addr -= proxy-addr + 

[PATCHv2 11/13] qemu: request 3 vectors in virtio-net

2009-06-02 Thread Michael S. Tsirkin
Request up to 3 vectors in virtio-net. Actual bindings might supply
less.

Signed-off-by: Michael S. Tsirkin m...@redhat.com
---
 hw/virtio-net.c |1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/hw/virtio-net.c b/hw/virtio-net.c
index 60aa6da..6118fe3 100644
--- a/hw/virtio-net.c
+++ b/hw/virtio-net.c
@@ -621,6 +621,7 @@ VirtIODevice *virtio_net_init(DeviceState *dev)
 n-mac_table.macs = qemu_mallocz(MAC_TABLE_ENTRIES * ETH_ALEN);
 
 n-vlans = qemu_mallocz(MAX_VLAN  3);
+n-vdev.nvectors = 3;
 
 register_savevm(virtio-net, virtio_net_id++, VIRTIO_NET_VM_VERSION,
 virtio_net_save, virtio_net_load, n);
-- 
1.6.3.1.56.g79e1.dirty

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCHv2 12/13] qemu: virtio save/load bindings

2009-06-02 Thread Michael S. Tsirkin
Implement bindings for virtio save/load. Use them in virtio pci.

Signed-off-by: Michael S. Tsirkin m...@redhat.com
---
 hw/virtio-pci.c |   49 -
 hw/virtio.c |   31 ++-
 hw/virtio.h |4 
 3 files changed, 66 insertions(+), 18 deletions(-)

diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c
index 294f4c7..589fbb1 100644
--- a/hw/virtio-pci.c
+++ b/hw/virtio-pci.c
@@ -105,6 +105,48 @@ static void virtio_pci_notify(void *opaque, uint16_t 
vector)
 qemu_set_irq(proxy-pci_dev.irq[0], proxy-vdev-isr  1);
 }
 
+static void virtio_pci_save_config(void * opaque, QEMUFile *f)
+{
+VirtIOPCIProxy *proxy = opaque;
+pci_device_save(proxy-pci_dev, f);
+msix_save(proxy-pci_dev, f);
+if (msix_present(proxy-pci_dev))
+qemu_put_be16(f, proxy-vdev-config_vector);
+}
+
+static void virtio_pci_save_queue(void * opaque, int n, QEMUFile *f)
+{
+VirtIOPCIProxy *proxy = opaque;
+if (msix_present(proxy-pci_dev))
+qemu_put_be16(f, virtio_queue_vector(proxy-vdev, n));
+}
+
+static int virtio_pci_load_config(void * opaque, QEMUFile *f)
+{
+VirtIOPCIProxy *proxy = opaque;
+int ret;
+ret = pci_device_load(proxy-pci_dev, f);
+if (ret)
+return ret;
+ret = msix_load(proxy-pci_dev, f);
+if (ret)
+return ret;
+if (msix_present(proxy-pci_dev))
+qemu_get_be16s(f, proxy-vdev-config_vector);
+return 0;
+}
+
+static int virtio_pci_load_queue(void * opaque, int n, QEMUFile *f)
+{
+VirtIOPCIProxy *proxy = opaque;
+uint16_t vector;
+if (!msix_present(proxy-pci_dev))
+return 0;
+qemu_get_be16s(f, vector);
+virtio_queue_set_vector(proxy-vdev, n, vector);
+return 0;
+}
+
 static void virtio_pci_reset(void *opaque)
 {
 VirtIOPCIProxy *proxy = opaque;
@@ -317,7 +359,12 @@ static void virtio_write_config(PCIDevice *pci_dev, 
uint32_t address,
 }
 
 static const VirtIOBindings virtio_pci_bindings = {
-.notify = virtio_pci_notify
+.notify = virtio_pci_notify,
+.save_config = virtio_pci_save_config,
+.load_config = virtio_pci_load_config,
+.save_config = virtio_pci_save_config,
+.save_queue = virtio_pci_save_queue,
+.load_queue = virtio_pci_load_queue,
 };
 
 static void virtio_init_pci(VirtIOPCIProxy *proxy, VirtIODevice *vdev,
diff --git a/hw/virtio.c b/hw/virtio.c
index 63ffcff..b773dff 100644
--- a/hw/virtio.c
+++ b/hw/virtio.c
@@ -568,9 +568,8 @@ void virtio_save(VirtIODevice *vdev, QEMUFile *f)
 {
 int i;
 
-/* FIXME: load/save binding.  */
-//pci_device_save(vdev-pci_dev, f);
-//msix_save(vdev-pci_dev, f);
+if (vdev-binding-save_config)
+vdev-binding-save_config(vdev-binding_opaque, f);
 
 qemu_put_8s(f, vdev-status);
 qemu_put_8s(f, vdev-isr);
@@ -596,19 +595,20 @@ void virtio_save(VirtIODevice *vdev, QEMUFile *f)
 qemu_put_be32(f, vdev-vq[i].vring.num);
 qemu_put_be64(f, vdev-vq[i].pa);
 qemu_put_be16s(f, vdev-vq[i].last_avail_idx);
-if (vdev-nvectors)
-qemu_put_be16s(f, vdev-vq[i].vector);
+if (vdev-binding-save_queue)
+vdev-binding-save_queue(vdev-binding_opaque, i, f);
 }
 }
 
 int virtio_load(VirtIODevice *vdev, QEMUFile *f)
 {
-int num, i;
+int num, i, ret;
 
-/* FIXME: load/save binding.  */
-//pci_device_load(vdev-pci_dev, f);
-//r = msix_load(vdev-pci_dev, f);
-//pci_resize_io_region(vdev-pci_dev, 1, msix_bar_size(vdev-pci_dev));
+if (vdev-binding-load_config) {
+ret = vdev-binding-load_config(vdev-binding_opaque, f);
+if (ret)
+return ret;
+}
 
 qemu_get_8s(f, vdev-status);
 qemu_get_8s(f, vdev-isr);
@@ -617,10 +617,6 @@ int virtio_load(VirtIODevice *vdev, QEMUFile *f)
 vdev-config_len = qemu_get_be32(f);
 qemu_get_buffer(f, vdev-config, vdev-config_len);
 
-if (vdev-nvectors) {
-qemu_get_be16s(f, vdev-config_vector);
-//msix_vector_use(vdev-pci_dev, vdev-config_vector);
-}
 num = qemu_get_be32(f);
 
 for (i = 0; i  num; i++) {
@@ -631,9 +627,10 @@ int virtio_load(VirtIODevice *vdev, QEMUFile *f)
 if (vdev-vq[i].pa) {
 virtqueue_init(vdev-vq[i]);
 }
-if (vdev-nvectors) {
-qemu_get_be16s(f, vdev-vq[i].vector);
-//msix_vector_use(vdev-pci_dev, vdev-config_vector);
+if (vdev-binding-load_queue) {
+ret = vdev-binding-load_queue(vdev-binding_opaque, i, f);
+if (ret)
+return ret;
 }
 }
 
diff --git a/hw/virtio.h b/hw/virtio.h
index 04a3c3d..ce05517 100644
--- a/hw/virtio.h
+++ b/hw/virtio.h
@@ -72,6 +72,10 @@ typedef struct VirtQueueElement
 
 typedef struct {
 void (*notify)(void * opaque, uint16_t vector);
+void (*save_config)(void * opaque, QEMUFile *f);
+void (*save_queue)(void * opaque, int n, QEMUFile *f);
+int (*load_config)(void * opaque, 

[PATCHv2 13/13] qemu: add pci_get/set_byte

2009-06-02 Thread Michael S. Tsirkin
Add pci_get/set_byte to keep *_word and *_long access functions company.
They are unused for now.

Signed-off-by: Michael S. Tsirkin m...@redhat.com
---
 hw/pci.h |   12 
 1 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/hw/pci.h b/hw/pci.h
index 4072f16..e1e4fb4 100644
--- a/hw/pci.h
+++ b/hw/pci.h
@@ -263,6 +263,18 @@ PCIBus *pci_bridge_init(PCIBus *bus, int devfn, uint16_t 
vid, uint16_t did,
 pci_map_irq_fn map_irq, const char *name);
 
 static inline void
+pci_set_byte(uint8_t *config, uint8_t val)
+{
+*config = val;
+}
+
+static inline uint8_t
+pci_get_byte(uint8_t *config)
+{
+return *config;
+}
+
+static inline void
 pci_set_word(uint8_t *config, uint16_t val)
 {
 cpu_to_le16wu((uint16_t *)config, val);
-- 
1.6.3.1.56.g79e1.dirty
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[KVM-RFC PATCH 0/2] irqfd: use POLLHUP notification for close()

2009-06-02 Thread Gregory Haskins
(Applies to kvm.git/master:25deed73)

Please see the header for 2/2 for a description.  This patch series has
been fully tested and appears to be working correctly.  I have it as an RFC
for now because it needs Davide's official submission/SOB for patch 1/2, and
it should get some eyeballs/acks on my SRCU usage before going in.

I will submit the updated irqfd userspace which eschews the deassign() verb
since we can now just use the close(fd) method alone.  I will also address
the userspace review comments from Avi.

---

Davide Libenzi (1):
  eventfd: send POLLHUP on f_ops-release

Gregory Haskins (1):
  kvm: use POLLHUP to close an irqfd instead of an explicit ioctl


 fs/eventfd.c|   10 +++
 include/linux/kvm.h |2 -
 virt/kvm/eventfd.c  |  177 +++
 virt/kvm/kvm_main.c |3 +
 4 files changed, 90 insertions(+), 102 deletions(-)

-- 
Signature
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[KVM-RFC PATCH 1/2] eventfd: send POLLHUP on f_ops-release

2009-06-02 Thread Gregory Haskins
From: Davide Libenzi davi...@xmailserver.org

Return-path: davi...@xmailserver.org
Received: from novell.com ([:::130.57.1.153])
by soto.provo.novell.com with ESMTP; Wed, 27 May 2009 13:35:04 -0600
Received: from x35.xmailserver.org not authenticated [64.71.152.41]
by novell.com with M+ Extreme Email Engine 2008.3.release
via secured  encrypted transport (TLS);
Wed, 27 May 2009 13:35:04 -0600
X-MailFrom: davi...@xmailserver.org
X-AuthUser: davi...@xmailserver.org
Received: from makko.or.mcafeemobile.com
by x35.xmailserver.org with [XMail 1.26 ESMTP Server]
id S2EBEB0 for ghask...@novell.com from davi...@xmailserver.org;
Wed, 27 May 2009 15:34:01 -0400
Date: Wed, 27 May 2009 12:28:57 -0700 (PDT)
From: Davide Libenzi davi...@xmailserver.org
X-X-Sender: dav...@makko.or.mcafeemobile.com
To: Michael S. Tsirkin m...@redhat.com
cc: Gregory Haskins ghask...@novell.com, kvm@vger.kernel.org,
Linux Kernel Mailing List linux-ker...@vger.kernel.org, a...@redhat.com,
mtosa...@redhat.com
Subject: Re: [KVM PATCH v10] kvm: add support for irqfd
In-Reply-To: 20090527184106.ga18...@redhat.com
Message-ID: alpine.deb.1.10.0905271222590.20...@makko.or.mcafeemobile.com
References: 20090520142234.22285.72274.st...@dev.haskins.net 
20090527130447.ga11...@redhat.com 4a1d48fa.9080...@novell.com 
20090527184106.ga18...@redhat.com
User-Agent: Alpine 1.10 (DEB 962 2008-03-14)
X-GPG-FINGRPRINT: CFAE 5BEE FD36 F65E E640  56FE 0974 BF23 270F 474E
X-GPG-PUBLIC_KEY: http://www.xmailserver.org/davidel.asc
MIME-Version: 1.0
Content-Type: TEXT/PLAIN; charset=US-ASCII

On Wed, 27 May 2009, Michael S. Tsirkin wrote:

 On Wed, May 27, 2009 at 10:06:50AM -0400, Gregory Haskins wrote:
  Michael S. Tsirkin wrote:
   On Wed, May 20, 2009 at 10:30:49AM -0400, Gregory Haskins wrote:
  
   +static int
   +kvm_assign_irqfd(struct kvm *kvm, int fd, int gsi)
   +{
   +struct _irqfd *irqfd;
   +struct file *file = NULL;
   +int ret;
   +
   +irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
   +if (!irqfd)
   +return -ENOMEM;
   +
   +irqfd-kvm = kvm;
   +irqfd-gsi = gsi;
   +INIT_LIST_HEAD(irqfd-list);
   +INIT_WORK(irqfd-work, irqfd_inject);
   +
   +/*
   + * Embed the file* lifetime in the irqfd.
   + */
   +file = fget(fd);
   +if (IS_ERR(file)) {
   +ret = PTR_ERR(file);
   +goto fail;
   +}
  
  
   So we get a reference to a file, and unless the user is nice to us, it
   will only be dropped when kvm char device file is closed?
   I think this will deadlock if the fd in question is the open kvm char 
   device.
  
  
  
  Hmm...I hadn't considered this possibility, though I am not sure if it
  would cause a deadlock in the pattern you suggest.  It seems more like
  it would result in, at worst, an extra reference to itself (and thus a
  leak) rather than a deadlock...
 
  I digress.  In either case, perhaps I should s/fget/eventfd_fget to at
  least limit the type of fd to eventfd.  I was trying to be slick by
  not needing the eventfd_fget() exported, but I am going to need to
  export it later anyway for iosignalfd, so its probably a moot point.
 
  Thanks Michael,
  -Greg
 

 This only works as long as eventfd does not do fget on some fd as well.
 Which it does not do now, and may never do - but we create a fragile
 system this way.

 I think it's really wrong, fundamentally, to keep a reference to a
 file until another file is closed, unless you are code under fs/.
 We will get nasty circular references sooner or later.

 Isn't the real reason we use fd to be able to support the same interface
 on top of both kvm and lguest?
 And if so, wouldn't some kind of bus be a better solution?

Another solution, that I proposed in the past, is having irqfd hold no
references to the eventfd. It's just register (holding an eventfd-get())
for events (in the way that currently does), notice the POLLHUP, unchain
from it, and propagate the eventfd-close event to whatever the irqfd logic
is supposed to.



- Davide

Signed-off-by: Gregory Haskins ghask...@novell.com
---

 fs/eventfd.c |   10 +-
 1 files changed, 9 insertions(+), 1 deletions(-)

diff --git a/fs/eventfd.c b/fs/eventfd.c
index 3f0e197..72f5f8d 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -61,7 +61,15 @@ EXPORT_SYMBOL_GPL(eventfd_signal);
 
 static int eventfd_release(struct inode *inode, struct file *file)
 {
-   kfree(file-private_data);
+   struct eventfd_ctx *ctx = file-private_data;
+
+   /*
+* No need to hold the lock here, since we are on the file cleanup
+* path and the ones still attached to the wait queue will be
+* serialized by wake_up_locked_poll().
+*/
+   wake_up_locked_poll(ctx-wqh, POLLHUP);
+   kfree(ctx);
return 0;
 }
 

--
To unsubscribe from this list: send the line unsubscribe kvm 

[KVM-RFC PATCH 2/2] kvm: use POLLHUP to close an irqfd instead of an explicit ioctl

2009-06-02 Thread Gregory Haskins
Assigning an irqfd object to a kvm object creates a relationship that we
currently manage by having the kvm oject acquire/hold a file* reference to
the underlying eventfd.  The lifetime of these objects is properly maintained
by decoupling the two objects whenever the irqfd is closed or kvm is closed,
whichever comes first.

However, the irqfd close method is less than ideal since it requires two
system calls to complete (one for ioctl(kvmfd, IRQFD_DEASSIGN), the other for
close(eventfd)).  This dual-call approach was utilized because there was no
notification mechanism on the eventfd side at the time irqfd was implemented.

Recently, Davide proposed a patch to send a POLLHUP wakeup whenever an
eventfd is about to close.  So we eliminate the IRQFD_DEASSIGN ioctl (*)
vector in favor of sensing the desassign automatically when the fd is closed.
The resulting code is slightly more complex as a result since we need to
allow either side to sever the relationship independently.  We utilize SRCU
to guarantee stable concurrent access to the KVM pointer without adding
additional atomic operations in the fast path.

At minimum, this design should be acked by both Davide and Paul (cc'd).

(*) The irqfd patch does not exist in any released tree, so the understanding
is that we can alter the irqfd specific ABI without taking the normal
precautions, such as CAP bits.

Signed-off-by: Gregory Haskins ghask...@novell.com
CC: Davide Libenzi davi...@xmailserver.org
CC: Michael S. Tsirkin m...@redhat.com
CC: Paul E. McKenney paul...@linux.vnet.ibm.com
---

 include/linux/kvm.h |2 -
 virt/kvm/eventfd.c  |  177 +++
 virt/kvm/kvm_main.c |3 +
 3 files changed, 81 insertions(+), 101 deletions(-)

diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 632a856..29b62cc 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -482,8 +482,6 @@ struct kvm_x86_mce {
 };
 #endif
 
-#define KVM_IRQFD_FLAG_DEASSIGN (1  0)
-
 struct kvm_irqfd {
__u32 fd;
__u32 gsi;
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index f3f2ea1..6ed62e2 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -37,26 +37,63 @@
  * 
  */
 struct _irqfd {
+   struct mutex  lock;
+   struct srcu_structsrcu;
struct kvm   *kvm;
int   gsi;
-   struct file  *file;
struct list_head  list;
poll_tablept;
wait_queue_head_t*wqh;
wait_queue_t  wait;
-   struct work_structwork;
+   struct work_structinject;
 };
 
 static void
 irqfd_inject(struct work_struct *work)
 {
-   struct _irqfd *irqfd = container_of(work, struct _irqfd, work);
-   struct kvm *kvm = irqfd-kvm;
+   struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
+   struct kvm *kvm;
+   int idx;
+
+   idx = srcu_read_lock(irqfd-srcu);
+
+   kvm = rcu_dereference(irqfd-kvm);
+   if (kvm) {
+   mutex_lock(kvm-lock);
+   kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd-gsi, 1);
+   kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd-gsi, 0);
+   mutex_unlock(kvm-lock);
+   }
+
+   srcu_read_unlock(irqfd-srcu, idx);
+}
+
+static void
+irqfd_disconnect(struct _irqfd *irqfd)
+{
+   struct kvm *kvm;
+
+   mutex_lock(irqfd-lock);
+
+   kvm = rcu_dereference(irqfd-kvm);
+   rcu_assign_pointer(irqfd-kvm, NULL);
+
+   mutex_unlock(irqfd-lock);
+
+   if (!kvm)
+   return;
 
mutex_lock(kvm-lock);
-   kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd-gsi, 1);
-   kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd-gsi, 0);
+   list_del(irqfd-list);
mutex_unlock(kvm-lock);
+
+   /*
+* It is important to not drop the kvm reference until the next grace
+* period because there might be lockless references in flight up
+* until then
+*/
+   synchronize_srcu(irqfd-srcu);
+   kvm_put_kvm(kvm);
 }
 
 static int
@@ -64,12 +101,28 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, 
void *key)
 {
struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait);
 
-   /*
-* The wake_up is called with interrupts disabled.  Therefore we need
-* to defer the IRQ injection until later since we need to acquire the
-* kvm-lock to do so.
-*/
-   schedule_work(irqfd-work);
+   switch ((unsigned long)key) {
+   case POLLIN:
+   /*
+* The POLLIN wake_up is called with interrupts disabled.
+* Therefore we need to defer the IRQ injection until later
+* since we need to acquire the kvm-lock to do so.
+*/
+   schedule_work(irqfd-inject);
+  

RE: [PATCH] qemu-kvm: Flush icache after dma operations for ia64

2009-06-02 Thread Zhang, Xiantao
Hi, Jes
Have you verified whether it works for you ?  You may run kernel build in 
the guest with 4 vcpus,  if it can be done successfully without any error, it 
should be Okay I think, otherwise, we may need to investigate it further. :)
Xiantao 
 


 

-Original Message-
From: Jes Sorensen [mailto:j...@sgi.com] 
Sent: Tuesday, June 02, 2009 6:57 PM
To: Avi Kivity
Cc: Zhang, Xiantao; kvm@vger.kernel.org; kvm-i...@vger.kernel.org; Hollis 
Blanchard
Subject: Re: [PATCH] qemu-kvm: Flush icache after dma operations for ia64

Avi Kivity wrote:
 Zhang, Xiantao wrote:
 Fine to me.  But seems the change in qemu_iovec_from_buffer is lost in 
 this patch or that change is also not unnecessary ?
 
 I think the fixed unmap handles that case.  Can you test to make sure?
 

Avi and I went through the code and verified that it was all covered by
the unmap case. It was pretty messy to come to the conclusion, which is
why I tried to document it in the code and the patch description.

Cheers,
Jes
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v8] qemu-kvm: add irqfd support

2009-06-02 Thread Gregory Haskins
irqfd lets you create an eventfd based file-desriptor to inject interrupts
to a kvm guest.  We associate one gsi per fd for fine-grained routing.

[note: this is meant to work in conjunction with the POLLHUP version of
 irqfd, which has not yet been accepted into kvm.git]

[ Changelog:

v8:
*) removed deassign() verb in favor of POLLHUP support on close()
*) only include sys/eventfd.h if CAP_IRQFD is defined to properly
   support older kernels
*) s/ENOENT/ENOSYS
] 

Signed-off-by: Gregory Haskins ghask...@novell.com
---

 kvm/libkvm/libkvm.c |   49 +
 kvm/libkvm/libkvm.h |   14 ++
 2 files changed, 63 insertions(+), 0 deletions(-)

diff --git a/kvm/libkvm/libkvm.c b/kvm/libkvm/libkvm.c
index ba0a5d1..10cb2ac 100644
--- a/kvm/libkvm/libkvm.c
+++ b/kvm/libkvm/libkvm.c
@@ -1444,3 +1444,52 @@ int kvm_assign_set_msix_entry(kvm_context_t kvm,
 return ret;
 }
 #endif
+
+#ifdef KVM_CAP_IRQFD
+
+#include sys/eventfd.h
+
+static int _kvm_irqfd(kvm_context_t kvm, int fd, int gsi, int flags)
+{
+   int r;
+   struct kvm_irqfd data = {
+   .fd= fd,
+   .gsi   = gsi,
+   .flags = flags,
+   };
+
+   r = ioctl(kvm-vm_fd, KVM_IRQFD, data);
+   if (r == -1)
+   r = -errno;
+   return r;
+}
+
+int kvm_irqfd(kvm_context_t kvm, int gsi, int flags)
+{
+   int r;
+   int fd;
+
+   if (!kvm_check_extension(kvm, KVM_CAP_IRQFD))
+   return -ENOENT;
+
+   fd = eventfd(0, 0);
+   if (fd  0)
+   return -errno;
+
+   r = _kvm_irqfd(kvm, fd, gsi, 0);
+   if (r  0) {
+   close(fd);
+   return -errno;
+   }
+
+   return fd;
+}
+
+#else /* KVM_CAP_IRQFD */
+
+int kvm_irqfd(kvm_context_t kvm, int gsi, int flags)
+{
+   return -ENOSYS;
+}
+
+#endif /* KVM_CAP_IRQFD */
diff --git a/kvm/libkvm/libkvm.h b/kvm/libkvm/libkvm.h
index 4821a1e..aca8ed6 100644
--- a/kvm/libkvm/libkvm.h
+++ b/kvm/libkvm/libkvm.h
@@ -856,6 +856,20 @@ int kvm_commit_irq_routes(kvm_context_t kvm);
  */
 int kvm_get_irq_route_gsi(kvm_context_t kvm);
 
+/*!
+ * \brief Create a file descriptor for injecting interrupts
+ *
+ * Creates an eventfd based file-descriptor that maps to a specific GSI
+ * in the guest.  eventfd compliant signaling (write() from userspace, or
+ * eventfd_signal() from kernelspace) will cause the GSI to inject
+ * itself into the guest at the next available window.
+ *
+ * \param kvm Pointer to the current kvm_context
+ * \param gsi GSI to assign to this fd
+ * \param flags reserved, must be zero
+ */
+int kvm_irqfd(kvm_context_t kvm, int gsi, int flags);
+
 #ifdef KVM_CAP_DEVICE_MSIX
 int kvm_assign_set_msix_nr(kvm_context_t kvm,
   struct kvm_assigned_msix_nr *msix_nr);

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v8] qemu-kvm: add irqfd support

2009-06-02 Thread Gregory Haskins
Gregory Haskins wrote:
 irqfd lets you create an eventfd based file-desriptor to inject interrupts
 to a kvm guest.  We associate one gsi per fd for fine-grained routing.

 [note: this is meant to work in conjunction with the POLLHUP version of
  irqfd, which has not yet been accepted into kvm.git]

 [ Changelog:

 v8:
 *) removed deassign() verb in favor of POLLHUP support on close()
 *) only include sys/eventfd.h if CAP_IRQFD is defined to properly
support older kernels
 *) s/ENOENT/ENOSYS
 ] 
   

Hmm..it would appear I already released a 'v8' in the past. :(

In any case, this is the latest version to consider

-Greg

 Signed-off-by: Gregory Haskins ghask...@novell.com
 ---

  kvm/libkvm/libkvm.c |   49 +
  kvm/libkvm/libkvm.h |   14 ++
  2 files changed, 63 insertions(+), 0 deletions(-)

 diff --git a/kvm/libkvm/libkvm.c b/kvm/libkvm/libkvm.c
 index ba0a5d1..10cb2ac 100644
 --- a/kvm/libkvm/libkvm.c
 +++ b/kvm/libkvm/libkvm.c
 @@ -1444,3 +1444,52 @@ int kvm_assign_set_msix_entry(kvm_context_t kvm,
  return ret;
  }
  #endif
 +
 +#ifdef KVM_CAP_IRQFD
 +
 +#include sys/eventfd.h
 +
 +static int _kvm_irqfd(kvm_context_t kvm, int fd, int gsi, int flags)
 +{
 + int r;
 + struct kvm_irqfd data = {
 + .fd= fd,
 + .gsi   = gsi,
 + .flags = flags,
 + };
 +
 + r = ioctl(kvm-vm_fd, KVM_IRQFD, data);
 + if (r == -1)
 + r = -errno;
 + return r;
 +}
 +
 +int kvm_irqfd(kvm_context_t kvm, int gsi, int flags)
 +{
 + int r;
 + int fd;
 +
 + if (!kvm_check_extension(kvm, KVM_CAP_IRQFD))
 + return -ENOENT;
 +
 + fd = eventfd(0, 0);
 + if (fd  0)
 + return -errno;
 +
 + r = _kvm_irqfd(kvm, fd, gsi, 0);
 + if (r  0) {
 + close(fd);
 + return -errno;
 + }
 +
 + return fd;
 +}
 +
 +#else /* KVM_CAP_IRQFD */
 +
 +int kvm_irqfd(kvm_context_t kvm, int gsi, int flags)
 +{
 + return -ENOSYS;
 +}
 +
 +#endif /* KVM_CAP_IRQFD */
 diff --git a/kvm/libkvm/libkvm.h b/kvm/libkvm/libkvm.h
 index 4821a1e..aca8ed6 100644
 --- a/kvm/libkvm/libkvm.h
 +++ b/kvm/libkvm/libkvm.h
 @@ -856,6 +856,20 @@ int kvm_commit_irq_routes(kvm_context_t kvm);
   */
  int kvm_get_irq_route_gsi(kvm_context_t kvm);
  
 +/*!
 + * \brief Create a file descriptor for injecting interrupts
 + *
 + * Creates an eventfd based file-descriptor that maps to a specific GSI
 + * in the guest.  eventfd compliant signaling (write() from userspace, or
 + * eventfd_signal() from kernelspace) will cause the GSI to inject
 + * itself into the guest at the next available window.
 + *
 + * \param kvm Pointer to the current kvm_context
 + * \param gsi GSI to assign to this fd
 + * \param flags reserved, must be zero
 + */
 +int kvm_irqfd(kvm_context_t kvm, int gsi, int flags);
 +
  #ifdef KVM_CAP_DEVICE_MSIX
  int kvm_assign_set_msix_nr(kvm_context_t kvm,
  struct kvm_assigned_msix_nr *msix_nr);

 --
 To unsubscribe from this list: send the line unsubscribe kvm in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
   




signature.asc
Description: OpenPGP digital signature


Re: [KVM-RFC PATCH 0/2] irqfd: use POLLHUP notification for close()

2009-06-02 Thread Michael S. Tsirkin
On Tue, Jun 02, 2009 at 11:15:28AM -0400, Gregory Haskins wrote:
 (Applies to kvm.git/master:25deed73)
 
 Please see the header for 2/2 for a description.  This patch series has
 been fully tested and appears to be working correctly.  I have it as an RFC
 for now because it needs Davide's official submission/SOB for patch 1/2, and
 it should get some eyeballs/acks on my SRCU usage before going in.
 
 I will submit the updated irqfd userspace which eschews the deassign() verb
 since we can now just use the close(fd) method alone.  I will also address
 the userspace review comments from Avi.


We are not killing the deassign though, do we?
It's good to have that option e.g. for when we pass
the fd to another process.

-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [KVM-RFC PATCH 0/2] irqfd: use POLLHUP notification for close()

2009-06-02 Thread Gregory Haskins
Michael S. Tsirkin wrote:
 On Tue, Jun 02, 2009 at 11:15:28AM -0400, Gregory Haskins wrote:
   
 (Applies to kvm.git/master:25deed73)

 Please see the header for 2/2 for a description.  This patch series has
 been fully tested and appears to be working correctly.  I have it as an RFC
 for now because it needs Davide's official submission/SOB for patch 1/2, and
 it should get some eyeballs/acks on my SRCU usage before going in.

 I will submit the updated irqfd userspace which eschews the deassign() verb
 since we can now just use the close(fd) method alone.  I will also address
 the userspace review comments from Avi.
 


 We are not killing the deassign though, do we?
   

Yes, it is not needed any more now that we have proper
release-notification from eventfd.

 It's good to have that option e.g. for when we pass
 the fd to another process.
   

Passing the fd to another app should up the underlying file reference
count.  If the producer app wants to deassign it simply calls
close(fd) (as opposed to today where it calls DEASSIGN+close), but the
reference count will allow the consuming app to leave the eventfd's file
open.  Or am I misunderstanding you?

-Greg




signature.asc
Description: OpenPGP digital signature


out of memory error leads to unbootable VM

2009-06-02 Thread Ross Boylan
I had a VM running Linux.  While running that SAS installer the kernel
ran out of memory and killed the installer.  After that, any attempt to
run a binary inside the VM produced cannot execute binary file.  The
log (inside the VM) also showed messages like
Buffer I/O error on device hda1, logical block 12243549368.  
hda1 is 86\% full.

I killed the VM and attempted to restart.  However, the virtual BIOS
says the disk is unrecognizeable.

lvscan shows, among other things,
  inactive Original '/dev/turtle/Lenny00' [5.00 GB] inherit
  ACTIVE'/dev/turtle/SAS92' [33.00 GB] inherit
  inactive Snapshot '/dev/turtle/Lenny01SAS' [3.00 GB] inherit
The active volume has the installation disk; Lenny01SAS was the disk
the VM was using; it is a snapshot of Lenny00.  I have not deliberately
deactivated either.

lvdisplay does not give any indication of COW overflow, though that may
be because the volumes are inactive.

Only about 11G of SAS92 are in use.  The SAS installation docs seem to
indicate 1 to 1.5G for a full installation.  Although it looks as if I
had enough room, I suppose a snapshot overflow could account for the
inactive snapshot (and inactive original?).

It probably did run out of virtual RAM, since kvm started
(inadvertently) with the default RAM.

Any ideas what's going on?

Thanks.
Ross Boylan


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [KVM-RFC PATCH 0/2] irqfd: use POLLHUP notification for close()

2009-06-02 Thread Michael S. Tsirkin
On Tue, Jun 02, 2009 at 12:14:15PM -0400, Gregory Haskins wrote:
 Michael S. Tsirkin wrote:
  On Tue, Jun 02, 2009 at 11:15:28AM -0400, Gregory Haskins wrote:

  (Applies to kvm.git/master:25deed73)
 
  Please see the header for 2/2 for a description.  This patch series has
  been fully tested and appears to be working correctly.  I have it as an RFC
  for now because it needs Davide's official submission/SOB for patch 1/2, 
  and
  it should get some eyeballs/acks on my SRCU usage before going in.
 
  I will submit the updated irqfd userspace which eschews the deassign() verb
  since we can now just use the close(fd) method alone.  I will also address
  the userspace review comments from Avi.
  
 
 
  We are not killing the deassign though, do we?

 
 Yes, it is not needed any more now that we have proper
 release-notification from eventfd.
 
  It's good to have that option e.g. for when we pass
  the fd to another process.

 
 Passing the fd to another app should up the underlying file reference
 count.  If the producer app wants to deassign it simply calls
 close(fd) (as opposed to today where it calls DEASSIGN+close), but the
 reference count will allow the consuming app to leave the eventfd's file
 open.  Or am I misunderstanding you?
 
 -Greg
 
 

I think we want to keep supporting the deassign ioctl. This, even though
close overlaps with it functionally somewhat.

This allows qemu to pass eventfd to another process/device, and then
block/unblock interrupts as seen by that process by
assigning/deassigning irq to it. This is much easier and lightweight
than asking another process to close the fd and passing another fd
later.

-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: out of memory error leads to unbootable VM

2009-06-02 Thread Avi Kivity

Ross Boylan wrote:

I had a VM running Linux.  While running that SAS installer the kernel
ran out of memory and killed the installer.  After that, any attempt to
run a binary inside the VM produced cannot execute binary file.  The
log (inside the VM) also showed messages like
Buffer I/O error on device hda1, logical block 12243549368.  
hda1 is 86\% full.


I killed the VM and attempted to restart.  However, the virtual BIOS
says the disk is unrecognizeable.

lvscan shows, among other things,
  inactive Original '/dev/turtle/Lenny00' [5.00 GB] inherit
  ACTIVE'/dev/turtle/SAS92' [33.00 GB] inherit
  inactive Snapshot '/dev/turtle/Lenny01SAS' [3.00 GB] inherit
The active volume has the installation disk; Lenny01SAS was the disk
the VM was using; it is a snapshot of Lenny00.  I have not deliberately
deactivated either.

lvdisplay does not give any indication of COW overflow, though that may
be because the volumes are inactive.

Only about 11G of SAS92 are in use.  The SAS installation docs seem to
indicate 1 to 1.5G for a full installation.  Although it looks as if I
had enough room, I suppose a snapshot overflow could account for the
inactive snapshot (and inactive original?).

It probably did run out of virtual RAM, since kvm started
(inadvertently) with the default RAM.

Any ideas what's going on?

  


Sounds like a guest bug that caused guest disk corruption.  Try with 
more memory (on new volumes to be sure).


--
error compiling committee.c: too many arguments to function

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [KVM-RFC PATCH 0/2] irqfd: use POLLHUP notification for close()

2009-06-02 Thread Gregory Haskins
Michael S. Tsirkin wrote:
 On Tue, Jun 02, 2009 at 12:14:15PM -0400, Gregory Haskins wrote:
   
 Michael S. Tsirkin wrote:
 
 On Tue, Jun 02, 2009 at 11:15:28AM -0400, Gregory Haskins wrote:
   
   
 (Applies to kvm.git/master:25deed73)

 Please see the header for 2/2 for a description.  This patch series has
 been fully tested and appears to be working correctly.  I have it as an RFC
 for now because it needs Davide's official submission/SOB for patch 1/2, 
 and
 it should get some eyeballs/acks on my SRCU usage before going in.

 I will submit the updated irqfd userspace which eschews the deassign() verb
 since we can now just use the close(fd) method alone.  I will also address
 the userspace review comments from Avi.
 
 
 We are not killing the deassign though, do we?
   
   
 Yes, it is not needed any more now that we have proper
 release-notification from eventfd.

 
 It's good to have that option e.g. for when we pass
 the fd to another process.
   
   
 Passing the fd to another app should up the underlying file reference
 count.  If the producer app wants to deassign it simply calls
 close(fd) (as opposed to today where it calls DEASSIGN+close), but the
 reference count will allow the consuming app to leave the eventfd's file
 open.  Or am I misunderstanding you?

 -Greg


 

 I think we want to keep supporting the deassign ioctl. This, even though
 close overlaps with it functionally somewhat.

 This allows qemu to pass eventfd to another process/device, and then
 block/unblock interrupts as seen by that process by
 assigning/deassigning irq to it. This is much easier and lightweight
 than asking another process to close the fd and passing another fd
 later.

   
Perhaps, but if that is the case we should just ignore this series and
continue with the DEASSIGN+close methodology since it already provides
that separation.  Trying to do a hybrid is just messy.

But in any case, I think that approach is flawed.  DEASSIGN shouldn't be
used as a mask in my opinion, and we shouldn't be reassigning a
channel's meaning under the covers like that.  If this is in fact a
valid use case, we should have a separate GSI_MASK type operation that
is independent of irqfd.  Likewise, we really should pass a new fd if
the gsi-routing is changing.  Today there is a tight coupling of
fd-to-gsi, and I think that makes sense to continue this association.

-Greg



signature.asc
Description: OpenPGP digital signature


Re: [KVM-RFC PATCH 0/2] irqfd: use POLLHUP notification for close()

2009-06-02 Thread Michael S. Tsirkin
On Tue, Jun 02, 2009 at 12:34:12PM -0400, Gregory Haskins wrote:
 Michael S. Tsirkin wrote:
  On Tue, Jun 02, 2009 at 12:14:15PM -0400, Gregory Haskins wrote:

  Michael S. Tsirkin wrote:
  
  On Tue, Jun 02, 2009 at 11:15:28AM -0400, Gregory Haskins wrote:


  (Applies to kvm.git/master:25deed73)
 
  Please see the header for 2/2 for a description.  This patch series has
  been fully tested and appears to be working correctly.  I have it as an 
  RFC
  for now because it needs Davide's official submission/SOB for patch 1/2, 
  and
  it should get some eyeballs/acks on my SRCU usage before going in.
 
  I will submit the updated irqfd userspace which eschews the deassign() 
  verb
  since we can now just use the close(fd) method alone.  I will also 
  address
  the userspace review comments from Avi.
  
  
  We are not killing the deassign though, do we?


  Yes, it is not needed any more now that we have proper
  release-notification from eventfd.
 
  
  It's good to have that option e.g. for when we pass
  the fd to another process.


  Passing the fd to another app should up the underlying file reference
  count.  If the producer app wants to deassign it simply calls
  close(fd) (as opposed to today where it calls DEASSIGN+close), but the
  reference count will allow the consuming app to leave the eventfd's file
  open.  Or am I misunderstanding you?
 
  -Greg
 
 
  
 
  I think we want to keep supporting the deassign ioctl. This, even though
  close overlaps with it functionally somewhat.
 
  This allows qemu to pass eventfd to another process/device, and then
  block/unblock interrupts as seen by that process by
  assigning/deassigning irq to it. This is much easier and lightweight
  than asking another process to close the fd and passing another fd
  later.
 

 Perhaps, but if that is the case we should just ignore this series and
 continue with the DEASSIGN+close methodology since it already provides
 that separation. Trying to do a hybrid is just messy.

As I see it, it's the least evil.

One-way ioctl operations on file descriptors are messier still. What's
another example of an ioctl that can't be undone without closing the fd?
And having close not clean up the state unless you do an ioctl first is
very messy IMO - I don't think you'll find any such examples in kernel.

 But in any case, I think that approach is flawed.  DEASSIGN shouldn't be
 used as a mask in my opinion, and we shouldn't be reassigning a
 channel's meaning under the covers like that.  If this is in fact a
 valid use case, we should have a separate GSI_MASK type operation that
 is independent of irqfd.
  Likewise, we really should pass a new fd if
 the gsi-routing is changing.  Today there is a tight coupling of
 fd-to-gsi, and I think that makes sense to continue this association.
 
 -Greg
 

I'm not arguing that this use-case is not theoretical. Just that if you
don't create the fd to connect to GSI, you shouln't ask the user to
destroy it to disconnect. Who knows what else this eventfd descriptor
can be used for?

-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [KVM-RFC PATCH 0/2] irqfd: use POLLHUP notification for close()

2009-06-02 Thread Michael S. Tsirkin
On Tue, Jun 02, 2009 at 07:59:49PM +0300, Michael S. Tsirkin wrote:
 On Tue, Jun 02, 2009 at 12:34:12PM -0400, Gregory Haskins wrote:
  Michael S. Tsirkin wrote:
   On Tue, Jun 02, 2009 at 12:14:15PM -0400, Gregory Haskins wrote:
 
   Michael S. Tsirkin wrote:
   
   On Tue, Jun 02, 2009 at 11:15:28AM -0400, Gregory Haskins wrote:
 
 
   (Applies to kvm.git/master:25deed73)
  
   Please see the header for 2/2 for a description.  This patch series has
   been fully tested and appears to be working correctly.  I have it as 
   an RFC
   for now because it needs Davide's official submission/SOB for patch 
   1/2, and
   it should get some eyeballs/acks on my SRCU usage before going in.
  
   I will submit the updated irqfd userspace which eschews the deassign() 
   verb
   since we can now just use the close(fd) method alone.  I will also 
   address
   the userspace review comments from Avi.
   
   
   We are not killing the deassign though, do we?
 
 
   Yes, it is not needed any more now that we have proper
   release-notification from eventfd.
  
   
   It's good to have that option e.g. for when we pass
   the fd to another process.
 
 
   Passing the fd to another app should up the underlying file reference
   count.  If the producer app wants to deassign it simply calls
   close(fd) (as opposed to today where it calls DEASSIGN+close), but the
   reference count will allow the consuming app to leave the eventfd's file
   open.  Or am I misunderstanding you?
  
   -Greg
  
  
   
  
   I think we want to keep supporting the deassign ioctl. This, even though
   close overlaps with it functionally somewhat.
  
   This allows qemu to pass eventfd to another process/device, and then
   block/unblock interrupts as seen by that process by
   assigning/deassigning irq to it. This is much easier and lightweight
   than asking another process to close the fd and passing another fd
   later.
  
 
  Perhaps, but if that is the case we should just ignore this series and
  continue with the DEASSIGN+close methodology since it already provides
  that separation. Trying to do a hybrid is just messy.
 
 As I see it, it's the least evil.
 
 One-way ioctl operations on file descriptors are messier still. What's
 another example of an ioctl that can't be undone without closing the fd?
 And having close not clean up the state unless you do an ioctl first is
 very messy IMO - I don't think you'll find any such examples in kernel.
 
  But in any case, I think that approach is flawed.  DEASSIGN shouldn't be
  used as a mask in my opinion, and we shouldn't be reassigning a
  channel's meaning under the covers like that.  If this is in fact a
  valid use case, we should have a separate GSI_MASK type operation that
  is independent of irqfd.
   Likewise, we really should pass a new fd if
  the gsi-routing is changing.  Today there is a tight coupling of
  fd-to-gsi, and I think that makes sense to continue this association.
  
  -Greg
  
 
 I'm not arguing that this use-case is not theoretical. Just that if you
 don't create the fd to connect to GSI, you shouln't ask the user to
 destroy it to disconnect. Who knows what else this eventfd descriptor
 can be used for?

As a follow-up, here's another example: imagine an application that
handles interrupt events from a thread by blocking on eventfd. To wake
up this thread, we could reuse the same eventfd just by writing there. I
might want to do this even after I don't get any interrupts anymore.

-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: out of memory error leads to unbootable VM

2009-06-02 Thread Ross Boylan
On Tue, 2009-06-02 at 19:27 +0300, Avi Kivity wrote:
 Ross Boylan wrote:
  I had a VM running Linux.  While running that SAS installer the kernel
  ran out of memory and killed the installer.  After that, any attempt to
  run a binary inside the VM produced cannot execute binary file.  The
  log (inside the VM) also showed messages like
  Buffer I/O error on device hda1, logical block 12243549368.  
  hda1 is 86\% full.
 
  I killed the VM and attempted to restart.  However, the virtual BIOS
  says the disk is unrecognizeable.
 
  lvscan shows, among other things,
inactive Original '/dev/turtle/Lenny00' [5.00 GB] inherit
ACTIVE'/dev/turtle/SAS92' [33.00 GB] inherit
inactive Snapshot '/dev/turtle/Lenny01SAS' [3.00 GB] inherit
  The active volume has the installation disk; Lenny01SAS was the disk
  the VM was using; it is a snapshot of Lenny00.  I have not deliberately
  deactivated either.
 
  lvdisplay does not give any indication of COW overflow, though that may
  be because the volumes are inactive.
 
  Only about 11G of SAS92 are in use.  The SAS installation docs seem to
  indicate 1 to 1.5G for a full installation.  Although it looks as if I
  had enough room, I suppose a snapshot overflow could account for the
  inactive snapshot (and inactive original?).
 
  It probably did run out of virtual RAM, since kvm started
  (inadvertently) with the default RAM.
 
  Any ideas what's going on?
 

 
 Sounds like a guest bug that caused guest disk corruption.  Try with 
 more memory (on new volumes to be sure).

Isn't Linux (the guest, running Debian Lenny amd64 dual processor) more
robust than that?  

At any rate, I'll try again.  I'm hoping the original (Lenny00) is OK,
so there's less I need to rebuild.

Ross

P.S. Running kvm 72+dfsg-5~lenny1 with a host 2.6.29 kernel.  They guest
was probably 2.6.26, though it might have been 2.6.29.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [KVM-RFC PATCH 2/2] kvm: use POLLHUP to close an irqfd instead of an explicit ioctl

2009-06-02 Thread Davide Libenzi
On Tue, 2 Jun 2009, Gregory Haskins wrote:

 @@ -64,12 +101,28 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int 
 sync, void *key)
  {
   struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait);
  
 - /*
 -  * The wake_up is called with interrupts disabled.  Therefore we need
 -  * to defer the IRQ injection until later since we need to acquire the
 -  * kvm-lock to do so.
 -  */
 - schedule_work(irqfd-work);
 + switch ((unsigned long)key) {
 + case POLLIN:
 + /*
 +  * The POLLIN wake_up is called with interrupts disabled.
 +  * Therefore we need to defer the IRQ injection until later
 +  * since we need to acquire the kvm-lock to do so.
 +  */
 + schedule_work(irqfd-inject);
 + break;
 + case POLLHUP:
 + /*
 +  * The POLLHUP is called unlocked, so it theoretically should
 +  * be safe to remove ourselves from the wqh
 +  */
 + remove_wait_queue(irqfd-wqh, irqfd-wait);
 + flush_work(irqfd-inject);
 + irqfd_disconnect(irqfd);
 +
 + cleanup_srcu_struct(irqfd-srcu);
 + kfree(irqfd);
 + break;
 + }

Since key is an event *bitmap*, you better be treating it as such. Do 
not assume what eventfd delivers today (event bitmaps with only one bit 
set). So this better be like:

if (key  POLLIN) {
...
}
if (key  POLLHUP) {
...
}


- Davide


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [KVM-RFC PATCH 0/2] irqfd: use POLLHUP notification for close()

2009-06-02 Thread Gregory Haskins
Michael S. Tsirkin wrote:
 On Tue, Jun 02, 2009 at 12:34:12PM -0400, Gregory Haskins wrote:
   
 Michael S. Tsirkin wrote:
 
 On Tue, Jun 02, 2009 at 12:14:15PM -0400, Gregory Haskins wrote:
   
   
 Michael S. Tsirkin wrote:
 
 
 On Tue, Jun 02, 2009 at 11:15:28AM -0400, Gregory Haskins wrote:
   
   
   
 (Applies to kvm.git/master:25deed73)

 Please see the header for 2/2 for a description.  This patch series has
 been fully tested and appears to be working correctly.  I have it as an 
 RFC
 for now because it needs Davide's official submission/SOB for patch 1/2, 
 and
 it should get some eyeballs/acks on my SRCU usage before going in.

 I will submit the updated irqfd userspace which eschews the deassign() 
 verb
 since we can now just use the close(fd) method alone.  I will also 
 address
 the userspace review comments from Avi.
 
 
 
 We are not killing the deassign though, do we?
   
   
   
 Yes, it is not needed any more now that we have proper
 release-notification from eventfd.

 
 
 It's good to have that option e.g. for when we pass
 the fd to another process.
   
   
   
 Passing the fd to another app should up the underlying file reference
 count.  If the producer app wants to deassign it simply calls
 close(fd) (as opposed to today where it calls DEASSIGN+close), but the
 reference count will allow the consuming app to leave the eventfd's file
 open.  Or am I misunderstanding you?

 -Greg


 
 
 I think we want to keep supporting the deassign ioctl. This, even though
 close overlaps with it functionally somewhat.

 This allows qemu to pass eventfd to another process/device, and then
 block/unblock interrupts as seen by that process by
 assigning/deassigning irq to it. This is much easier and lightweight
 than asking another process to close the fd and passing another fd
 later.

   
   
 Perhaps, but if that is the case we should just ignore this series and
 continue with the DEASSIGN+close methodology since it already provides
 that separation. Trying to do a hybrid is just messy.
 

 As I see it, it's the least evil.
   

Which?  Leaving the code as is, or a hybrid?
 One-way ioctl operations on file descriptors are messier still. What's
 another example of an ioctl that can't be undone without closing the fd?
   
-ENOPARSE

 And having close not clean up the state unless you do an ioctl first is
 very messy IMO - I don't think you'll find any such examples in kernel.

   

I agree, and that is why I am advocating this POLLHUP solution.  It was
only this other way to begin with because the technology didn't exist
until Davide showed me the light.

Problem with your request is that I already looked into what is
essentially a bi-directional reference problem (for a different reason)
when I started the POLLHUP series.  Its messy to do this in a way that
doesn't negatively impact the fast path (introducing locking, etc) or
make my head explode making sure it doesn't race.  Afaict, we would need
to solve this problem to do what you are proposing (patches welcome).

If this hybrid decoupled-deassign + unified-close is indeed an important
feature set, I suggest that we still consider this POLLHUP series for
inclusion, and then someone can re-introduce DEASSIGN support in the
future as a CAP bit extension.  That way we at least get the desirable
close() properties that we both seem in favor of, and get this advanced
use case when we need it (and can figure out the locking design).


 But in any case, I think that approach is flawed.  DEASSIGN shouldn't be
 used as a mask in my opinion, and we shouldn't be reassigning a
 channel's meaning under the covers like that.  If this is in fact a
 valid use case, we should have a separate GSI_MASK type operation that
 is independent of irqfd.
  Likewise, we really should pass a new fd if
 the gsi-routing is changing.  Today there is a tight coupling of
 fd-to-gsi, and I think that makes sense to continue this association.

 -Greg

 

 I'm not arguing that this use-case is not theoretical. Just that if you
 don't create the fd to connect to GSI, you shouln't ask the user to
 destroy it to disconnect.

Well, thats just it.  Today, you *do* create the eventfd to bundle with
the gsi (take a look at my userspace patches..I posted some new ones
today).   The eventfd is returned after you specify the GSI via
kvm_irqfd().  Thats why I am arguing that it is natural for close() to
terminate the assignment.  To me, this is consistent with other
interfaces that return an fd (socket(), open(), etc).

That said, if we are going to support your proposal going forward, we
should probably change libkvm::kvm_irqfd() to take the fd as a
parameter, instead of returning it.


  Who knows what else this eventfd descriptor
 can be used for?
   

Perhaps, but you are exceeding the original design specifications of
irqfd as it is, so we can't really predict what 

Re: [KVM-RFC PATCH 2/2] kvm: use POLLHUP to close an irqfd instead of an explicit ioctl

2009-06-02 Thread Gregory Haskins
Davide Libenzi wrote:
 On Tue, 2 Jun 2009, Gregory Haskins wrote:

   
 @@ -64,12 +101,28 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int 
 sync, void *key)
  {
  struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait);
  
 -/*
 - * The wake_up is called with interrupts disabled.  Therefore we need
 - * to defer the IRQ injection until later since we need to acquire the
 - * kvm-lock to do so.
 - */
 -schedule_work(irqfd-work);
 +switch ((unsigned long)key) {
 +case POLLIN:
 +/*
 + * The POLLIN wake_up is called with interrupts disabled.
 + * Therefore we need to defer the IRQ injection until later
 + * since we need to acquire the kvm-lock to do so.
 + */
 +schedule_work(irqfd-inject);
 +break;
 +case POLLHUP:
 +/*
 + * The POLLHUP is called unlocked, so it theoretically should
 + * be safe to remove ourselves from the wqh
 + */
 +remove_wait_queue(irqfd-wqh, irqfd-wait);
 +flush_work(irqfd-inject);
 +irqfd_disconnect(irqfd);
 +
 +cleanup_srcu_struct(irqfd-srcu);
 +kfree(irqfd);
 +break;
 +}
 

 Since key is an event *bitmap*, you better be treating it as such. Do 
 not assume what eventfd delivers today (event bitmaps with only one bit 
 set). So this better be like:

   if (key  POLLIN) {
   ...
   }
   if (key  POLLHUP) {
   ...
   }


 - Davide


   
Indeed.  Thanks for catching that!

-Greg



signature.asc
Description: OpenPGP digital signature


Re: [KVM-RFC PATCH 2/2] kvm: use POLLHUP to close an irqfd instead of an explicit ioctl

2009-06-02 Thread Paul E. McKenney
On Tue, Jun 02, 2009 at 11:15:38AM -0400, Gregory Haskins wrote:
 Assigning an irqfd object to a kvm object creates a relationship that we
 currently manage by having the kvm oject acquire/hold a file* reference to
 the underlying eventfd.  The lifetime of these objects is properly maintained
 by decoupling the two objects whenever the irqfd is closed or kvm is closed,
 whichever comes first.
 
 However, the irqfd close method is less than ideal since it requires two
 system calls to complete (one for ioctl(kvmfd, IRQFD_DEASSIGN), the other for
 close(eventfd)).  This dual-call approach was utilized because there was no
 notification mechanism on the eventfd side at the time irqfd was implemented.
 
 Recently, Davide proposed a patch to send a POLLHUP wakeup whenever an
 eventfd is about to close.  So we eliminate the IRQFD_DEASSIGN ioctl (*)
 vector in favor of sensing the desassign automatically when the fd is closed.
 The resulting code is slightly more complex as a result since we need to
 allow either side to sever the relationship independently.  We utilize SRCU
 to guarantee stable concurrent access to the KVM pointer without adding
 additional atomic operations in the fast path.
 
 At minimum, this design should be acked by both Davide and Paul (cc'd).
 
 (*) The irqfd patch does not exist in any released tree, so the understanding
 is that we can alter the irqfd specific ABI without taking the normal
 precautions, such as CAP bits.

A few questions and suggestions interspersed below.

Thanx, Paul

 Signed-off-by: Gregory Haskins ghask...@novell.com
 CC: Davide Libenzi davi...@xmailserver.org
 CC: Michael S. Tsirkin m...@redhat.com
 CC: Paul E. McKenney paul...@linux.vnet.ibm.com
 ---
 
  include/linux/kvm.h |2 -
  virt/kvm/eventfd.c  |  177 
 +++
  virt/kvm/kvm_main.c |3 +
  3 files changed, 81 insertions(+), 101 deletions(-)
 
 diff --git a/include/linux/kvm.h b/include/linux/kvm.h
 index 632a856..29b62cc 100644
 --- a/include/linux/kvm.h
 +++ b/include/linux/kvm.h
 @@ -482,8 +482,6 @@ struct kvm_x86_mce {
  };
  #endif
 
 -#define KVM_IRQFD_FLAG_DEASSIGN (1  0)
 -
  struct kvm_irqfd {
   __u32 fd;
   __u32 gsi;
 diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
 index f3f2ea1..6ed62e2 100644
 --- a/virt/kvm/eventfd.c
 +++ b/virt/kvm/eventfd.c
 @@ -37,26 +37,63 @@
   * 
   */
  struct _irqfd {
 + struct mutex  lock;
 + struct srcu_structsrcu;
   struct kvm   *kvm;
   int   gsi;
 - struct file  *file;
   struct list_head  list;
   poll_tablept;
   wait_queue_head_t*wqh;
   wait_queue_t  wait;
 - struct work_structwork;
 + struct work_structinject;
  };
 
  static void
  irqfd_inject(struct work_struct *work)
  {
 - struct _irqfd *irqfd = container_of(work, struct _irqfd, work);
 - struct kvm *kvm = irqfd-kvm;
 + struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
 + struct kvm *kvm;
 + int idx;
 +
 + idx = srcu_read_lock(irqfd-srcu);
 +
 + kvm = rcu_dereference(irqfd-kvm);
 + if (kvm) {
 + mutex_lock(kvm-lock);
 + kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd-gsi, 1);
 + kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd-gsi, 0);
 + mutex_unlock(kvm-lock);
 + }
 +
 + srcu_read_unlock(irqfd-srcu, idx);
 +}
 +
 +static void
 +irqfd_disconnect(struct _irqfd *irqfd)
 +{
 + struct kvm *kvm;
 +
 + mutex_lock(irqfd-lock);
 +
 + kvm = rcu_dereference(irqfd-kvm);
 + rcu_assign_pointer(irqfd-kvm, NULL);
 +
 + mutex_unlock(irqfd-lock);
 +
 + if (!kvm)
 + return;
 
   mutex_lock(kvm-lock);
 - kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd-gsi, 1);
 - kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd-gsi, 0);
 + list_del(irqfd-list);
   mutex_unlock(kvm-lock);
 +
 + /*
 +  * It is important to not drop the kvm reference until the next grace
 +  * period because there might be lockless references in flight up
 +  * until then
 +  */

The lockless references are all harmless even if carried out after the
kvm structure has been removed?  Or does there need to be a -deleted
field that allows the lockless references to ignore a kvm structure that
has already been deleted?

On the other hand, if it really is somehow OK for kvm_set_irq() to be
called on an already-deleted kvm structure, then this code would be OK
as is.

 + synchronize_srcu(irqfd-srcu);
 + kvm_put_kvm(kvm);
  }
 
  static int
 @@ -64,12 +101,28 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int 
 sync, void *key)
  {
   struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait);
 
 - 

Re: [KVM-RFC PATCH 2/2] kvm: use POLLHUP to close an irqfd instead of an explicit ioctl

2009-06-02 Thread Gregory Haskins
Paul E. McKenney wrote:
 On Tue, Jun 02, 2009 at 11:15:38AM -0400, Gregory Haskins wrote:
   
 Assigning an irqfd object to a kvm object creates a relationship that we
 currently manage by having the kvm oject acquire/hold a file* reference to
 the underlying eventfd.  The lifetime of these objects is properly maintained
 by decoupling the two objects whenever the irqfd is closed or kvm is closed,
 whichever comes first.

 However, the irqfd close method is less than ideal since it requires two
 system calls to complete (one for ioctl(kvmfd, IRQFD_DEASSIGN), the other for
 close(eventfd)).  This dual-call approach was utilized because there was no
 notification mechanism on the eventfd side at the time irqfd was implemented.

 Recently, Davide proposed a patch to send a POLLHUP wakeup whenever an
 eventfd is about to close.  So we eliminate the IRQFD_DEASSIGN ioctl (*)
 vector in favor of sensing the desassign automatically when the fd is closed.
 The resulting code is slightly more complex as a result since we need to
 allow either side to sever the relationship independently.  We utilize SRCU
 to guarantee stable concurrent access to the KVM pointer without adding
 additional atomic operations in the fast path.

 At minimum, this design should be acked by both Davide and Paul (cc'd).

 (*) The irqfd patch does not exist in any released tree, so the understanding
 is that we can alter the irqfd specific ABI without taking the normal
 precautions, such as CAP bits.
 

 A few questions and suggestions interspersed below.

   Thanx, Paul
   

Thanks for the review, Paul.

(FYI: This isn't quite what I was asking you about on IRC yesterday, but
it's related...and the SRCU portion of the conversation *did* inspire me
here.  Just note that the stuff I was asking about is still forthcoming)

   
 Signed-off-by: Gregory Haskins ghask...@novell.com
 CC: Davide Libenzi davi...@xmailserver.org
 CC: Michael S. Tsirkin m...@redhat.com
 CC: Paul E. McKenney paul...@linux.vnet.ibm.com
 ---

  include/linux/kvm.h |2 -
  virt/kvm/eventfd.c  |  177 
 +++
  virt/kvm/kvm_main.c |3 +
  3 files changed, 81 insertions(+), 101 deletions(-)

 diff --git a/include/linux/kvm.h b/include/linux/kvm.h
 index 632a856..29b62cc 100644
 --- a/include/linux/kvm.h
 +++ b/include/linux/kvm.h
 @@ -482,8 +482,6 @@ struct kvm_x86_mce {
  };
  #endif

 -#define KVM_IRQFD_FLAG_DEASSIGN (1  0)
 -
  struct kvm_irqfd {
  __u32 fd;
  __u32 gsi;
 diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
 index f3f2ea1..6ed62e2 100644
 --- a/virt/kvm/eventfd.c
 +++ b/virt/kvm/eventfd.c
 @@ -37,26 +37,63 @@
   * 
   */
  struct _irqfd {
 +struct mutex  lock;
 +struct srcu_structsrcu;
  struct kvm   *kvm;
  int   gsi;
 -struct file  *file;
  struct list_head  list;
  poll_tablept;
  wait_queue_head_t*wqh;
  wait_queue_t  wait;
 -struct work_structwork;
 +struct work_structinject;
  };

  static void
  irqfd_inject(struct work_struct *work)
  {
 -struct _irqfd *irqfd = container_of(work, struct _irqfd, work);
 -struct kvm *kvm = irqfd-kvm;
 +struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
 +struct kvm *kvm;
 +int idx;
 +
 +idx = srcu_read_lock(irqfd-srcu);
 +
 +kvm = rcu_dereference(irqfd-kvm);
 +if (kvm) {
 +mutex_lock(kvm-lock);
 +kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd-gsi, 1);
 +kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd-gsi, 0);
 +mutex_unlock(kvm-lock);
 +}
 +
 +srcu_read_unlock(irqfd-srcu, idx);
 +}
 

[1]

 +
 +static void
 +irqfd_disconnect(struct _irqfd *irqfd)
 +{
 +struct kvm *kvm;
 +
 +mutex_lock(irqfd-lock);
 +
 +kvm = rcu_dereference(irqfd-kvm);
 +rcu_assign_pointer(irqfd-kvm, NULL);
 +
 +mutex_unlock(irqfd-lock);
 

[2]

 +
 +if (!kvm)
 +return;

  mutex_lock(kvm-lock);
 -kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd-gsi, 1);
 -kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd-gsi, 0);
 +list_del(irqfd-list);
  mutex_unlock(kvm-lock);
 +
 +/*
 + * It is important to not drop the kvm reference until the next grace
 + * period because there might be lockless references in flight up
 + * until then
 + */
 

 The lockless references are all harmless even if carried out after the
 kvm structure has been removed?

No, but all ([1]) references to my knowledge occur within an srcu
read-side CS, and we only drop the object reference ([3]) outside of
that CS by virtue of the synchronize_srcu() barrier below.  The one
notable exception is [2], which I don't declare as a read-side CS 

[PATCH 0/4] Provide kvm-free implementations of apic/ioapic

2009-06-02 Thread Glauber Costa
In the same way as the previous i8259 patch, this patch cleans up
the apic and ioapic code to provide an implementation that is kvm free.

This reduces the impact of kvm on normal qemu. Also, provides a simpler
code base for kvm devices.


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/4] always halt non-bsp cpu.

2009-06-02 Thread Glauber Costa
This is not kvm specific, and should do fine in plain qemu

Signed-off-by: Glauber Costa glom...@redhat.com
---
 hw/apic.c |3 +--
 1 files changed, 1 insertions(+), 2 deletions(-)

diff --git a/hw/apic.c b/hw/apic.c
index 86aa6b6..2eddba0 100644
--- a/hw/apic.c
+++ b/hw/apic.c
@@ -467,8 +467,7 @@ static void apic_init_ipi(APICState *s)
 
 cpu_reset(s-cpu_env);
 
-if (!(s-apicbase  MSR_IA32_APICBASE_BSP) 
-(!kvm_enabled() || !qemu_kvm_irqchip_in_kernel()))
+if (!(s-apicbase  MSR_IA32_APICBASE_BSP))
 s-cpu_env-halted = 1;
 
 if (kvm_enabled()  !qemu_kvm_irqchip_in_kernel())
-- 
1.5.6.6

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/4] sipi and init: move common code

2009-06-02 Thread Glauber Costa
provide functions to query and reset the state of sipi and
init in cpu's apic. This way we can move the kvm specific functions
out of the apic path.

Signed-off-by: Glauber Costa glom...@redhat.com
---
 cpu-defs.h |2 --
 hw/apic.c  |   49 -
 qemu-kvm.c |   26 ++
 qemu-kvm.h |7 +--
 4 files changed, 55 insertions(+), 29 deletions(-)

diff --git a/cpu-defs.h b/cpu-defs.h
index 1e071e7..e66e928 100644
--- a/cpu-defs.h
+++ b/cpu-defs.h
@@ -140,8 +140,6 @@ typedef struct CPUWatchpoint {
 struct qemu_work_item;
 
 struct KVMCPUState {
-int sipi_needed;
-int init;
 pthread_t thread;
 int signalled;
 int stop;
diff --git a/hw/apic.c b/hw/apic.c
index 2eddba0..c93cbb3 100644
--- a/hw/apic.c
+++ b/hw/apic.c
@@ -86,6 +86,8 @@ typedef struct APICState {
 uint32_t initial_count;
 int64_t initial_count_load_time, next_time;
 QEMUTimer *timer;
+int sipi_needed;
+int init;
 } APICState;
 
 static int apic_io_memory;
@@ -443,6 +445,45 @@ static void apic_get_delivery_bitmask(uint32_t 
*deliver_bitmask,
 }
 }
 
+int apic_init_received(CPUState *env)
+{
+if (!env)
+return 0;
+if (!env-apic_state)
+return 0;
+
+return env-apic_state-init;
+}
+
+int apic_sipi_needed(CPUState *env)
+{
+if (!env)
+return 0;
+if (!env-apic_state)
+return 0;
+
+return env-apic_state-sipi_needed;
+}
+
+void apic_reset_sipi(CPUState *env)
+{
+if (!env)
+return;
+if (!env-apic_state)
+return;
+
+env-apic_state-sipi_needed = 0;
+}
+
+void apic_reset_init(CPUState *env)
+{
+if (!env)
+return;
+if (!env-apic_state)
+return;
+
+env-apic_state-init = 0;
+}
 
 static void apic_init_ipi(APICState *s)
 {
@@ -470,9 +511,8 @@ static void apic_init_ipi(APICState *s)
 if (!(s-apicbase  MSR_IA32_APICBASE_BSP))
 s-cpu_env-halted = 1;
 
-if (kvm_enabled()  !qemu_kvm_irqchip_in_kernel())
-   if (s-cpu_env)
-   kvm_apic_init(s-cpu_env);
+if (s-cpu_env  s-cpu_env-cpu_index != 0)
+   s-init = 1;
 }
 
 /* send a SIPI message to the CPU to start it */
@@ -485,8 +525,7 @@ static void apic_startup(APICState *s, int vector_num)
 cpu_x86_load_seg_cache(env, R_CS, vector_num  8, vector_num  12,
0x, 0);
 env-halted = 0;
-if (kvm_enabled()  !qemu_kvm_irqchip_in_kernel())
-   kvm_update_after_sipi(env);
+s-sipi_needed = 1;
 }
 
 static void apic_deliver(APICState *s, uint8_t dest, uint8_t dest_mode,
diff --git a/qemu-kvm.c b/qemu-kvm.c
index 68d3b92..1441cef 100644
--- a/qemu-kvm.c
+++ b/qemu-kvm.c
@@ -134,19 +134,6 @@ void kvm_update_interrupt_request(CPUState *env)
 }
 }
 
-void kvm_update_after_sipi(CPUState *env)
-{
-env-kvm_cpu_state.sipi_needed = 1;
-kvm_update_interrupt_request(env);
-}
-
-void kvm_apic_init(CPUState *env)
-{
-if (env-cpu_index != 0)
-   env-kvm_cpu_state.init = 1;
-kvm_update_interrupt_request(env);
-}
-
 #include signal.h
 
 static int try_push_interrupts(void *opaque)
@@ -332,7 +319,7 @@ static void kvm_vm_state_change_handler(void *context, int 
running, int reason)
 static void update_regs_for_sipi(CPUState *env)
 {
 kvm_arch_update_regs_for_sipi(env);
-env-kvm_cpu_state.sipi_needed = 0;
+apic_reset_sipi(env);
 }
 
 static void update_regs_for_init(CPUState *env)
@@ -345,11 +332,10 @@ static void update_regs_for_init(CPUState *env)
 
 #ifdef TARGET_I386
 /* restore SIPI vector */
-if(env-kvm_cpu_state.sipi_needed)
+if (apic_sipi_needed(env))
 env-segs[R_CS] = cs;
 #endif
-
-env-kvm_cpu_state.init = 0;
+apic_reset_init(env);
 kvm_arch_load_regs(env);
 }
 
@@ -407,12 +393,12 @@ static int kvm_main_loop_cpu(CPUState *env)
if (env-interrupt_request  (CPU_INTERRUPT_HARD | CPU_INTERRUPT_NMI))
env-halted = 0;
 if (!kvm_irqchip_in_kernel(kvm_context)) {
-   if (env-kvm_cpu_state.init)
+   if (apic_init_received(env))
update_regs_for_init(env);
-   if (env-kvm_cpu_state.sipi_needed)
+   if (apic_sipi_needed(env))
update_regs_for_sipi(env);
 }
-   if (!env-halted  !env-kvm_cpu_state.init)
+   if (!env-halted)
kvm_cpu_exec(env);
env-exit_request = 0;
 env-exception_index = EXCP_INTERRUPT;
diff --git a/qemu-kvm.h b/qemu-kvm.h
index 725589b..eb7dc29 100644
--- a/qemu-kvm.h
+++ b/qemu-kvm.h
@@ -31,9 +31,13 @@ void kvm_remove_all_breakpoints(CPUState *current_env);
 int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap);
 int kvm_qemu_init_env(CPUState *env);
 int kvm_qemu_check_extension(int ext);
-void kvm_apic_init(CPUState *env);
+/* FIXME: there should be an apic.h file */
 /* called from vcpu initialization */
 void qemu_kvm_load_lapic(CPUState *env);
+int apic_init_received(CPUState *env);
+int apic_sipi_needed(CPUState 

[PATCH 3/4] provide a kvm-free implementation of apic

2009-06-02 Thread Glauber Costa
Also, provide a kvm_apic that does not depend highly on common code.

Signed-off-by: Glauber Costa glom...@redhat.com
---
 hw/apic.c  |  249 ++--
 hw/pc.c|7 ++-
 hw/pc.h|1 +
 qemu-kvm-x86.c |5 +-
 4 files changed, 162 insertions(+), 100 deletions(-)

diff --git a/hw/apic.c b/hw/apic.c
index c93cbb3..d7af08f 100644
--- a/hw/apic.c
+++ b/hw/apic.c
@@ -852,103 +852,11 @@ static void apic_mem_writel(void *opaque, 
target_phys_addr_t addr, uint32_t val)
 }
 }
 
-#ifdef KVM_CAP_IRQCHIP
-
-static inline uint32_t kapic_reg(struct kvm_lapic_state *kapic, int reg_id)
-{
-return *((uint32_t *) (kapic-regs + (reg_id  4)));
-}
-
-static inline void kapic_set_reg(struct kvm_lapic_state *kapic,
- int reg_id, uint32_t val)
-{
-*((uint32_t *) (kapic-regs + (reg_id  4))) = val;
-}
-
-static void kvm_kernel_lapic_save_to_user(APICState *s)
-{
-struct kvm_lapic_state apic;
-struct kvm_lapic_state *kapic = apic;
-int i, v;
-
-kvm_get_lapic(kvm_context, s-cpu_env-cpu_index, kapic);
-
-s-id = kapic_reg(kapic, 0x2)  24;
-s-tpr = kapic_reg(kapic, 0x8);
-s-arb_id = kapic_reg(kapic, 0x9);
-s-log_dest = kapic_reg(kapic, 0xd)  24;
-s-dest_mode = kapic_reg(kapic, 0xe)  28;
-s-spurious_vec = kapic_reg(kapic, 0xf);
-for (i = 0; i  8; i++) {
-s-isr[i] = kapic_reg(kapic, 0x10 + i);
-s-tmr[i] = kapic_reg(kapic, 0x18 + i);
-s-irr[i] = kapic_reg(kapic, 0x20 + i);
-}
-s-esr = kapic_reg(kapic, 0x28);
-s-icr[0] = kapic_reg(kapic, 0x30);
-s-icr[1] = kapic_reg(kapic, 0x31);
-for (i = 0; i  APIC_LVT_NB; i++)
-   s-lvt[i] = kapic_reg(kapic, 0x32 + i);
-s-initial_count = kapic_reg(kapic, 0x38);
-s-divide_conf = kapic_reg(kapic, 0x3e);
-
-v = (s-divide_conf  3) | ((s-divide_conf  1)  4);
-s-count_shift = (v + 1)  7;
-
-s-initial_count_load_time = qemu_get_clock(vm_clock);
-apic_timer_update(s, s-initial_count_load_time);
-}
-
-static void kvm_kernel_lapic_load_from_user(APICState *s)
-{
-struct kvm_lapic_state apic;
-struct kvm_lapic_state *klapic = apic;
-int i;
-
-memset(klapic, 0, sizeof apic);
-kapic_set_reg(klapic, 0x2, s-id  24);
-kapic_set_reg(klapic, 0x8, s-tpr);
-kapic_set_reg(klapic, 0xd, s-log_dest  24);
-kapic_set_reg(klapic, 0xe, s-dest_mode  28 | 0x0fff);
-kapic_set_reg(klapic, 0xf, s-spurious_vec);
-for (i = 0; i  8; i++) {
-kapic_set_reg(klapic, 0x10 + i, s-isr[i]);
-kapic_set_reg(klapic, 0x18 + i, s-tmr[i]);
-kapic_set_reg(klapic, 0x20 + i, s-irr[i]);
-}
-kapic_set_reg(klapic, 0x28, s-esr);
-kapic_set_reg(klapic, 0x30, s-icr[0]);
-kapic_set_reg(klapic, 0x31, s-icr[1]);
-for (i = 0; i  APIC_LVT_NB; i++)
-kapic_set_reg(klapic, 0x32 + i, s-lvt[i]);
-kapic_set_reg(klapic, 0x38, s-initial_count);
-kapic_set_reg(klapic, 0x3e, s-divide_conf);
-
-kvm_set_lapic(kvm_context, s-cpu_env-cpu_index, klapic);
-}
-
-#endif
-
-void qemu_kvm_load_lapic(CPUState *env)
-{
-#ifdef KVM_CAP_IRQCHIP
-if (kvm_enabled()  kvm_vcpu_inited(env)  qemu_kvm_irqchip_in_kernel()) 
{
-kvm_kernel_lapic_load_from_user(env-apic_state);
-}
-#endif
-}
-
-static void apic_save(QEMUFile *f, void *opaque)
+static void apic_save_common(QEMUFile *f, void *opaque)
 {
 APICState *s = opaque;
 int i;
 
-#ifdef KVM_CAP_IRQCHIP
-if (kvm_enabled()  qemu_kvm_irqchip_in_kernel()) {
-kvm_kernel_lapic_save_to_user(s);
-}
-#endif
-
 qemu_put_be32s(f, s-apicbase);
 qemu_put_8s(f, s-id);
 qemu_put_8s(f, s-arb_id);
@@ -976,7 +884,12 @@ static void apic_save(QEMUFile *f, void *opaque)
 qemu_put_timer(f, s-timer);
 }
 
-static int apic_load(QEMUFile *f, void *opaque, int version_id)
+static void apic_save(QEMUFile *f, void *opaque)
+{
+apic_save_common(f, opaque);
+}
+
+static int apic_load_common(QEMUFile *f, void *opaque, int version_id)
 {
 APICState *s = opaque;
 int i;
@@ -1012,12 +925,15 @@ static int apic_load(QEMUFile *f, void *opaque, int 
version_id)
 if (version_id = 2)
 qemu_get_timer(f, s-timer);
 
-qemu_kvm_load_lapic(s-cpu_env);
-
 return 0;
 }
 
-static void apic_reset(void *opaque)
+static int apic_load(QEMUFile *f, void *opaque, int version_id)
+{
+return apic_load_common(f, opaque, version_id);
+}
+
+static void apic_reset_common(void *opaque)
 {
 APICState *s = opaque;
 
@@ -1034,7 +950,11 @@ static void apic_reset(void *opaque)
  */
 s-lvt[APIC_LVT_LINT0] = 0x700;
 }
-qemu_kvm_load_lapic(s-cpu_env);
+}
+
+static void apic_reset(void *opaque)
+{
+apic_reset_common(opaque);
 }
 
 static CPUReadMemoryFunc *apic_mem_read[3] = {
@@ -1081,3 +1001,136 @@ int apic_init(CPUState *env)
 return 0;
 }
 
+#ifdef KVM_CAP_IRQCHIP
+
+static inline uint32_t kapic_reg(struct kvm_lapic_state *kapic, int reg_id)
+{
+

[PATCH 4/4] provide a kvm-free implementation of ioapic

2009-06-02 Thread Glauber Costa
Also, provide a kvm_ioapic that does not depend highly on common code.

Signed-off-by: Glauber Costa glom...@redhat.com
---
 hw/ioapic.c |  162 +-
 hw/pc.c |7 ++-
 hw/pc.h |1 +
 3 files changed, 110 insertions(+), 60 deletions(-)

diff --git a/hw/ioapic.c b/hw/ioapic.c
index 6c178c7..3eb5d5f 100644
--- a/hw/ioapic.c
+++ b/hw/ioapic.c
@@ -198,58 +198,11 @@ static void ioapic_mem_writel(void *opaque, 
target_phys_addr_t addr, uint32_t va
 }
 }
 
-static void kvm_kernel_ioapic_save_to_user(IOAPICState *s)
-{
-#if defined(KVM_CAP_IRQCHIP)  defined(TARGET_I386)
-struct kvm_irqchip chip;
-struct kvm_ioapic_state *kioapic;
-int i;
-
-chip.chip_id = KVM_IRQCHIP_IOAPIC;
-kvm_get_irqchip(kvm_context, chip);
-kioapic = chip.chip.ioapic;
-
-s-id = kioapic-id;
-s-ioregsel = kioapic-ioregsel;
-s-base_address = kioapic-base_address;
-s-irr = kioapic-irr;
-for (i = 0; i  IOAPIC_NUM_PINS; i++) {
-s-ioredtbl[i] = kioapic-redirtbl[i].bits;
-}
-#endif
-}
-
-static void kvm_kernel_ioapic_load_from_user(IOAPICState *s)
-{
-#if defined(KVM_CAP_IRQCHIP)  defined(TARGET_I386)
-struct kvm_irqchip chip;
-struct kvm_ioapic_state *kioapic;
-int i;
-
-chip.chip_id = KVM_IRQCHIP_IOAPIC;
-kioapic = chip.chip.ioapic;
-
-kioapic-id = s-id;
-kioapic-ioregsel = s-ioregsel;
-kioapic-base_address = s-base_address;
-kioapic-irr = s-irr;
-for (i = 0; i  IOAPIC_NUM_PINS; i++) {
-kioapic-redirtbl[i].bits = s-ioredtbl[i];
-}
-
-kvm_set_irqchip(kvm_context, chip);
-#endif
-}
-
-static void ioapic_save(QEMUFile *f, void *opaque)
+static void ioapic_save_common(QEMUFile *f, void *opaque)
 {
 IOAPICState *s = opaque;
 int i;
 
-if (kvm_enabled()  qemu_kvm_irqchip_in_kernel()) {
-kvm_kernel_ioapic_save_to_user(s);
-}
-
 qemu_put_8s(f, s-id);
 qemu_put_8s(f, s-ioregsel);
 qemu_put_be64s(f, s-base_address);
@@ -259,7 +212,12 @@ static void ioapic_save(QEMUFile *f, void *opaque)
 }
 }
 
-static int ioapic_load(QEMUFile *f, void *opaque, int version_id)
+static void ioapic_save(QEMUFile *f, void *opaque)
+{
+ioapic_save_common(f, opaque);
+}
+
+static int ioapic_load_common(QEMUFile *f, void *opaque, int version_id)
 {
 IOAPICState *s = opaque;
 int i;
@@ -283,14 +241,15 @@ static int ioapic_load(QEMUFile *f, void *opaque, int 
version_id)
 qemu_get_be64s(f, s-ioredtbl[i]);
 }
 
-if (kvm_enabled()  qemu_kvm_irqchip_in_kernel()) {
-kvm_kernel_ioapic_load_from_user(s);
-}
-
 return 0;
 }
 
-static void ioapic_reset(void *opaque)
+static int ioapic_load(QEMUFile *f, void *opaque, int version_id)
+{
+return ioapic_load_common(f, opaque, version_id);
+}
+
+static void ioapic_reset_common(void *opaque)
 {
 IOAPICState *s = opaque;
 int i;
@@ -299,11 +258,11 @@ static void ioapic_reset(void *opaque)
 s-base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
 for(i = 0; i  IOAPIC_NUM_PINS; i++)
 s-ioredtbl[i] = 1  16; /* mask LVT */
-#ifdef KVM_CAP_IRQCHIP
-if (kvm_enabled()  qemu_kvm_irqchip_in_kernel()) {
-kvm_kernel_ioapic_load_from_user(s);
-}
-#endif
+}
+
+static void ioapic_reset(void *opaque)
+{
+ioapic_reset_common(opaque);
 }
 
 static CPUReadMemoryFunc *ioapic_mem_read[3] = {
@@ -335,3 +294,88 @@ IOAPICState *ioapic_init(void)
 
 return s;
 }
+
+#ifdef KVM_CAP_IRQCHIP
+static void kvm_kernel_ioapic_save_to_user(IOAPICState *s)
+{
+#if defined(TARGET_I386)
+struct kvm_irqchip chip;
+struct kvm_ioapic_state *kioapic;
+int i;
+
+chip.chip_id = KVM_IRQCHIP_IOAPIC;
+kvm_get_irqchip(kvm_context, chip);
+kioapic = chip.chip.ioapic;
+
+s-id = kioapic-id;
+s-ioregsel = kioapic-ioregsel;
+s-base_address = kioapic-base_address;
+s-irr = kioapic-irr;
+for (i = 0; i  IOAPIC_NUM_PINS; i++) {
+s-ioredtbl[i] = kioapic-redirtbl[i].bits;
+}
+#endif
+}
+
+static void kvm_kernel_ioapic_load_from_user(IOAPICState *s)
+{
+#if defined(TARGET_I386)
+struct kvm_irqchip chip;
+struct kvm_ioapic_state *kioapic;
+int i;
+
+chip.chip_id = KVM_IRQCHIP_IOAPIC;
+kioapic = chip.chip.ioapic;
+
+kioapic-id = s-id;
+kioapic-ioregsel = s-ioregsel;
+kioapic-base_address = s-base_address;
+kioapic-irr = s-irr;
+for (i = 0; i  IOAPIC_NUM_PINS; i++) {
+kioapic-redirtbl[i].bits = s-ioredtbl[i];
+}
+
+kvm_set_irqchip(kvm_context, chip);
+#endif
+}
+
+static void kvm_ioapic_save(QEMUFile *f, void *opaque)
+{
+IOAPICState *s = opaque;
+kvm_kernel_ioapic_save_to_user(s);
+
+ioapic_save_common(f, opaque);
+}
+
+static int kvm_ioapic_load(QEMUFile *f, void *opaque, int version_id)
+{
+IOAPICState *s = opaque;
+int r = ioapic_load_common(f, opaque, version_id);
+
+if (r == 0)
+kvm_kernel_ioapic_load_from_user(s);
+return r;
+}
+
+
+static void 

Re: [PATCH 1/4] always halt non-bsp cpu.

2009-06-02 Thread Jan Kiszka
Glauber Costa wrote:
 This is not kvm specific, and should do fine in plain qemu

This is fine with plain qemu already. The problem, IIUC, is that
in-kernel kvm irqchip does not have a chance to remove the halted state
again. Did you test the effect of this patch on that scenario? What
makes it safe to be removed now?

 
 Signed-off-by: Glauber Costa glom...@redhat.com
 ---
  hw/apic.c |3 +--
  1 files changed, 1 insertions(+), 2 deletions(-)
 
 diff --git a/hw/apic.c b/hw/apic.c
 index 86aa6b6..2eddba0 100644
 --- a/hw/apic.c
 +++ b/hw/apic.c
 @@ -467,8 +467,7 @@ static void apic_init_ipi(APICState *s)
  
  cpu_reset(s-cpu_env);
  
 -if (!(s-apicbase  MSR_IA32_APICBASE_BSP) 
 -(!kvm_enabled() || !qemu_kvm_irqchip_in_kernel()))
 +if (!(s-apicbase  MSR_IA32_APICBASE_BSP))
  s-cpu_env-halted = 1;
  
  if (kvm_enabled()  !qemu_kvm_irqchip_in_kernel())

Jan



signature.asc
Description: OpenPGP digital signature


Windows Server 2008 VM performance

2009-06-02 Thread Andrew Theurer
I've been looking at how KVM handles windows guests, and I am a little 
concerned with the CPU overhead.  My test case is as follows:


I am running 4 instances of a J2EE benchmark.  Each instance needs one 
application server and one DB server.  8 VMs in total are used.


I have the same App and DB software for Linux and Windows (and same 
versions) so I can compare between Linux and Windows.  I also have 
another hypervisor which I can test both Windows and Linux VMs.


The host has EPT capable processors.  VMs in KVM are backed with large 
pages.



Test results:

ConfigCPU utilization
---
KVM-85
  Windows Server 2008 64-bit VMs 44.84
  RedHat 5.3 w/ 2.6.29 64-bit VMs24.56
Other-Hypervisor
  Windows Server 2008 64-bit VMs 30.63
  RedHat 5.3 w/ 2.6.18 64-bit VMs27.13

-KVM running Windows VMs uses 46% more CPU than the Other-Hypervisor
-The Other-Hypervisor provides an optimized virtual network driver
-KVM results listed above did not use virtio_net or virtio_disk for 
Windows, but do for Linux
-One extra KVM run (not listed above) was made with virtio_net for 
Windows VMs but only reduced CPU by 2%
-Most of the CPU overhead could be attributed to the DB VMs, where there 
is about 5 MB/sec writes per VM

-I don't have a virtio_block driver for Windows to test.  Does one exist?
-All tests above had 2 vCPUS per VM


Here's a comparison of kvm_stat between Windows (run1) and Linux (run2):

run1  run2run1/run2
  -
efer_relo:  0 0 1
exits:1206880121916 9.899
fpu_reloa: 210969 2086310.112
halt_exit:  15092 13222 1.141
halt_wake:  14466  9294 1.556
host_stat: 211066 45117 4.678
hypercall:  0 0 1
insn_emul: 119582 38126 3.136
insn_emul:  0 0 1
invlpg   :  0 0 1
io_exits : 131051 26349 4.974
irq_exits:   8128 12937 0.628
irq_injec:  29955 21825 1.373
irq_windo:   2504  2022 1.238
kvm_reque:  0 0 1
largepage:  164 0.009
mmio_exit:  59224 0   Inf
mmu_cache:  0 3 0.000
mmu_flood:  0 0 1
mmu_pde_z:  0 0 1
mmu_pte_u:  0 0 1
mmu_pte_w:  0 0 1
mmu_recyc:  0 0 1
mmu_shado:  0 0 1
mmu_unsyn:  0 0 1
mmu_unsyn:  0 0 1
nmi_injec:  0 0 1
nmi_windo:  0 0 1
pf_fixed :  167 0.009
pf_guest :  0 0 1
remote_tl:  0 0 1
request_n:  0 0 1
signal_ex:  0 0 1
tlb_flush:220 14037 0.016


10x the number of exits, a problem?

I happened to try just one vCPU per VM for KVM/Windows VMs, and I was 
surprised how much of a difference it made:


Config   CPU utilization
--   -
KVM-85
  Windows Server 2008 64-bit VMs, 2 vCPU per VM 44.84
  Windows Server 2008 64-bit VMs, 1 vCPU per VM 36.44

A 19% reduction in CPU utilization vs KVM/Windows-2vCPU!  Does not 
explain all the overhead (vs Other-Hypervisor, 2 vCPUs per VM) but, that 
sure seems like a lot between 1 to 2 vCPUs for KVM/Windows-VMs.  I have 
not run with 1 vCPU per VM with Other-Hypervisor, but I will soon.  
Anyway, I also collected kvm_stat for the 1 vCPU case, and here it is 
compared to KVM/Linux VMs with 2 vCPUs:


run1  run2run1/run2
  -
efer_relo:  0 0 1
exits:1184471121916 9.715
fpu_reloa: 192766 20863 9.240
halt_exit:   4697 13222 0.355
halt_wake:   4360  9294 0.469
host_stat: 192828 45117 4.274
hypercall:  0 0 1
insn_emul: 130487 38126 3.422
insn_emul:  0 0 1
invlpg   :  0 0 1
io_exits : 114430 26349 4.343
irq_exits:   7075 12937 0.547
irq_injec:  29930 21825 1.371
irq_windo:   2391  2022 1.182
kvm_reque:  0 0 1
largepage:  064 0.001
mmio_exit:  69028 0   Inf
mmu_cache:  0   

Re: [PATCH 1/4] always halt non-bsp cpu.

2009-06-02 Thread Glauber Costa
On Tue, Jun 02, 2009 at 10:35:47PM +0200, Jan Kiszka wrote:
 Glauber Costa wrote:
  This is not kvm specific, and should do fine in plain qemu
 
 This is fine with plain qemu already. The problem, IIUC, is that
 in-kernel kvm irqchip does not have a chance to remove the halted state
 again. Did you test the effect of this patch on that scenario? What
 makes it safe to be removed now?
IIRC, the in kernel irqchip sets halted = 0 in the very beginning of
the vcpu initialization.

It is tested here with in-kernel irqchip and works, so probably not
a problem, unless you can spot something.


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Windows Server 2008 VM performance

2009-06-02 Thread Avi Kivity

Andrew Theurer wrote:
I've been looking at how KVM handles windows guests, and I am a little 
concerned with the CPU overhead.  My test case is as follows:


I am running 4 instances of a J2EE benchmark.  Each instance needs one 
application server and one DB server.  8 VMs in total are used.


I have the same App and DB software for Linux and Windows (and same 
versions) so I can compare between Linux and Windows.  I also have 
another hypervisor which I can test both Windows and Linux VMs.


The host has EPT capable processors.  VMs in KVM are backed with large 
pages.



Test results:

ConfigCPU utilization
---
KVM-85
  Windows Server 2008 64-bit VMs 44.84
  RedHat 5.3 w/ 2.6.29 64-bit VMs24.56
Other-Hypervisor
  Windows Server 2008 64-bit VMs 30.63
  RedHat 5.3 w/ 2.6.18 64-bit VMs27.13

-KVM running Windows VMs uses 46% more CPU than the Other-Hypervisor
-The Other-Hypervisor provides an optimized virtual network driver
-KVM results listed above did not use virtio_net or virtio_disk for 
Windows, but do for Linux
-One extra KVM run (not listed above) was made with virtio_net for 
Windows VMs but only reduced CPU by 2%


I think the publicly available driver is pretty old and unoptimized.

-Most of the CPU overhead could be attributed to the DB VMs, where 
there is about 5 MB/sec writes per VM

-I don't have a virtio_block driver for Windows to test.  Does one exist?


Exists, not public though.


-All tests above had 2 vCPUS per VM


Here's a comparison of kvm_stat between Windows (run1) and Linux (run2):

run1  run2run1/run2
  -
efer_relo:  0 0 1
exits:1206880121916 9.899


total exits is the prime measure of course.


fpu_reloa: 210969 2086310.112
halt_exit:  15092 13222 1.141
halt_wake:  14466  9294 1.556
host_stat: 211066 45117 4.678


host state reloads measure (approximately) exits to userspace, likely 
due to the unoptimized drivers.



hypercall:  0 0 1
insn_emul: 119582 38126 3.136


again lack of drivers


insn_emul:  0 0 1
invlpg   :  0 0 1
io_exits : 131051 26349 4.974


ditto


irq_exits:   8128 12937 0.628
irq_injec:  29955 21825 1.373
irq_windo:   2504  2022 1.238
kvm_reque:  0 0 1
largepage:  164 0.009
mmio_exit:  59224 0   Inf


wow, linux avoids mmio completely.  good.




10x the number of exits, a problem?


_the_ problem.



I happened to try just one vCPU per VM for KVM/Windows VMs, and I was 
surprised how much of a difference it made:


Config   CPU utilization
--   -
KVM-85
  Windows Server 2008 64-bit VMs, 2 vCPU per VM 44.84
  Windows Server 2008 64-bit VMs, 1 vCPU per VM 36.44

A 19% reduction in CPU utilization vs KVM/Windows-2vCPU!  Does not 
explain all the overhead (vs Other-Hypervisor, 2 vCPUs per VM) but, 
that sure seems like a lot between 1 to 2 vCPUs for KVM/Windows-VMs.  


Inter-process communication is expensive.  Marcelo added some 
optimizations (the sending vcpu used to wait for the target vcpu, now 
they don't).  They're in kvm-86 (of course).  You'll need 2.6.26+ on the 
host for them to take effect (of course, for many features, the more 
recent the host the faster kvm on top can run).


Windows 2008 also implements some hypervisor accelerations which are 
especially useful on smp.  Gleb has started some work on this.  I don't 
know if other_hypervisor implements them.


Finally, smp is expensive! data moves across caches, processors wait for 
each other, etc.


I have not run with 1 vCPU per VM with Other-Hypervisor, but I will 
soon.  Anyway, I also collected kvm_stat for the 1 vCPU case, and here 
it is compared to KVM/Linux VMs with 2 vCPUs:


run1  run2run1/run2
  -
efer_relo:  0 0 1
exits:1184471121916 9.715

Still see the huge difference in vm_exits, so I guess not all is great 
yet.


Yeah, exit rate stayed the same, so it's probably IPC costs and 
intrinsic smp costs.



So, what would be the best course of action for this?



Is there a virtio_block driver to test?  


There is, but it isn't available yet.

Can we find the root cause of the exits (is there a way to get stack 
dump or something that can show where there are coming from)?


Marcelo is working on a super-duper easy to use kvm trace which can show 
what's going on.  The old one is reasonably easy 

[patch 2/4] KVM: MMU audit: update audit_write_protection

2009-06-02 Thread Marcelo Tosatti
- Unsync pages contain writable sptes in the rmap.
- rmaps do not exclusively contain writable sptes anymore.

Signed-off-by: Marcelo Tosatti mtosa...@redhat.com

Index: kvm/arch/x86/kvm/mmu.c
===
--- kvm.orig/arch/x86/kvm/mmu.c
+++ kvm/arch/x86/kvm/mmu.c
@@ -3228,20 +3228,28 @@ static void audit_write_protection(struc
struct kvm_mmu_page *sp;
struct kvm_memory_slot *slot;
unsigned long *rmapp;
+   u64 *spte;
gfn_t gfn;
 
list_for_each_entry(sp, vcpu-kvm-arch.active_mmu_pages, link) {
if (sp-role.direct)
continue;
+   if (sp-unsync)
+   continue;
 
gfn = unalias_gfn(vcpu-kvm, sp-gfn);
slot = gfn_to_memslot_unaliased(vcpu-kvm, sp-gfn);
rmapp = slot-rmap[gfn - slot-base_gfn];
-   if (*rmapp)
-   printk(KERN_ERR %s: (%s) shadow page has writable
-   mappings: gfn %lx role %x\n,
+
+   spte = rmap_next(vcpu-kvm, rmapp, NULL);
+   while (spte) {
+   if (*spte  PT_WRITABLE_MASK)
+   printk(KERN_ERR %s: (%s) shadow page has 
+   writable mappings: gfn %lx role %x\n,
   __func__, audit_msg, sp-gfn,
   sp-role.word);
+   spte = rmap_next(vcpu-kvm, rmapp, spte);
+   }
}
 }
 

-- 

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 3/4] KVM: MMU audit: nontrapping ptes in nonleaf level

2009-06-02 Thread Marcelo Tosatti
update_pte sets nonleaf sptes as notrap, so either it or the audit code
is broken.

Choose the audit code.

Signed-off-by: Marcelo Tosatti mtosa...@redhat.com

Index: kvm/arch/x86/kvm/mmu.c
===
--- kvm.orig/arch/x86/kvm/mmu.c
+++ kvm/arch/x86/kvm/mmu.c
@@ -3081,12 +3081,7 @@ static void audit_mappings_page(struct k
 
va = canonicalize(va);
if (level  1) {
-   if (ent == shadow_notrap_nonpresent_pte)
-   printk(KERN_ERR audit: (%s) nontrapping pte
-   in nonleaf level: levels %d gva %lx
-   level %d pte %llx\n, audit_msg,
-  vcpu-arch.mmu.root_level, va, level, 
ent);
-   else
+   if (is_shadow_present_pte(ent))
audit_mappings_page(vcpu, ent, va, level - 1);
} else {
gpa_t gpa = vcpu-arch.mmu.gva_to_gpa(vcpu, va);

-- 

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 0/4] mmu audit update

2009-06-02 Thread Marcelo Tosatti
Some updates to the MMU audit code.

The third patch is guessy because I could not find the notrap spte
documentation, all I can see is the page-fault error code mask and match
fields in the VMCS, but can't see the link of that to sptes. Can someone
point it out please?

-- 

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 4/4] KVM: MMU audit: audit_mappings tweaks

2009-06-02 Thread Marcelo Tosatti
- Fail early in case gfn_to_pfn returns is_error_pfn.
- For the pre pte write case, avoid spurious gva is valid but spte is notrap 
  messages (the emulation code does the guest write first, so this particular
  case is OK).

Signed-off-by: Marcelo Tosatti mtosa...@redhat.com

Index: kvm/arch/x86/kvm/mmu.c
===
--- kvm.orig/arch/x86/kvm/mmu.c
+++ kvm/arch/x86/kvm/mmu.c
@@ -3089,6 +3089,11 @@ static void audit_mappings_page(struct k
pfn_t pfn = gfn_to_pfn(vcpu-kvm, gfn);
hpa_t hpa = (hpa_t)pfn  PAGE_SHIFT;
 
+   if (is_error_pfn(pfn)) {
+   kvm_release_pfn_clean(pfn);
+   continue;
+   }
+
if (is_shadow_present_pte(ent)
 (ent  PT64_BASE_ADDR_MASK) != hpa)
printk(KERN_ERR xx audit error: (%s) levels %d
@@ -3256,7 +3261,8 @@ static void kvm_mmu_audit(struct kvm_vcp
audit_msg = msg;
audit_rmap(vcpu);
audit_write_protection(vcpu);
-   audit_mappings(vcpu);
+   if (strcmp(pre pte write, audit_msg))
+   audit_mappings(vcpu);
audit_writable_sptes_have_rmaps(vcpu);
dbg = olddbg;
 }

-- 

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch 1/4] KVM: MMU audit: update count_writable_mappings / count_rmaps

2009-06-02 Thread Marcelo Tosatti
Under testing, count_writable_mappings returns a value that is 2 integers
larger than what count_rmaps returns.

Suspicion is that either of the two functions is counting a duplicate (either
positively or negatively). 

Modifying check_writable_mappings_rmap to check for rmap existance on
all present MMU pages fails to trigger an error, which should keep Avi
happy.

Also introduce mmu_spte_walk to invoke a callback on all present sptes visible
to the current vcpu, might be useful in the future.

Signed-off-by: Marcelo Tosatti mtosa...@redhat.com

Index: kvm/arch/x86/kvm/mmu.c
===
--- kvm.orig/arch/x86/kvm/mmu.c
+++ kvm/arch/x86/kvm/mmu.c
@@ -3017,6 +3017,55 @@ static gva_t canonicalize(gva_t gva)
return gva;
 }
 
+
+typedef void (*inspect_spte_fn) (struct kvm *kvm, struct kvm_mmu_page *sp,
+u64 *sptep);
+
+static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
+   inspect_spte_fn fn)
+{
+   int i;
+
+   for (i = 0; i  PT64_ENT_PER_PAGE; ++i) {
+   u64 ent = sp-spt[i];
+
+   if (is_shadow_present_pte(ent)) {
+   if (sp-role.level  1) {
+   struct kvm_mmu_page *child;
+   child = page_header(ent  PT64_BASE_ADDR_MASK);
+   __mmu_spte_walk(kvm, child, fn);
+   }
+   if (sp-role.level == 1)
+   fn(kvm, sp, sp-spt[i]);
+   }
+   }
+}
+
+static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
+{
+   int i;
+   struct kvm_mmu_page *sp;
+
+   if (!VALID_PAGE(vcpu-arch.mmu.root_hpa))
+   return;
+   if (vcpu-arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+   hpa_t root = vcpu-arch.mmu.root_hpa;
+   sp = page_header(root);
+   __mmu_spte_walk(vcpu-kvm, sp, fn);
+   return;
+   }
+   for (i = 0; i  4; ++i) {
+   hpa_t root = vcpu-arch.mmu.pae_root[i];
+
+   if (root  VALID_PAGE(root)) {
+   root = PT64_BASE_ADDR_MASK;
+   sp = page_header(root);
+   __mmu_spte_walk(vcpu-kvm, sp, fn);
+   }
+   }
+   return;
+}
+
 static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
gva_t va, int level)
 {
@@ -3109,9 +3158,43 @@ static int count_rmaps(struct kvm_vcpu *
return nmaps;
 }
 
-static int count_writable_mappings(struct kvm_vcpu *vcpu)
+void inspect_spte_has_rmap(struct kvm *kvm, struct kvm_mmu_page *sp, u64 
*sptep)
+{
+   unsigned long *rmapp;
+   struct kvm_mmu_page *rev_sp;
+   gfn_t gfn;
+
+   if (*sptep  PT_WRITABLE_MASK) {
+   rev_sp = page_header(__pa(sptep));
+   gfn = rev_sp-gfns[sptep - rev_sp-spt];
+
+   if (!gfn_to_memslot(kvm, gfn)) {
+   printk(KERN_ERR %s: no memslot for gfn %ld\n,
+audit_msg, gfn);
+   printk(KERN_ERR %s: index %ld of sp (gfn=%lx)\n,
+   audit_msg, sptep - rev_sp-spt,
+   rev_sp-gfn);
+   dump_stack();
+   return;
+   }
+
+   rmapp = gfn_to_rmap(kvm, rev_sp-gfns[sptep - rev_sp-spt], 0);
+   if (!*rmapp) {
+   printk(KERN_ERR %s: no rmap for writable spte %llx\n,
+audit_msg, *sptep);
+   dump_stack();
+   }
+   }
+
+}
+
+void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu)
+{
+   mmu_spte_walk(vcpu, inspect_spte_has_rmap);
+}
+
+static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
 {
-   int nmaps = 0;
struct kvm_mmu_page *sp;
int i;
 
@@ -3128,20 +3211,16 @@ static int count_writable_mappings(struc
continue;
if (!(ent  PT_WRITABLE_MASK))
continue;
-   ++nmaps;
+   inspect_spte_has_rmap(vcpu-kvm, sp, pt[i]);
}
}
-   return nmaps;
+   return;
 }
 
 static void audit_rmap(struct kvm_vcpu *vcpu)
 {
-   int n_rmap = count_rmaps(vcpu);
-   int n_actual = count_writable_mappings(vcpu);
-
-   if (n_rmap != n_actual)
-   printk(KERN_ERR %s: (%s) rmap %d actual %d\n,
-  __func__, audit_msg, n_rmap, n_actual);
+   check_writable_mappings_rmap(vcpu);
+   count_rmaps(vcpu);
 }
 
 static void audit_write_protection(struct kvm_vcpu *vcpu)
@@ -3175,6 +3254,7 @@ static void kvm_mmu_audit(struct kvm_vcp
audit_rmap(vcpu);
audit_write_protection(vcpu);

Re: Windows Server 2008 VM performance

2009-06-02 Thread Javier Guerra
On Tue, Jun 2, 2009 at 4:38 PM, Avi Kivity a...@redhat.com wrote:
 Andrew Theurer wrote:
 P.S. Here is the qemu cmd line for the windows VMs:
 /usr/local/bin/qemu-system-x86_64 -name newcastle-xdbt01 -hda
 /dev/disk/by-id/scsi-3600a0b8f1eb1074f4a02b08a

 Use: -drive /dev/very-long-name,cache=none instead of -hda to disable the
 host cache.  Won't make much of a differnence for 5 MB/s though.

would emulating SCSI instead of IDE help while we hope and wait for
virtio_block drivers? (and better virtio_net).

-- 
Javier
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Windows Server 2008 VM performance

2009-06-02 Thread Avi Kivity

Javier Guerra wrote:

On Tue, Jun 2, 2009 at 4:38 PM, Avi Kivity a...@redhat.com wrote:
  

Andrew Theurer wrote:


P.S. Here is the qemu cmd line for the windows VMs:
/usr/local/bin/qemu-system-x86_64 -name newcastle-xdbt01 -hda
/dev/disk/by-id/scsi-3600a0b8f1eb1074f4a02b08a
  

Use: -drive /dev/very-long-name,cache=none instead of -hda to disable the
host cache.  Won't make much of a differnence for 5 MB/s though.



would emulating SCSI instead of IDE help while we hope and wait for
virtio_block drivers? (and better virtio_net).
  


I don't think the scsi emulation currently supports command parallelism, 
but I could be wrong.


Worth trying.

--
Do not meddle in the internals of kernels, for they are subtle and quick to 
panic.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/4] always halt non-bsp cpu.

2009-06-02 Thread Jan Kiszka
Glauber Costa wrote:
 On Tue, Jun 02, 2009 at 10:35:47PM +0200, Jan Kiszka wrote:
 Glauber Costa wrote:
 This is not kvm specific, and should do fine in plain qemu
 This is fine with plain qemu already. The problem, IIUC, is that
 in-kernel kvm irqchip does not have a chance to remove the halted state
 again. Did you test the effect of this patch on that scenario? What
 makes it safe to be removed now?
 IIRC, the in kernel irqchip sets halted = 0 in the very beginning of
 the vcpu initialization.
 
 It is tested here with in-kernel irqchip and works, so probably not
 a problem, unless you can spot something.

At least your patch applied alone breaks -smp 1 here.

But the whole management of env-halted for the in-kernel irqchip in
qemu-kvm is a bit hacky IMHO. Maybe it's time to rethink this. Would be
nice to always see a consistent halted in user space, specifically for
debugging purposes.

Jan



signature.asc
Description: OpenPGP digital signature


Re: [PATCH 1/4] always halt non-bsp cpu.

2009-06-02 Thread Glauber Costa
On Wed, Jun 03, 2009 at 12:01:00AM +0200, Jan Kiszka wrote:
 Glauber Costa wrote:
  On Tue, Jun 02, 2009 at 10:35:47PM +0200, Jan Kiszka wrote:
  Glauber Costa wrote:
  This is not kvm specific, and should do fine in plain qemu
  This is fine with plain qemu already. The problem, IIUC, is that
  in-kernel kvm irqchip does not have a chance to remove the halted state
  again. Did you test the effect of this patch on that scenario? What
  makes it safe to be removed now?
  IIRC, the in kernel irqchip sets halted = 0 in the very beginning of
  the vcpu initialization.
  
  It is tested here with in-kernel irqchip and works, so probably not
  a problem, unless you can spot something.
 
 At least your patch applied alone breaks -smp 1 here.
 
 But the whole management of env-halted for the in-kernel irqchip in
 qemu-kvm is a bit hacky IMHO. Maybe it's time to rethink this. Would be
 nice to always see a consistent halted in user space, specifically for
 debugging purposes.
out of curiosity: did you apply the whole series?

please report with it. I suspect there is a change later on that might
make it work. Of course, this is no excuse, as I'm a huge fan of bisectability.
If this is the case, I'll rework the series in a way that it always work.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


  1   2   >