[RFC v2 4/6] hw/kvm/arm_gic: Implement support for KVM in-kernel ARM GIC
Implement support for using the KVM in-kernel GIC for ARM. Signed-off-by: Peter Maydell peter.mayd...@linaro.org --- hw/a15mpcore.c |8 ++- hw/arm/Makefile.objs |1 + hw/kvm/arm_gic.c | 162 ++ 3 files changed, 170 insertions(+), 1 deletion(-) create mode 100644 hw/kvm/arm_gic.c diff --git a/hw/a15mpcore.c b/hw/a15mpcore.c index fc0a02a..a37fc61 100644 --- a/hw/a15mpcore.c +++ b/hw/a15mpcore.c @@ -19,6 +19,7 @@ */ #include sysbus.h +#include kvm.h /* A15MP private memory region. */ @@ -41,7 +42,12 @@ static int a15mp_priv_init(SysBusDevice *dev) A15MPPrivState *s = FROM_SYSBUS(A15MPPrivState, dev); SysBusDevice *busdev; -s-gic = qdev_create(NULL, arm_gic); +if (kvm_irqchip_in_kernel()) { +s-gic = qdev_create(NULL, kvm-arm_gic); +} else { +s-gic = qdev_create(NULL, arm_gic); +} + qdev_prop_set_uint32(s-gic, num-cpu, s-num_cpu); qdev_prop_set_uint32(s-gic, num-irq, s-num_irq); qdev_prop_set_uint32(s-gic, revision, 2); diff --git a/hw/arm/Makefile.objs b/hw/arm/Makefile.objs index 6d049e7..38b10a8 100644 --- a/hw/arm/Makefile.objs +++ b/hw/arm/Makefile.objs @@ -31,5 +31,6 @@ obj-y += collie.o obj-y += imx_serial.o imx_ccm.o imx_timer.o imx_avic.o obj-y += kzm.o obj-$(CONFIG_FDT) += ../device_tree.o +obj-$(CONFIG_KVM) += kvm/arm_gic.o obj-y := $(addprefix ../,$(obj-y)) diff --git a/hw/kvm/arm_gic.c b/hw/kvm/arm_gic.c new file mode 100644 index 000..3ec538d --- /dev/null +++ b/hw/kvm/arm_gic.c @@ -0,0 +1,162 @@ +/* + * ARM Generic Interrupt Controller using KVM in-kernel support + * + * Copyright (c) 2012 Linaro Limited + * Written by Peter Maydell + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, see http://www.gnu.org/licenses/. + */ + +#include hw/sysbus.h +#include kvm.h +#include hw/arm_gic_internal.h + +#define TYPE_KVM_ARM_GIC kvm-arm_gic +#define KVM_ARM_GIC(obj) \ + OBJECT_CHECK(gic_state, (obj), TYPE_KVM_ARM_GIC) +#define KVM_ARM_GIC_CLASS(klass) \ + OBJECT_CLASS_CHECK(KVMARMGICClass, (klass), TYPE_KVM_ARM_GIC) +#define KVM_ARM_GIC_GET_CLASS(obj) \ + OBJECT_GET_CLASS(KVMARMGICClass, (obj), TYPE_KVM_ARM_GIC) + +typedef struct KVMARMGICClass { +ARMGICCommonClass parent_class; +int (*parent_init)(SysBusDevice *dev); +void (*parent_reset)(DeviceState *dev); +} KVMARMGICClass; + +static void kvm_arm_gic_set_irq(void *opaque, int irq, int level) +{ +/* Meaning of the 'irq' parameter: + * [0..N-1] : external interrupts + * [N..N+31] : PPI (internal) interrupts for CPU 0 + * [N+32..N+63] : PPI (internal interrupts for CPU 1 + * ... + * Convert this to the kernel's desired encoding, which + * has separate fields in the irq number for type, + * CPU number and interrupt number. + */ +gic_state *s = (gic_state *)opaque; +int kvm_irq, irqtype, cpu; + +if (irq (s-num_irq - GIC_INTERNAL)) { +/* External interrupt. The kernel numbers these like the GIC + * hardware, with external interrupt IDs starting after the + * internal ones. + */ +irqtype = KVM_ARM_IRQ_TYPE_SPI; +cpu = 0; +irq += GIC_INTERNAL; +} else { +/* Internal interrupt: decode into (cpu, interrupt id) */ +irqtype = KVM_ARM_IRQ_TYPE_PPI; +irq -= (s-num_irq - GIC_INTERNAL); +cpu = irq / GIC_INTERNAL; +irq %= GIC_INTERNAL; +} +kvm_irq = (irqtype KVM_ARM_IRQ_TYPE_SHIFT) +| (cpu KVM_ARM_IRQ_VCPU_SHIFT) | irq; + +kvm_set_irq(kvm_state, kvm_irq, !!level); +} + +static void kvm_arm_gic_put(gic_state *s) +{ +/* TODO: there isn't currently a kernel interface to set the GIC state */ +} + +static void kvm_arm_gic_get(gic_state *s) +{ +/* TODO: there isn't currently a kernel interface to get the GIC state */ +} + +static void kvm_arm_gic_reset(DeviceState *dev) +{ +gic_state *s = ARM_GIC_COMMON(dev); +KVMARMGICClass *kgc = KVM_ARM_GIC_GET_CLASS(s); +kgc-parent_reset(dev); +kvm_arm_gic_put(s); +} + +static int kvm_arm_gic_init(SysBusDevice *dev) +{ +/* Device instance init function for the GIC sysbus device */ +int i; +gic_state *s = FROM_SYSBUS(gic_state, dev); +KVMARMGICClass *kgc = KVM_ARM_GIC_GET_CLASS(s); + +kgc-parent_init(dev); + +i = s-num_irq - GIC_INTERNAL; +/* For the GIC, also expose incoming GPIO
[RFC v2 3/6] hw/arm_gic: Add presave/postload hooks
Add presave/postload hooks to the ARM GIC common base class. These will be used by the KVM in-kernel GIC subclass to sync state between kernel and userspace when migrating. Signed-off-by: Peter Maydell peter.mayd...@linaro.org --- hw/arm_gic_common.c | 10 ++ hw/arm_gic_internal.h |2 ++ 2 files changed, 12 insertions(+) diff --git a/hw/arm_gic_common.c b/hw/arm_gic_common.c index 360e782..d972755 100644 --- a/hw/arm_gic_common.c +++ b/hw/arm_gic_common.c @@ -23,9 +23,14 @@ static void gic_save(QEMUFile *f, void *opaque) { gic_state *s = (gic_state *)opaque; +ARMGICCommonClass *c = ARM_GIC_COMMON_GET_CLASS(s); int i; int j; +if (c-pre_save) { +c-pre_save(s); +} + qemu_put_be32(f, s-enabled); for (i = 0; i s-num_cpu; i++) { qemu_put_be32(f, s-cpu_enabled[i]); @@ -57,6 +62,7 @@ static void gic_save(QEMUFile *f, void *opaque) static int gic_load(QEMUFile *f, void *opaque, int version_id) { gic_state *s = (gic_state *)opaque; +ARMGICCommonClass *c = ARM_GIC_COMMON_GET_CLASS(s); int i; int j; @@ -91,6 +97,10 @@ static int gic_load(QEMUFile *f, void *opaque, int version_id) s-irq_state[i].trigger = qemu_get_byte(f); } +if (c-post_load) { +c-post_load(s); +} + return 0; } diff --git a/hw/arm_gic_internal.h b/hw/arm_gic_internal.h index db4fad5..183dca6 100644 --- a/hw/arm_gic_internal.h +++ b/hw/arm_gic_internal.h @@ -118,6 +118,8 @@ void gic_init_irqs_and_distributor(gic_state *s, int num_irq); typedef struct ARMGICCommonClass { SysBusDeviceClass parent_class; +void (*pre_save)(gic_state *s); +void (*post_load)(gic_state *s); } ARMGICCommonClass; #define TYPE_ARM_GIC arm_gic -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC v2 0/6] QEMU: Support KVM on ARM
This is a v2 RFC of the QEMU patches to support KVM for ARM on Cortex-A15 hardware. It's intended for use with the kernel tree at git://github.com/virtualopensystems/linux-kvm-arm.git kvm-arm-v12 (ie the 'v2' kernel patches which Christoffer sent out at the start of the month). Patch review appreciated -- I think this is more or less ready to go in once the kernel patches are accepted (assuming no kernel ABI changes, of course). (There are still a few things it would be nice to do better, like driving cp15 access from the cp_regs hashtable rather than hardcoding a small list. If anybody feels any of the todo-type stuff is a blocker to these patches being committed that would be good to know too :-)) Git tree if preferred: git://git.linaro.org/people/pmaydell/qemu-arm.git kvm-arm-v12 Peter Maydell (6): linux-headers: Add ARM KVM headers (not for upstream) ARM: KVM: Add support for KVM on ARM architecture hw/arm_gic: Add presave/postload hooks hw/kvm/arm_gic: Implement support for KVM in-kernel ARM GIC ARM KVM: save and load VFP registers from kernel configure: Enable KVM on ARM configure|2 +- hw/a15mpcore.c |8 +- hw/arm/Makefile.objs |1 + hw/arm_gic_common.c | 10 + hw/arm_gic_internal.h|2 + hw/arm_pic.c | 28 +++ hw/kvm/arm_gic.c | 162 ++ linux-headers/asm-arm/kvm.h | 131 +++ linux-headers/asm-arm/kvm_para.h |1 + linux-headers/asm-generic/kvm_para.h |5 + linux-headers/asm-x86/kvm.h |1 + linux-headers/linux/kvm.h| 16 +- target-arm/Makefile.objs |1 + target-arm/cpu.h |1 + target-arm/helper.c |2 +- target-arm/kvm.c | 400 ++ 16 files changed, 765 insertions(+), 6 deletions(-) create mode 100644 hw/kvm/arm_gic.c create mode 100644 linux-headers/asm-arm/kvm.h create mode 100644 linux-headers/asm-arm/kvm_para.h create mode 100644 linux-headers/asm-generic/kvm_para.h create mode 100644 target-arm/kvm.c -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC v2 6/6] configure: Enable KVM on ARM
Enable KVM on ARM hosts, now that all the necessary components for it exist. Signed-off-by: Peter Maydell peter.mayd...@linaro.org --- configure |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure b/configure index c4a7837..f87bcd6 100755 --- a/configure +++ b/configure @@ -3886,7 +3886,7 @@ case $target_arch2 in echo CONFIG_NO_XEN=y $config_target_mak esac case $target_arch2 in - i386|x86_64|ppcemb|ppc|ppc64|s390x) + arm|i386|x86_64|ppcemb|ppc|ppc64|s390x) # Make sure the target and host cpus are compatible if test $kvm = yes -a $target_softmmu = yes -a \ \( $target_arch2 = $cpu -o \ -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC v2 1/6] linux-headers: Add ARM KVM headers (not for upstream)
This commit adds the ARM KVM headers. This is not to go to QEMU upstream -- the correct path there is that the KVM code will be committed to a mainline upstream kernel, and then upstream QEMU can do a bulk header update from the upstream kernel, which will allow us to drop this temporary commit. This is the result of running update-headers on Christoffer's kvm-arm-v12 branch (commit d151d7a). It includes the new interrupt delivery ABI, the switch to ONE_REG for core and coprocessor registers, and the VFP register definitions. It currently includes some minor x86 header changes which hopefully will have made it into QEMU upstream by the time we submit this for merging. --- linux-headers/asm-arm/kvm.h | 131 ++ linux-headers/asm-arm/kvm_para.h |1 + linux-headers/asm-generic/kvm_para.h |5 ++ linux-headers/asm-x86/kvm.h |1 + linux-headers/linux/kvm.h| 16 - 5 files changed, 151 insertions(+), 3 deletions(-) create mode 100644 linux-headers/asm-arm/kvm.h create mode 100644 linux-headers/asm-arm/kvm_para.h create mode 100644 linux-headers/asm-generic/kvm_para.h diff --git a/linux-headers/asm-arm/kvm.h b/linux-headers/asm-arm/kvm.h new file mode 100644 index 000..b6eaf0c --- /dev/null +++ b/linux-headers/asm-arm/kvm.h @@ -0,0 +1,131 @@ +/* + * Copyright (C) 2012 - Virtual Open Systems and Columbia University + * Author: Christoffer Dall c.d...@virtualopensystems.com + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#ifndef __ARM_KVM_H__ +#define __ARM_KVM_H__ + +#include asm/types.h + +#define __KVM_HAVE_GUEST_DEBUG +#define __KVM_HAVE_IRQ_LINE + +#define KVM_REG_SIZE(id) \ + (1U (((id) KVM_REG_SIZE_MASK) KVM_REG_SIZE_SHIFT)) + +struct kvm_regs { + __u32 usr_regs[15]; /* R0_usr - R14_usr */ + __u32 svc_regs[3]; /* SP_svc, LR_svc, SPSR_svc */ + __u32 abt_regs[3]; /* SP_abt, LR_abt, SPSR_abt */ + __u32 und_regs[3]; /* SP_und, LR_und, SPSR_und */ + __u32 irq_regs[3]; /* SP_irq, LR_irq, SPSR_irq */ + __u32 fiq_regs[8]; /* R8_fiq - R14_fiq, SPSR_fiq */ + __u32 pc; /* The program counter (r15) */ + __u32 cpsr; /* The guest CPSR */ +}; + +/* Supported Processor Types */ +#define KVM_ARM_TARGET_CORTEX_A15 0 +#define KVM_ARM_NUM_TARGETS1 + +struct kvm_vcpu_init { + __u32 target; + __u32 features[7]; +}; + +struct kvm_sregs { +}; + +struct kvm_fpu { +}; + +struct kvm_guest_debug_arch { +}; + +struct kvm_debug_exit_arch { +}; + +struct kvm_sync_regs { +}; + +struct kvm_arch_memory_slot { +}; + +/* For KVM_VCPU_GET_REG_LIST. */ +struct kvm_reg_list { + __u64 n; /* number of regs */ + __u64 reg[0]; +}; + +/* If you need to interpret the index values, here is the key: */ +#define KVM_REG_ARM_COPROC_MASK0x0FFF +#define KVM_REG_ARM_COPROC_SHIFT 16 +#define KVM_REG_ARM_32_OPC2_MASK 0x0007 +#define KVM_REG_ARM_32_OPC2_SHIFT 0 +#define KVM_REG_ARM_OPC1_MASK 0x0078 +#define KVM_REG_ARM_OPC1_SHIFT 3 +#define KVM_REG_ARM_CRM_MASK 0x0780 +#define KVM_REG_ARM_CRM_SHIFT 7 +#define KVM_REG_ARM_32_CRN_MASK0x7800 +#define KVM_REG_ARM_32_CRN_SHIFT 11 + +/* Normal registers are mapped as coprocessor 16. */ +#define KVM_REG_ARM_CORE (0x0010 KVM_REG_ARM_COPROC_SHIFT) +#define KVM_REG_ARM_CORE_REG(name) (offsetof(struct kvm_regs, name) / 4) + +/* Some registers need more space to represent values. */ +#define KVM_REG_ARM_DEMUX (0x0011 KVM_REG_ARM_COPROC_SHIFT) +#define KVM_REG_ARM_DEMUX_ID_MASK 0xFF00 +#define KVM_REG_ARM_DEMUX_ID_SHIFT 8 +#define KVM_REG_ARM_DEMUX_ID_CCSIDR(0x00 KVM_REG_ARM_DEMUX_ID_SHIFT) +#define KVM_REG_ARM_DEMUX_VAL_MASK 0x00FF +#define KVM_REG_ARM_DEMUX_VAL_SHIFT0 + +/* VFP registers: we could overload CP10 like ARM does, but that's ugly. */ +#define KVM_REG_ARM_VFP(0x0012 KVM_REG_ARM_COPROC_SHIFT) +#define KVM_REG_ARM_VFP_MASK 0x +#define KVM_REG_ARM_VFP_BASE_REG 0x0 +#define KVM_REG_ARM_VFP_FPSID 0x1000 +#define
Re: [PATCH v6 03/16] target-i386: Add cpu object access routines for Hypervisor level.
Am 10.10.2012 17:22, schrieb Don Slutz: On 10/09/12 15:13, Don Slutz wrote: On 10/09/12 12:25, Marcelo Tosatti wrote: On Mon, Sep 24, 2012 at 10:32:05AM -0400, Don Slutz wrote: +static void x86_cpuid_set_hv_level(Object *obj, Visitor *v, void *opaque, +const char *name, Error **errp) +{ +X86CPU *cpu = X86_CPU(obj); +uint32_t value; + +visit_type_uint32(v, value, name, errp); +if (error_is_set(errp)) { +return; +} + +if (value != 0 value 0x4000) { +value += 0x4000; +} Whats the purpose of this? Adds ambiguity. [...] This is direct copy with adjustment from x86_cpuid_set_xlevel(): if (value 0x8000) { value += 0x8000; } (Pending patch: http://comments.gmane.org/gmane.comp.emulators.qemu/172703 adds this) (Any pending patch can be changed ;)) The adjustment is that 0 is a legal value. See http://lkml.indiana.edu/hypermail/linux/kernel/1205.0/00100.html This does mean that just like xlevel=1 and xlevel=0x8001 are the same; hypervisor-level=1 and hypervisor-level=0x401 are the same. If this is not wanted, I have no issue with removing it. I have no strong opinion either way, but if there's only one call site, I'd prefer to apply these fixups to user input before setting the property and to have the property setter error out on invalid values. I consider that cleaner than silently fixing up values inside the setter. Regards, Andreas -- SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer; HRB 16746 AG Nürnberg -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[GIT PULL] VFIO fixes for 3.7-rc1
Hi Linus, The following changes since commit 2474542f64432398f503373f53bdf620491bcfa8: Merge tag 'for-3.7-rc1' of git://gitorious.org/linux-pwm/linux-pwm (2012-10-10 20:15:24 +0900) are available in the git repository at: git://github.com/awilliam/linux-vfio.git tags/vfio-for-v3.7-rc1 for you to fetch changes up to 899649b7d4ead76c19e39251ca886eebe3f811a8: vfio: Fix PCI INTx disable consistency (2012-10-10 09:10:32 -0600) vfio fixes for v3.7-rc1 This includes a fix for PCI BAR mmaps after recent mm changes, fixing an interrupt race, and fixing a consistency bug in interrupt state when switching interrupt modes. Alex Williamson (3): vfio: Fix PCI mmap after b3b9c293 vfio: Move PCI INTx eventfd setting earlier vfio: Fix PCI INTx disable consistency drivers/vfio/pci/vfio_pci.c | 7 +++ drivers/vfio/pci/vfio_pci_intrs.c | 18 +++--- 2 files changed, 18 insertions(+), 7 deletions(-) -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] s390/kvm: Add documentation for KVM_S390_INTERRUPT.
On Tue, Oct 02, 2012 at 04:25:37PM +0200, Christian Borntraeger wrote: From: Cornelia Huck cornelia.h...@de.ibm.com Signed-off-by: Cornelia Huck cornelia.h...@de.ibm.com Signed-off-by: Martin Schwidefsky schwidef...@de.ibm.com Signed-off-by: Christian Borntraeger borntrae...@de.ibm.com --- Documentation/virtual/kvm/api.txt | 33 + 1 file changed, 33 insertions(+) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 36befa7..c58 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1984,6 +1984,39 @@ return the hash table order in the parameter. (If the guest is using the virtualized real-mode area (VRMA) facility, the kernel will re-create the VMRA HPTEs on the next KVM_RUN of any vcpu.) +4.77 KVM_S390_INTERRUPT + +Capability: basic +Archtectures: s390 typo. +Type: vm ioctl, vcpu ioctl +Parameters: struct kvm_s390_interrupt (in) +Returns: 0 on success, -1 on error + +Allows to inject an interrupt to the guest. Interrupts can be floating +(vm ioctl) or per cpu (vcpu ioctl), depending on the interrupt type. + +Interrupt parameters are passed via kvm_s390_interrupt: + +struct kvm_s390_interrupt { + __u32 type; + __u32 parm; + __u64 parm64; +}; No need for a reserved area for extensibility? + +type can be one of the following: + +KVM_S390_SIGP_STOP (vcpu) - sigp restart +KVM_S390_PROGRAM_INT (vcpu) - program check; code in parm +KVM_S390_SIGP_SET_PREFIX (vcpu) - sigp set prefix; prefix address in parm +KVM_S390_RESTART (vcpu) - restart +KVM_S390_INT_VIRTIO (vm) - virtio external interrupt; external interrupt +parameters in parm and parm64 +KVM_S390_INT_SERVICE (vm) - sclp external interrupt; sclp parameter in parm +KVM_S390_INT_EMERGENCY (vcpu) - sigp emergency; source cpu in parm +KVM_S390_INT_EXTERNAL_CALL (vcpu) - sigp external call; source cpu in parm + +Note that the vcpu ioctl is asynchronous to vpcu execution. typo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v4 5/5] KVM: MMU: introduce FNAME(prefetch_gpte)
On Sun, Oct 07, 2012 at 08:05:11PM +0800, Xiao Guangrong wrote: The only difference between FNAME(update_pte) and FNAME(pte_prefetch) is that the former is allowed to prefetch gfn from dirty logged slot, so introduce a common function to prefetch spte Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/kvm/paging_tmpl.h | 55 +++ 1 files changed, 24 insertions(+), 31 deletions(-) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 36a80ed..f887e4c 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -305,31 +305,43 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker, addr, access); } -static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - u64 *spte, const void *pte) +static bool +FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + u64 *spte, pt_element_t gpte, bool no_dirty_log) { - pt_element_t gpte; unsigned pte_access; + gfn_t gfn; pfn_t pfn; - gpte = *(const pt_element_t *)pte; if (prefetch_invalid_gpte(vcpu, sp, spte, gpte)) - return; + return false; pgprintk(%s: gpte %llx spte %p\n, __func__, (u64)gpte, spte); + + gfn = gpte_to_gfn(gpte); pte_access = sp-role.access gpte_access(vcpu, gpte); protect_clean_gpte(pte_access, gpte); - pfn = gfn_to_pfn_atomic(vcpu-kvm, gpte_to_gfn(gpte)); + pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, + no_dirty_log (pte_access ACC_WRITE_MASK)); Is this a bugfix? -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] s390/kvm: Add documentation for KVM_S390_INTERRUPT.
On 10.10.2012, at 18:27, Marcelo Tosatti wrote: On Tue, Oct 02, 2012 at 04:25:37PM +0200, Christian Borntraeger wrote: From: Cornelia Huck cornelia.h...@de.ibm.com Signed-off-by: Cornelia Huck cornelia.h...@de.ibm.com Signed-off-by: Martin Schwidefsky schwidef...@de.ibm.com Signed-off-by: Christian Borntraeger borntrae...@de.ibm.com --- Documentation/virtual/kvm/api.txt | 33 + 1 file changed, 33 insertions(+) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 36befa7..c58 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1984,6 +1984,39 @@ return the hash table order in the parameter. (If the guest is using the virtualized real-mode area (VRMA) facility, the kernel will re-create the VMRA HPTEs on the next KVM_RUN of any vcpu.) +4.77 KVM_S390_INTERRUPT + +Capability: basic +Archtectures: s390 typo. +Type: vm ioctl, vcpu ioctl +Parameters: struct kvm_s390_interrupt (in) +Returns: 0 on success, -1 on error + +Allows to inject an interrupt to the guest. Interrupts can be floating +(vm ioctl) or per cpu (vcpu ioctl), depending on the interrupt type. + +Interrupt parameters are passed via kvm_s390_interrupt: + +struct kvm_s390_interrupt { +__u32 type; +__u32 parm; +__u64 parm64; +}; No need for a reserved area for extensibility? This is documentation for an existing ioctl, no? And yes, a reserved area would've been nice ;). But it's too late for that one now. Alex + +type can be one of the following: + +KVM_S390_SIGP_STOP (vcpu) - sigp restart +KVM_S390_PROGRAM_INT (vcpu) - program check; code in parm +KVM_S390_SIGP_SET_PREFIX (vcpu) - sigp set prefix; prefix address in parm +KVM_S390_RESTART (vcpu) - restart +KVM_S390_INT_VIRTIO (vm) - virtio external interrupt; external interrupt + parameters in parm and parm64 +KVM_S390_INT_SERVICE (vm) - sclp external interrupt; sclp parameter in parm +KVM_S390_INT_EMERGENCY (vcpu) - sigp emergency; source cpu in parm +KVM_S390_INT_EXTERNAL_CALL (vcpu) - sigp external call; source cpu in parm + +Note that the vcpu ioctl is asynchronous to vpcu execution. typo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] s390/kvm: Add documentation for KVM_S390_INTERRUPT.
On Wed, 10 Oct 2012 13:27:09 -0300 Marcelo Tosatti mtosa...@redhat.com wrote: +Archtectures: s390 typo. +Note that the vcpu ioctl is asynchronous to vpcu execution. typo Would you like a respin with the typos fixed? -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] s390/kvm: Add documentation for KVM_S390_INTERRUPT.
On Wed, 10 Oct 2012 18:32:14 +0200 Alexander Graf ag...@suse.de wrote: On 10.10.2012, at 18:27, Marcelo Tosatti wrote: On Tue, Oct 02, 2012 at 04:25:37PM +0200, Christian Borntraeger wrote: From: Cornelia Huck cornelia.h...@de.ibm.com Signed-off-by: Cornelia Huck cornelia.h...@de.ibm.com Signed-off-by: Martin Schwidefsky schwidef...@de.ibm.com Signed-off-by: Christian Borntraeger borntrae...@de.ibm.com --- Documentation/virtual/kvm/api.txt | 33 + 1 file changed, 33 insertions(+) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 36befa7..c58 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1984,6 +1984,39 @@ return the hash table order in the parameter. (If the guest is using the virtualized real-mode area (VRMA) facility, the kernel will re-create the VMRA HPTEs on the next KVM_RUN of any vcpu.) +4.77 KVM_S390_INTERRUPT + +Capability: basic +Archtectures: s390 typo. +Type: vm ioctl, vcpu ioctl +Parameters: struct kvm_s390_interrupt (in) +Returns: 0 on success, -1 on error + +Allows to inject an interrupt to the guest. Interrupts can be floating +(vm ioctl) or per cpu (vcpu ioctl), depending on the interrupt type. + +Interrupt parameters are passed via kvm_s390_interrupt: + +struct kvm_s390_interrupt { + __u32 type; + __u32 parm; + __u64 parm64; +}; No need for a reserved area for extensibility? This is documentation for an existing ioctl, no? And yes, a reserved area would've been nice ;). But it's too late for that one now. Indeed. We'll have to keep this for backward compatibility and introduce a new mechanism for larger interrupt structures. Alex + +type can be one of the following: + +KVM_S390_SIGP_STOP (vcpu) - sigp restart +KVM_S390_PROGRAM_INT (vcpu) - program check; code in parm +KVM_S390_SIGP_SET_PREFIX (vcpu) - sigp set prefix; prefix address in parm +KVM_S390_RESTART (vcpu) - restart +KVM_S390_INT_VIRTIO (vm) - virtio external interrupt; external interrupt + parameters in parm and parm64 +KVM_S390_INT_SERVICE (vm) - sclp external interrupt; sclp parameter in parm +KVM_S390_INT_EMERGENCY (vcpu) - sigp emergency; source cpu in parm +KVM_S390_INT_EXTERNAL_CALL (vcpu) - sigp external call; source cpu in parm + +Note that the vcpu ioctl is asynchronous to vpcu execution. typo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] [RFC v2 4/6] hw/kvm/arm_gic: Implement support for KVM in-kernel ARM GIC
Am 10.10.2012 17:07, schrieb Peter Maydell: Implement support for using the KVM in-kernel GIC for ARM. Signed-off-by: Peter Maydell peter.mayd...@linaro.org --- hw/a15mpcore.c |8 ++- hw/arm/Makefile.objs |1 + hw/kvm/arm_gic.c | 162 ++ 3 files changed, 170 insertions(+), 1 deletion(-) create mode 100644 hw/kvm/arm_gic.c diff --git a/hw/a15mpcore.c b/hw/a15mpcore.c index fc0a02a..a37fc61 100644 --- a/hw/a15mpcore.c +++ b/hw/a15mpcore.c @@ -19,6 +19,7 @@ */ #include sysbus.h +#include kvm.h /* A15MP private memory region. */ @@ -41,7 +42,12 @@ static int a15mp_priv_init(SysBusDevice *dev) A15MPPrivState *s = FROM_SYSBUS(A15MPPrivState, dev); SysBusDevice *busdev; -s-gic = qdev_create(NULL, arm_gic); +if (kvm_irqchip_in_kernel()) { +s-gic = qdev_create(NULL, kvm-arm_gic); +} else { +s-gic = qdev_create(NULL, arm_gic); +} You could follow x86 more closely by just selecting apic_type. (Then if we drop/change qdev_create() at some point it's one change less.) + qdev_prop_set_uint32(s-gic, num-cpu, s-num_cpu); qdev_prop_set_uint32(s-gic, num-irq, s-num_irq); qdev_prop_set_uint32(s-gic, revision, 2); [...] diff --git a/hw/kvm/arm_gic.c b/hw/kvm/arm_gic.c new file mode 100644 index 000..3ec538d --- /dev/null +++ b/hw/kvm/arm_gic.c @@ -0,0 +1,162 @@ +/* + * ARM Generic Interrupt Controller using KVM in-kernel support + * + * Copyright (c) 2012 Linaro Limited + * Written by Peter Maydell + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, see http://www.gnu.org/licenses/. + */ + +#include hw/sysbus.h +#include kvm.h +#include hw/arm_gic_internal.h + +#define TYPE_KVM_ARM_GIC kvm-arm_gic kvm-arm-gic? +#define KVM_ARM_GIC(obj) \ + OBJECT_CHECK(gic_state, (obj), TYPE_KVM_ARM_GIC) +#define KVM_ARM_GIC_CLASS(klass) \ + OBJECT_CLASS_CHECK(KVMARMGICClass, (klass), TYPE_KVM_ARM_GIC) +#define KVM_ARM_GIC_GET_CLASS(obj) \ + OBJECT_GET_CLASS(KVMARMGICClass, (obj), TYPE_KVM_ARM_GIC) + +typedef struct KVMARMGICClass { +ARMGICCommonClass parent_class; +int (*parent_init)(SysBusDevice *dev); +void (*parent_reset)(DeviceState *dev); +} KVMARMGICClass; + +static void kvm_arm_gic_set_irq(void *opaque, int irq, int level) +{ +/* Meaning of the 'irq' parameter: + * [0..N-1] : external interrupts + * [N..N+31] : PPI (internal) interrupts for CPU 0 + * [N+32..N+63] : PPI (internal interrupts for CPU 1 + * ... + * Convert this to the kernel's desired encoding, which + * has separate fields in the irq number for type, + * CPU number and interrupt number. + */ +gic_state *s = (gic_state *)opaque; You don't happen to have time to clean up gic_state - GICState before spreading its use? ;) [...] +static TypeInfo arm_gic_info = { static const +.name = TYPE_KVM_ARM_GIC, +.parent = TYPE_ARM_GIC_COMMON, +.instance_size = sizeof(gic_state), +.class_init = kvm_arm_gic_class_init, +.class_size = sizeof(KVMARMGICClass), +}; + +static void arm_gic_register_types(void) kvm_arm_gic_register_types? +{ +type_register_static(arm_gic_info); +} + +type_init(arm_gic_register_types) Andreas -- SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer; HRB 16746 AG Nürnberg -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] [RFC v2 3/6] hw/arm_gic: Add presave/postload hooks
Am 10.10.2012 17:07, schrieb Peter Maydell: Add presave/postload hooks to the ARM GIC common base class. These will be used by the KVM in-kernel GIC subclass to sync state between kernel and userspace when migrating. Signed-off-by: Peter Maydell peter.mayd...@linaro.org Reviewed-by: Andreas Färber afaer...@suse.de /-F -- SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer; HRB 16746 AG Nürnberg -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] [RFC v2 4/6] hw/kvm/arm_gic: Implement support for KVM in-kernel ARM GIC
On 10 October 2012 18:23, Andreas Färber afaer...@suse.de wrote: Am 10.10.2012 17:07, schrieb Peter Maydell: Implement support for using the KVM in-kernel GIC for ARM. Signed-off-by: Peter Maydell peter.mayd...@linaro.org --- hw/a15mpcore.c |8 ++- hw/arm/Makefile.objs |1 + hw/kvm/arm_gic.c | 162 ++ 3 files changed, 170 insertions(+), 1 deletion(-) create mode 100644 hw/kvm/arm_gic.c diff --git a/hw/a15mpcore.c b/hw/a15mpcore.c index fc0a02a..a37fc61 100644 --- a/hw/a15mpcore.c +++ b/hw/a15mpcore.c @@ -19,6 +19,7 @@ */ #include sysbus.h +#include kvm.h /* A15MP private memory region. */ @@ -41,7 +42,12 @@ static int a15mp_priv_init(SysBusDevice *dev) A15MPPrivState *s = FROM_SYSBUS(A15MPPrivState, dev); SysBusDevice *busdev; -s-gic = qdev_create(NULL, arm_gic); +if (kvm_irqchip_in_kernel()) { +s-gic = qdev_create(NULL, kvm-arm_gic); +} else { +s-gic = qdev_create(NULL, arm_gic); +} You could follow x86 more closely by just selecting apic_type. (Then if we drop/change qdev_create() at some point it's one change less.) Good idea. + qdev_prop_set_uint32(s-gic, num-cpu, s-num_cpu); qdev_prop_set_uint32(s-gic, num-irq, s-num_irq); qdev_prop_set_uint32(s-gic, revision, 2); [...] diff --git a/hw/kvm/arm_gic.c b/hw/kvm/arm_gic.c new file mode 100644 index 000..3ec538d --- /dev/null +++ b/hw/kvm/arm_gic.c @@ -0,0 +1,162 @@ +/* + * ARM Generic Interrupt Controller using KVM in-kernel support + * + * Copyright (c) 2012 Linaro Limited + * Written by Peter Maydell + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, see http://www.gnu.org/licenses/. + */ + +#include hw/sysbus.h +#include kvm.h +#include hw/arm_gic_internal.h + +#define TYPE_KVM_ARM_GIC kvm-arm_gic kvm-arm-gic? I was trying to follow arm_gic but yes, I guess it's better to go with the recommended style (which is all-hyphens, right?) +#define KVM_ARM_GIC(obj) \ + OBJECT_CHECK(gic_state, (obj), TYPE_KVM_ARM_GIC) +#define KVM_ARM_GIC_CLASS(klass) \ + OBJECT_CLASS_CHECK(KVMARMGICClass, (klass), TYPE_KVM_ARM_GIC) +#define KVM_ARM_GIC_GET_CLASS(obj) \ + OBJECT_GET_CLASS(KVMARMGICClass, (obj), TYPE_KVM_ARM_GIC) + +typedef struct KVMARMGICClass { +ARMGICCommonClass parent_class; +int (*parent_init)(SysBusDevice *dev); +void (*parent_reset)(DeviceState *dev); +} KVMARMGICClass; + +static void kvm_arm_gic_set_irq(void *opaque, int irq, int level) +{ +/* Meaning of the 'irq' parameter: + * [0..N-1] : external interrupts + * [N..N+31] : PPI (internal) interrupts for CPU 0 + * [N+32..N+63] : PPI (internal interrupts for CPU 1 + * ... + * Convert this to the kernel's desired encoding, which + * has separate fields in the irq number for type, + * CPU number and interrupt number. + */ +gic_state *s = (gic_state *)opaque; You don't happen to have time to clean up gic_state - GICState before spreading its use? ;) I can submit a patch if you like... [...] +static TypeInfo arm_gic_info = { static const Yep. +.name = TYPE_KVM_ARM_GIC, +.parent = TYPE_ARM_GIC_COMMON, +.instance_size = sizeof(gic_state), +.class_init = kvm_arm_gic_class_init, +.class_size = sizeof(KVMARMGICClass), +}; + +static void arm_gic_register_types(void) kvm_arm_gic_register_types? Oops, yes, cut-n-pasto. -- PMM -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC 1/2] kvm: Handle undercommitted guest case in PLE handler
On 10/10/2012 07:54 PM, Andrew Theurer wrote: I ran 'perf sched map' on the dbench workload for medium and large VMs, and I thought I would share some of the results. I think it helps to visualize what's going on regarding the yielding. These files are png bitmaps, generated from processing output from 'perf sched map' (and perf data generated from 'perf sched record'). The Y axis is the host cpus, each row being 10 pixels high. For these tests, there are 80 host cpus, so the total height is 800 pixels. The X axis is time (in microseconds), with each pixel representing 1 microsecond. Each bitmap plots 30,000 microseconds. The bitmaps are quite wide obviously, and zooming in/out while viewing is recommended. Each row (each host cpu) is assigned a color based on what thread is running. vCPUs of the same VM are assigned a common color (like red, blue, magenta, etc), and each vCPU has a unique brightness for that color. There are a maximum of 12 assignable colors, so in any VMs 12 revert to vCPU color of gray. I would use more colors, but it becomes harder to distinguish one color from another. The white color represents missing data from perf, and black color represents any thread which is not a vCPU. For the following tests, VMs were pinned to host NUMA nodes and to specific cpus to help with consistency and operate within the constraints of the last test (gang scheduler). Here is a good example of PLE. These are 10-way VMs, 16 of them (as described above only 12 of the VMs have a color, rest are gray). https://docs.google.com/open?id=0B6tfUNlZ-14wdmFqUmE5QjJHMFU This looks very nice to visualize what is happening. Beginning of the graph looks little messy but later it is clear. If you zoom out and look at the whole bitmap, you may notice the 4ms intervals of the scheduler. They are pretty well aligned across all cpus. Normally, for cpu bound workloads, we would expect to see each thread to run for 4 ms, then something else getting to run, and so on. That is mostly true in this test. We have 2x over-commit and we generally see the switching of threads at 4ms. One thing to note is that not all vCPU threads for the same VM run at exactly the same time, and that is expected and the whole reason for lock-holder preemption. Now, if you zoom in on the bitmap, you should notice within the 4ms intervals there is some task switching going on. This is most likely because of the yield_to initiated by the PLE handler. In this case there is not that much yielding to do. It's quite clean, and the performance is quite good. Below is an example of PLE, but this time with 20-way VMs, 8 of them. CPU over-commit is still 2x. https://docs.google.com/open?id=0B6tfUNlZ-14wdmFqUmE5QjJHMFU I think this link still 10x16. Could you paste the link again? This one looks quite different. In short, it's a mess. The switching between tasks can be lower than 10 microseconds. It basically never recovers. There is constant yielding all the time. Below is again 8 x 20-way VMs, but this time I tried out Nikunj's gang scheduling patches. While I am not recommending gang scheduling, I think it's a good data point. The performance is 3.88x the PLE result. https://docs.google.com/open?id=0B6tfUNlZ-14wWXdscWcwNTVEY3M Note that the task switching intervals of 4ms are quite obvious again, and this time all vCPUs from same VM run at the same time. It represents the best possible outcome. Anyway, I thought the bitmaps might help better visualize what's going on. -Andrew -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC 1/2] kvm: Handle undercommitted guest case in PLE handler
On 10/10/2012 08:29 AM, Andrew Theurer wrote: On Wed, 2012-10-10 at 00:21 +0530, Raghavendra K T wrote: * Avi Kivity a...@redhat.com [2012-10-04 17:00:28]: On 10/04/2012 03:07 PM, Peter Zijlstra wrote: On Thu, 2012-10-04 at 14:41 +0200, Avi Kivity wrote: Again the numbers are ridiculously high for arch_local_irq_restore. Maybe there's a bad perf/kvm interaction when we're injecting an interrupt, I can't believe we're spending 84% of the time running the popf instruction. Smells like a software fallback that doesn't do NMI, hrtimer based sampling typically hits popf where we re-enable interrupts. Good nose, that's probably it. Raghavendra, can you ensure that the PMU is properly exposed? 'dmesg' in the guest will tell. If it isn't, -cpu host will expose it (and a good idea anyway to get best performance). Hi Avi, you are right. SandyBridge machine result was not proper. I cleaned up the services, enabled PMU, re-ran all the test again. Here is the summary: We do get good benefit by increasing ple window. Though we don't see good benefit for kernbench and sysbench, for ebizzy, we get huge improvement for 1x scenario. (almost 2/3rd of ple disabled case). Let me know if you think we can increase the default ple_window itself to 16k. I am experimenting with V2 version of undercommit improvement(this) patch series, But I think if you wish to go for increase of default ple_window, then we would have to measure the benefit of patches when ple_window = 16k. I can respin the whole series including this default ple_window change. I also have the perf kvm top result for both ebizzy and kernbench. I think they are in expected lines now. Improvements 16 core PLE machine with 16 vcpu guest base = 3.6.0-rc5 + ple handler optimization patches base_pleopt_16k = base + ple_window = 16k base_pleopt_32k = base + ple_window = 32k base_pleopt_nople = base + ple_gap = 0 kernbench, hackbench, sysbench (time in sec lower is better) ebizzy (rec/sec higher is better) % improvements w.r.t base (ple_window = 4k) ---+---+-+---+ |base_pleopt_16k| base_pleopt_32k | base_pleopt_nople | ---+---+-+---+ kernbench_1x | 0.42371 | 1.15164| 0.09320 | kernbench_2x | -1.40981 | -17.48282 | -570.77053 | ---+---+-+---+ sysbench_1x| -0.92367 | 0.24241 | -0.27027 | sysbench_2x| -2.22706 |-0.30896 | -1.27573 | sysbench_3x| -0.75509 | 0.09444 | -2.97756 | ---+---+-+---+ ebizzy_1x | 54.99976 | 67.29460| 74.14076 | ebizzy_2x | -8.83386 |-27.38403| -96.22066 | ---+---+-+---+ perf kvm top observation for kernbench and ebizzy (nople, 4k, 32k window) Is the perf data for 1x overcommit? Yes, 16vcpu guest on 16 core pleopt ple_gap=0 ebizzy : 18131 records/s 63.78% [guest.kernel] [g] _raw_spin_lock_irqsave 5.65% [guest.kernel] [g] smp_call_function_many 3.12% [guest.kernel] [g] clear_page 3.02% [guest.kernel] [g] down_read_trylock 1.85% [guest.kernel] [g] async_page_fault 1.81% [guest.kernel] [g] up_read 1.76% [guest.kernel] [g] native_apic_mem_write 1.70% [guest.kernel] [g] find_vma Does 'perf kvm top' not give host samples at the same time? Would be nice to see the host overhead as a function of varying ple window. I would expect that to be the major difference between 4/16/32k window sizes. No, I did something like this perf kvm --guestvmlinux ./vmlinux.guest top -g -U -d 3. Yes that is a good idea. (I am getting some segfaults with perf top, I think it is already fixed but yet to see the patch that fixes) A big concern I have (if this is 1x overcommit) for ebizzy is that it has just terrible scalability to begin with. I do not think we should try to optimize such a bad workload. I think my way of running dbench has some flaw, so I went to ebizzy. Could you let me know how you generally run dbench? -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC 1/2] kvm: Handle undercommitted guest case in PLE handler
On 10/10/12 11:54 AM, Raghavendra K T wrote: No, I did something like this perf kvm --guestvmlinux ./vmlinux.guest top -g -U -d 3. Yes that is a good idea. (I am getting some segfaults with perf top, I think it is already fixed but yet to see the patch that fixes) What version of perf: perf --version -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] virt/kvm: change kvm_assign_device() to print return value when iommu_attach_device() fails
On Mon, Oct 08, 2012 at 06:36:11PM -0600, Shuah Khan wrote: Change existing kernel error message to include return value from iommu_attach_device() when it fails. This will help debug device assignment failures more effectively. Signed-off-by: Shuah Khan shuah.k...@hp.com --- virt/kvm/iommu.c |6 +- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c index 037cb67..18e1e30 100644 --- a/virt/kvm/iommu.c +++ b/virt/kvm/iommu.c @@ -168,11 +168,7 @@ int kvm_assign_device(struct kvm *kvm, r = iommu_attach_device(domain, pdev-dev); if (r) { - printk(KERN_ERR assign device %x:%x:%x.%x failed, - pci_domain_nr(pdev-bus), - pdev-bus-number, - PCI_SLOT(pdev-devfn), - PCI_FUNC(pdev-devfn)); + dev_err(pdev-dev, kvm assign device failed ret %d, r); return r; } Why removal of domain,bus,slot,func from the message? -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 3/3] KVM: Add KVM_VCPU_GET_REG_LIST/KVM_CAP_REG_LIST.
On Wed, Sep 05, 2012 at 05:28:46PM +0930, Rusty Russell wrote: This is a generic interface to find out what you can use KVM_GET_ONE_REG/KVM_SET_ONE_REG on. Archs need to define KVM_HAVE_REG_LIST and then kvm_arch_num_regs() and kvm_arch_copy_reg_indices() functions. It's inspired by KVM_GET_MSR_INDEX_LIST, except it's a per-vcpu ioctl, and uses 64-bit indices. Signed-off-by: Rusty Russell rusty.russ...@linaro.org --- Documentation/virtual/kvm/api.txt | 20 include/linux/kvm.h | 12 include/linux/kvm_host.h |5 - virt/kvm/kvm_main.c | 20 4 files changed, 56 insertions(+), 1 deletion(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index b91bfd4..f30c3d0 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1985,6 +1985,26 @@ the virtualized real-mode area (VRMA) facility, the kernel will re-create the VMRA HPTEs on the next KVM_RUN of any vcpu.) +4.76 KVM_VCPU_GET_REG_LIST + +Capability: KVM_CAP_REG_LIST +Architectures: all +Type: vcpu ioctl +Parameters: struct kvm_reg_list (in/out) +Returns: 0 on success; -1 on error +Errors: + E2BIG: the reg index list is too big to fit in the array specified by + the user (the number required will be written into n). + +struct kvm_reg_list { + __u64 n; /* number of registers in reg[] */ + __u64 reg[0]; +}; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 169a001..453fe93 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2082,6 +2082,23 @@ out_free2: break; } #endif +#ifdef KVM_HAVE_REG_LIST + case KVM_VCPU_GET_REG_LIST: { + struct kvm_reg_list __user *user_list = argp; + struct kvm_reg_list reg_list; + unsigned n; + + if (copy_from_user(reg_list, user_list, sizeof reg_list)) + return -EFAULT; + n = reg_list.n; + reg_list.n = kvm_arch_num_regs(vcpu); The code does not actually support more than 2^32 registers, does it? Why __u64 n ? -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC 1/2] kvm: Handle undercommitted guest case in PLE handler
On 10/10/2012 11:33 PM, David Ahern wrote: On 10/10/12 11:54 AM, Raghavendra K T wrote: No, I did something like this perf kvm --guestvmlinux ./vmlinux.guest top -g -U -d 3. Yes that is a good idea. (I am getting some segfaults with perf top, I think it is already fixed but yet to see the patch that fixes) What version of perf: perf --version perf version 2.6.32-279.el6.x86_64.debug (I searched that it is fixed in 288. could not dig-out actual patch though) -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] virt/kvm: change kvm_assign_device() to print return value when iommu_attach_device() fails
On Wed, Oct 10, 2012 at 10:51 AM, Marcelo Tosatti mtosa...@redhat.com wrote: On Mon, Oct 08, 2012 at 06:36:11PM -0600, Shuah Khan wrote: Change existing kernel error message to include return value from iommu_attach_device() when it fails. This will help debug device assignment failures more effectively. Signed-off-by: Shuah Khan shuah.k...@hp.com --- virt/kvm/iommu.c |6 +- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c index 037cb67..18e1e30 100644 --- a/virt/kvm/iommu.c +++ b/virt/kvm/iommu.c @@ -168,11 +168,7 @@ int kvm_assign_device(struct kvm *kvm, r = iommu_attach_device(domain, pdev-dev); if (r) { - printk(KERN_ERR assign device %x:%x:%x.%x failed, - pci_domain_nr(pdev-bus), - pdev-bus-number, - PCI_SLOT(pdev-devfn), - PCI_FUNC(pdev-devfn)); + dev_err(pdev-dev, kvm assign device failed ret %d, r); return r; } Why removal of domain,bus,slot,func from the message? dev_err() includes that information like this: pci :00:1d.0: assign device failed ret 0 Without the removal of domain,bus,slot,func from the message: It will look like: pci :00:1d.0: assign device 0:0:1d.0 failed ret 0 Initially I left it in there and after testing, realized it is duplicate information and decided to remove it. -- Shuah -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] s390/kvm: Add documentation for KVM_S390_INTERRUPT.
On Wed, Oct 10, 2012 at 06:37:57PM +0200, Cornelia Huck wrote: On Wed, 10 Oct 2012 13:27:09 -0300 Marcelo Tosatti mtosa...@redhat.com wrote: +Archtectures: s390 typo. +Note that the vcpu ioctl is asynchronous to vpcu execution. typo Would you like a respin with the typos fixed? No. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] virt/kvm: change kvm_assign_device() to print return value when iommu_attach_device() fails
On Mon, Oct 08, 2012 at 06:36:11PM -0600, Shuah Khan wrote: Change existing kernel error message to include return value from iommu_attach_device() when it fails. This will help debug device assignment failures more effectively. Signed-off-by: Shuah Khan shuah.k...@hp.com Applied, thanks. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 00/14] KVM/ARM Implementation
On Mon, Oct 01, 2012 at 05:09:59AM -0400, Christoffer Dall wrote: The following series implements KVM support for ARM processors, specifically on the Cortex A-15 platform. We feel this is ready to be merged. Work is done in collaboration between Columbia University, Virtual Open Systems and ARM/Linaro. The patch series applies to Linux 3.6 with a number of merges: 1. git://git.kernel.org/pub/scm/linux/kernel/git/maz/arm-platforms.git branch: hyp-mode-boot-next (e5a04cb0b4a) 2. git://git.kernel.org/pub/scm/linux/kernel/git/maz/arm-platforms.git branch: timers-next (437814c44c) 3. git://git.kernel.org/pub/scm/virt/kvm/kvm.git branch: next (1e08ec4a) This is Version 12 of the patch series, the first 10 versions were reviewed on the KVM/ARM and KVM mailing lists. Changes can also be pulled from: git://github.com/virtualopensystems/linux-kvm-arm.git branch: kvm-arm-v12 branch: kvm-arm-v12-vgic branch: kvm-arm-v12-vgic-timers A non-flattened edition of the patch series, which can always be merged, can be found at: git://github.com/virtualopensystems/linux-kvm-arm.git kvm-arm-master This patch series requires QEMU compatibility. Use the branch git://github.com/virtualopensystems/qemu.git kvm-arm Following this patch series, which implements core KVM support are two other patch series implementing Virtual Generic Interrupt Controller (VGIC) support and Architected Generic Timers. All three patch series should be applied for full QEMU compatibility. The implementation is broken up into a logical set of patches, the first are preparatory patches: 1. ARM: Add page table defines for KVM 3. ARM: Section based HYP idmaps 3. ARM: Factor out cpuid implementor and part_number fields The main implementation is broken up into separate patches, the first containing a skeleton of files, makefile changes, the basic user space interface and KVM architecture specific stubs. Subsequent patches implement parts of the system as listed: 4. Skeleton and reset hooks 5. Hypervisor initialization 6. Memory virtualization setup (hyp mode mappings and 2nd stage) 7. Inject IRQs and FIQs from userspace 8. World-switch implementation and Hyp exception vectors 9. Emulation framework and coproc emulation 10. Coproc user space API 11. Demux multiplexed coproc registers 12. User spac API to get/set VFP registers 13. Handle guest user memory aborts 14. Handle guest MMIO aborts Testing: Tested on FAST Models and Versatile Express test-chip2. Tested by running three simultaenous VMs, all running SMP, on an SMP host, each VM running hackbench and cyclictest and with extreme memory pressure applied to the host with swapping enabled to provoke page eviction. Also tested KSM merging and GCC inside VMs. Fully boots both Ubuntu (user space Thumb-2) and Debian (user space ARM) guests. For a guide on how to set up a testing environment and try out these patches, see: http://www.virtualopensystems.com/media/pdf/kvm-arm-guide.pdf Changes since v11: - Memory setup and page table defines reworked - We do not export unused perf bitfields anymore - No module support anymore and following cleanup - Hide vcpu register accessors - Fix unmap range mmu notifier race condition - Factored out A15 coprocs in separate file - Factored out world-switch assembly macros to separate file - Add dmux of multiplexed coprocs to user space - Add VFP get/set interface to user space - Addressed various cleanup comments from reviewers Changes since v10: - Boot in Hyp mode and user HVC to initialize HVBAR - Support VGIC - Support Arch timers - Support Thumb-2 mmio instruction decoding - Transition to GET_ONE/SET_ONE register API - Added KVM_VCPU_GET_REG_LIST - New interrupt injection API - Don't pin guest pages anymore - Fix race condition in page fault handler - Cleanup guest instruction copying. - Fix race when copying SMP guest instructions - Inject data/prefetch aborts when guest does something strange Changes since v9: - Addressed reviewer comments (see mailing list archive) - Limit the user of .arch_extensiion sec/virt for compilers that need them - VFP/Neon Support (Antonios Motakis) - Run exit handling under preemption and still handle guest cache ops - Add support for IO mapping at Hyp level (VGIC prep) - Add support for IO mapping at Guest level (VGIC prep) - Remove backdoor call to irq_svc - Complete rework of CP15 handling and register reset (Rusty Russell) - Don't use HSTR for anything else than CR 15 - New ioctl to set emulation target core (only A15 supported for now) - Support KVM_GET_MSRS / KVM_SET_MSRS - Add page accounting and page table eviction - Change pgd lock to spinlock and fix sleeping in atomic bugs - Check kvm_condition_valid for HVC traps of undefs - Added a naive implementation of kvm_unmap_hva_range
Re: [PATCH 2/3] s390/kvm: Add documentation for KVM_S390_INTERRUPT.
On Wed, Oct 10, 2012 at 03:43:10PM -0300, Marcelo Tosatti wrote: On Wed, Oct 10, 2012 at 06:37:57PM +0200, Cornelia Huck wrote: On Wed, 10 Oct 2012 13:27:09 -0300 Marcelo Tosatti mtosa...@redhat.com wrote: +Archtectures: s390 typo. +Note that the vcpu ioctl is asynchronous to vpcu execution. typo Would you like a respin with the typos fixed? No. Applied, thanks. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC 1/2] kvm: Handle undercommitted guest case in PLE handler
On Wed, 2012-10-10 at 23:13 +0530, Raghavendra K T wrote: On 10/10/2012 07:54 PM, Andrew Theurer wrote: I ran 'perf sched map' on the dbench workload for medium and large VMs, and I thought I would share some of the results. I think it helps to visualize what's going on regarding the yielding. These files are png bitmaps, generated from processing output from 'perf sched map' (and perf data generated from 'perf sched record'). The Y axis is the host cpus, each row being 10 pixels high. For these tests, there are 80 host cpus, so the total height is 800 pixels. The X axis is time (in microseconds), with each pixel representing 1 microsecond. Each bitmap plots 30,000 microseconds. The bitmaps are quite wide obviously, and zooming in/out while viewing is recommended. Each row (each host cpu) is assigned a color based on what thread is running. vCPUs of the same VM are assigned a common color (like red, blue, magenta, etc), and each vCPU has a unique brightness for that color. There are a maximum of 12 assignable colors, so in any VMs 12 revert to vCPU color of gray. I would use more colors, but it becomes harder to distinguish one color from another. The white color represents missing data from perf, and black color represents any thread which is not a vCPU. For the following tests, VMs were pinned to host NUMA nodes and to specific cpus to help with consistency and operate within the constraints of the last test (gang scheduler). Here is a good example of PLE. These are 10-way VMs, 16 of them (as described above only 12 of the VMs have a color, rest are gray). https://docs.google.com/open?id=0B6tfUNlZ-14wdmFqUmE5QjJHMFU This looks very nice to visualize what is happening. Beginning of the graph looks little messy but later it is clear. If you zoom out and look at the whole bitmap, you may notice the 4ms intervals of the scheduler. They are pretty well aligned across all cpus. Normally, for cpu bound workloads, we would expect to see each thread to run for 4 ms, then something else getting to run, and so on. That is mostly true in this test. We have 2x over-commit and we generally see the switching of threads at 4ms. One thing to note is that not all vCPU threads for the same VM run at exactly the same time, and that is expected and the whole reason for lock-holder preemption. Now, if you zoom in on the bitmap, you should notice within the 4ms intervals there is some task switching going on. This is most likely because of the yield_to initiated by the PLE handler. In this case there is not that much yielding to do. It's quite clean, and the performance is quite good. Below is an example of PLE, but this time with 20-way VMs, 8 of them. CPU over-commit is still 2x. https://docs.google.com/open?id=0B6tfUNlZ-14wdmFqUmE5QjJHMFU I think this link still 10x16. Could you paste the link again? Oops https://docs.google.com/open?id=0B6tfUNlZ-14wSGtYYzZtRTcyVjQ This one looks quite different. In short, it's a mess. The switching between tasks can be lower than 10 microseconds. It basically never recovers. There is constant yielding all the time. Below is again 8 x 20-way VMs, but this time I tried out Nikunj's gang scheduling patches. While I am not recommending gang scheduling, I think it's a good data point. The performance is 3.88x the PLE result. https://docs.google.com/open?id=0B6tfUNlZ-14wWXdscWcwNTVEY3M Note that the task switching intervals of 4ms are quite obvious again, and this time all vCPUs from same VM run at the same time. It represents the best possible outcome. Anyway, I thought the bitmaps might help better visualize what's going on. -Andrew -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC 1/2] kvm: Handle undercommitted guest case in PLE handler
On Wed, 2012-10-10 at 23:24 +0530, Raghavendra K T wrote: On 10/10/2012 08:29 AM, Andrew Theurer wrote: On Wed, 2012-10-10 at 00:21 +0530, Raghavendra K T wrote: * Avi Kivity a...@redhat.com [2012-10-04 17:00:28]: On 10/04/2012 03:07 PM, Peter Zijlstra wrote: On Thu, 2012-10-04 at 14:41 +0200, Avi Kivity wrote: Again the numbers are ridiculously high for arch_local_irq_restore. Maybe there's a bad perf/kvm interaction when we're injecting an interrupt, I can't believe we're spending 84% of the time running the popf instruction. Smells like a software fallback that doesn't do NMI, hrtimer based sampling typically hits popf where we re-enable interrupts. Good nose, that's probably it. Raghavendra, can you ensure that the PMU is properly exposed? 'dmesg' in the guest will tell. If it isn't, -cpu host will expose it (and a good idea anyway to get best performance). Hi Avi, you are right. SandyBridge machine result was not proper. I cleaned up the services, enabled PMU, re-ran all the test again. Here is the summary: We do get good benefit by increasing ple window. Though we don't see good benefit for kernbench and sysbench, for ebizzy, we get huge improvement for 1x scenario. (almost 2/3rd of ple disabled case). Let me know if you think we can increase the default ple_window itself to 16k. I am experimenting with V2 version of undercommit improvement(this) patch series, But I think if you wish to go for increase of default ple_window, then we would have to measure the benefit of patches when ple_window = 16k. I can respin the whole series including this default ple_window change. I also have the perf kvm top result for both ebizzy and kernbench. I think they are in expected lines now. Improvements 16 core PLE machine with 16 vcpu guest base = 3.6.0-rc5 + ple handler optimization patches base_pleopt_16k = base + ple_window = 16k base_pleopt_32k = base + ple_window = 32k base_pleopt_nople = base + ple_gap = 0 kernbench, hackbench, sysbench (time in sec lower is better) ebizzy (rec/sec higher is better) % improvements w.r.t base (ple_window = 4k) ---+---+-+---+ |base_pleopt_16k| base_pleopt_32k | base_pleopt_nople | ---+---+-+---+ kernbench_1x | 0.42371 | 1.15164| 0.09320 | kernbench_2x | -1.40981 | -17.48282 | -570.77053 | ---+---+-+---+ sysbench_1x| -0.92367 | 0.24241 | -0.27027 | sysbench_2x| -2.22706 |-0.30896 | -1.27573 | sysbench_3x| -0.75509 | 0.09444 | -2.97756 | ---+---+-+---+ ebizzy_1x | 54.99976 | 67.29460| 74.14076 | ebizzy_2x | -8.83386 |-27.38403| -96.22066 | ---+---+-+---+ perf kvm top observation for kernbench and ebizzy (nople, 4k, 32k window) Is the perf data for 1x overcommit? Yes, 16vcpu guest on 16 core pleopt ple_gap=0 ebizzy : 18131 records/s 63.78% [guest.kernel] [g] _raw_spin_lock_irqsave 5.65% [guest.kernel] [g] smp_call_function_many 3.12% [guest.kernel] [g] clear_page 3.02% [guest.kernel] [g] down_read_trylock 1.85% [guest.kernel] [g] async_page_fault 1.81% [guest.kernel] [g] up_read 1.76% [guest.kernel] [g] native_apic_mem_write 1.70% [guest.kernel] [g] find_vma Does 'perf kvm top' not give host samples at the same time? Would be nice to see the host overhead as a function of varying ple window. I would expect that to be the major difference between 4/16/32k window sizes. No, I did something like this perf kvm --guestvmlinux ./vmlinux.guest top -g -U -d 3. Yes that is a good idea. (I am getting some segfaults with perf top, I think it is already fixed but yet to see the patch that fixes) A big concern I have (if this is 1x overcommit) for ebizzy is that it has just terrible scalability to begin with. I do not think we should try to optimize such a bad workload. I think my way of running dbench has some flaw, so I went to ebizzy. Could you let me know how you generally run dbench? I mount a tmpfs and then specify that mount for dbench to run on. This eliminates all IO. I use a 300 second run time and number of threads is equal to number of vcpus. All of the VMs of course need to have a synchronized start. I would also make sure you are using a recent kernel for dbench, where the dcache scalability is much improved. Without any
Guest status - Windows 8 developer preview
Hey, I just installed Windows 8 developer preview on our libvirt/kvm setup here at work, for IE10 testing. Host: - Debian 6.0.4 - 2.6.32-5-amd64 - AMD FX(tm)-6100 Six-Core Processor (3.3 Ghz) - 4 gig ram Guest is set up to use 1 gig of ram, it's the 32-bit version installed. Everything appears to work correctly. We access it using VNC which has been working fine. Performance is acceptable, although we might think of bumping up the allocated guest ram. CPU usage appears a bit on the high side, I'm monitoring the situation. Cheers, Josh -- leet programming master www.karmabunny.com.au talk // 08 7120 7100 post // po box 188, stepney sa 5069 street // the church, 6 george street, stepney sa 5069 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH] Enabling IA32_TSC_ADJUST for guest VM
Marcelo, You are suggesting that I: Kvm_host.h: Struct kvm_x86_ops { ... change Int (*set_msr)(struct kvm_vcpu * vcpu, u32 mrs_index, u64 data); to Int (*set_msr)(struct kvm_vcpu * vcpu, u32 mrs_index, u64 data, bool from_guest); ... }; and so on down the line to set_msr_common(), kvm_write_tsc()... in a separate patch before other related patches? As far as the initialization after live migration, I will provide some output with explanation once I am able to again. At the moment, I have hosed my system and need to figure out what's wrong and fix it first. Thanks, Will -Original Message- From: Marcelo Tosatti [mailto:mtosa...@redhat.com] Sent: Wednesday, October 10, 2012 5:53 AM To: Auld, Will Cc: Avi Kivity; kvm@vger.kernel.org; Zhang, Xiantao; Liu, Jinsong Subject: Re: [PATCH] Enabling IA32_TSC_ADJUST for guest VM On Tue, Oct 09, 2012 at 04:10:30PM +, Auld, Will wrote: I am just testing the second version of this patch. It addresses all the comments so far except Marcelo's issue with breaking the function compute_guest_tsc(). Lets try to merge the missing patch from Zachary first (that'll make it clear). I needed to put the call for updating the TSC_ADJUST_MSR in kvm_write_tsc() to ensure it is only called from user space. Other changes added to vmcs offset should not be tracked in TSC_ADJUST_MSR. Please have a separate, earlier patch making that explicit (by passing a bool to kvm_x86_ops-set_msr then to kvm_set_msr_common). that = whether msr write is guest initiated or not. I had some trouble with the order of initialization during live migration. TSC_ADJUST is initialized first but then wiped out by multiple initializations of tsc. The fix for this is to not update TSC_ADJUST if the vmcs offset is not actually changing with the tsc write. So, after migration outcome is that vmcs offset gets defined independent from the migrating value of TSC_ADJUST. I believe this is what we want to happen. Can you please be more explicit regarding wiped out by multiple initializations of tsc ? It is probably best to maintain TSC_ADJUST separately, in software, and then calculate TSC_OFFSET. Thanks, Will -Original Message- From: Avi Kivity [mailto:a...@redhat.com] Sent: Tuesday, October 09, 2012 5:12 AM To: Marcelo Tosatti Cc: Auld, Will; kvm@vger.kernel.org; Zhang, Xiantao Subject: Re: [PATCH] Enabling IA32_TSC_ADJUST for guest VM On 10/08/2012 07:30 PM, Marcelo Tosatti wrote: From Intel's manual: • If an execution of WRMSR to the IA32_TIME_STAMP_COUNTER MSR adds (or subtracts) value X from the TSC, the logical processor also adds (or subtracts) value X from the IA32_TSC_ADJUST MSR. This is not handled in the patch. To support migration, it will be necessary to differentiate between guest initiated and userspace-model initiated msr write. That is, only guest initiated TSC writes should affect the value of IA32_TSC_ADJUST MSR. Avi, any better idea? I think we need that anyway, since there are some read-only MSRs that need to be configured by the host (nvmx capabilities). So if we add that feature it will be useful elsewhere. I don't think it's possible to do it in any other way: Local offset value of the IA32_TSC for a logical processor. Reset value is Zero. A write to IA32_TSC will modify the local offset in IA32_TSC_ADJUST and the content of IA32_TSC, but does not affect the internal invariant TSC hardware. What we want to do is affect the internal invariant TSC hardware, so we can't do that through the normal means. btw, will tsc writes from userspace (after live migration) cause tsc skew? If so we should think how to model a guest-wide tsc. -- error compiling committee.c: too many arguments to function N?r??yb?X??ǧv?^?){.n?+h????ܨ}???Ơz?j:+v??? zZ+??+zf???h???~i???z??w??)ߢf
Re: [Qemu-devel] Using PCI config space to indicate config location
Rusty Russell ru...@rustcorp.com.au writes: Jamie Lokier ja...@shareable.org writes: Rusty Russell wrote: I don't think it'll be that bad; reset clears the device to unknown, bar0 moves it from unknown-legacy mode, bar1/2/3 changes it from unknown-modern mode, and anything else is bad (I prefer being strict so we catch bad implementations from the beginning). Will that work, if the guest with kernel that uses modern mode, kexecs to an older (but presumed reliable) kernel that only knows about legacy mode? I.e. will the replacement kernel, or (ideally) replacement driver on the rare occasion that is needed on a running kernel, be able to reset the device hard enough? Well, you need to reset the device, so yes. MST said something which made me think harder about this case. Either there needs to be a way to tell what mode the device is in, or legacy reset has to work, even in modern mode. The latter is preferable, since it allows an older kernel to do the reset. Now, since qemu would almost certainly have to add code to stop that working, it'll probably be fine. But I'd really like to add a strict mode to qemu virtio which does extra sanity checking for driver authors, and that might break this. That's OK. Thanks, Rusty. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] Using PCI config space to indicate config location
Michael S. Tsirkin m...@redhat.com writes: On Wed, Oct 10, 2012 at 08:36:12AM -0500, Anthony Liguori wrote: Rusty Russell ru...@rustcorp.com.au writes: Gerd Hoffmann kra...@redhat.com writes: So how about this: (1) Add a vendor specific pci capability for new-style virtio. Specifies the pci bar used for new-style virtio registers. Guests can use it to figure whenever new-style virtio is supported and to map the correct bar (which will probably be bar 1 in most cases). This was closer to the original proposal[1], which I really liked (you can layout bars however you want). Anthony thought that vendor capabilities were a PCI-e feature, but it seems they're blessed in PCI 2.3. 2.3 was standardized in 2002. Are we confident that vendor extensions play nice with pre-2.3 OSes like Win2k, WinXP, etc? 2.2 (1998) had the capability IDs linked list, indicated by bit 4 in the status register, but reserved ids 7 and above. ID 9 (vendor specific) was added in 2.3; it should be ignored, but will require testing of course, like any change. 2.1 didn't have the capability ID linked list at all; bit 4 in the status register was reserved. QEMU's pci.c has capability support, and sets the capabiliy status bit and list pointer when the driver requests (eg. the eepro100). Pre 2.3 it was the case that *all* space outside the capability linked list, and any capability not recognized by space, was considered vendor specific. So there's no problem really. Oh in theory, sure. In practice, almost certainly. But this needs to be proved with actual testing. Cheers, Rusty. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/3] virtio-net: inline header support
Paolo Bonzini pbonz...@redhat.com writes: Il 09/10/2012 06:59, Rusty Russell ha scritto: Paolo Bonzini pbonz...@redhat.com writes: Il 05/10/2012 07:43, Rusty Russell ha scritto: That's good. But virtio_blk's scsi command is insoluble AFAICT. As I said to Anthony, the best rules are always and never, so I'd really rather not have to grandfather that in. It is, but we can add a rule that if the (transport) flag VIRTIO_RING_F_ANY_HEADER_SG is set, the cdb field is always 32 bytes in virtio-blk. Could we do that? It's the cmd length I'm concerned about; is it always 32 in practice for some reason? It is always 32 or less except in very obscure cases that are pretty much confined to iSCSI. We don't care about the obscure cases, and the extra bytes don't hurt. BTW, 32 is the default cdb_size used by virtio-scsi. Currently qemu does: struct sg_io_hdr hdr; memset(hdr, 0, sizeof(struct sg_io_hdr)); hdr.interface_id = 'S'; hdr.cmd_len = req-elem.out_sg[1].iov_len; hdr.cmdp = req-elem.out_sg[1].iov_base; hdr.dxfer_len = 0; If it's a command which expects more output data, there's no way to guess where the boundary is between that command and the data. Yep, so I understood the problem right. OK. Well, Anthony wants qemu to be robust in this regard, so I am tempted to rework all the qemu drivers to handle arbitrary layouts. They could use a good audit anyway. This would become a glaring exception, but I'm tempted to fix it to 32 bytes at the same time as we get the new pci layout (ie. for the virtio 1.0 spec). The Linux driver would carefully be backwards compatible, of course, and the spec would document why. Cheers, Rusty. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 6/7] spapr: Pass PowerPCCPU to spapr_hypercall()
Needed for changing the hypercall handlers' argument type to PowerPCCPU. Signed-off-by: Andreas Färber afaer...@suse.de --- hw/spapr.c |2 +- hw/spapr.h |2 +- hw/spapr_hcall.c |4 +++- target-ppc/kvm.c |3 ++- 4 Dateien geändert, 7 Zeilen hinzugefügt(+), 4 Zeilen entfernt(-) diff --git a/hw/spapr.c b/hw/spapr.c index 914d60c..ac1f7de 100644 --- a/hw/spapr.c +++ b/hw/spapr.c @@ -578,7 +578,7 @@ static void emulate_spapr_hypercall(PowerPCCPU *cpu) hcall_dprintf(Hypercall made with MSR[PR]=1\n); env-gpr[3] = H_PRIVILEGE; } else { -env-gpr[3] = spapr_hypercall(env, env-gpr[3], env-gpr[4]); +env-gpr[3] = spapr_hypercall(cpu, env-gpr[3], env-gpr[4]); } } diff --git a/hw/spapr.h b/hw/spapr.h index e984e3f..feea401 100644 --- a/hw/spapr.h +++ b/hw/spapr.h @@ -288,7 +288,7 @@ typedef target_ulong (*spapr_hcall_fn)(CPUPPCState *env, sPAPREnvironment *spapr target_ulong *args); void spapr_register_hypercall(target_ulong opcode, spapr_hcall_fn fn); -target_ulong spapr_hypercall(CPUPPCState *env, target_ulong opcode, +target_ulong spapr_hypercall(PowerPCCPU *cpu, target_ulong opcode, target_ulong *args); int spapr_allocate_irq(int hint, bool lsi); diff --git a/hw/spapr_hcall.c b/hw/spapr_hcall.c index 194d9c2..1594f30 100644 --- a/hw/spapr_hcall.c +++ b/hw/spapr_hcall.c @@ -678,9 +678,11 @@ void spapr_register_hypercall(target_ulong opcode, spapr_hcall_fn fn) *slot = fn; } -target_ulong spapr_hypercall(CPUPPCState *env, target_ulong opcode, +target_ulong spapr_hypercall(PowerPCCPU *cpu, target_ulong opcode, target_ulong *args) { +CPUPPCState *env = cpu-env; + if ((opcode = MAX_HCALL_OPCODE) ((opcode 0x3) == 0)) { spapr_hcall_fn fn = papr_hypercall_table[opcode / 4]; diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c index d7d8e8f..9d381ce 100644 --- a/target-ppc/kvm.c +++ b/target-ppc/kvm.c @@ -818,7 +818,8 @@ int kvm_arch_handle_exit(CPUPPCState *env, struct kvm_run *run) #ifdef CONFIG_PSERIES case KVM_EXIT_PAPR_HCALL: dprintf(handle PAPR hypercall\n); -run-papr_hcall.ret = spapr_hypercall(env, run-papr_hcall.nr, +run-papr_hcall.ret = spapr_hypercall(ppc_env_get_cpu(env), + run-papr_hcall.nr, run-papr_hcall.args); ret = 0; break; -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 3/7] target-ppc: Rename kvm_kick_{env = cpu} and pass PowerPCCPU
Needed for changing qemu_cpu_kick() argument type to CPUState. Signed-off-by: Andreas Färber afaer...@suse.de --- target-ppc/kvm.c |8 ++-- 1 Datei geändert, 6 Zeilen hinzugefügt(+), 2 Zeilen entfernt(-) diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c index 5cbe98a..d7d8e8f 100644 --- a/target-ppc/kvm.c +++ b/target-ppc/kvm.c @@ -73,8 +73,11 @@ static int cap_hior; */ static QEMUTimer *idle_timer; -static void kvm_kick_env(void *env) +static void kvm_kick_cpu(void *opaque) { +PowerPCCPU *cpu = opaque; +CPUPPCState *env = cpu-env; + qemu_cpu_kick(env); } @@ -375,6 +378,7 @@ static inline void kvm_fixup_page_sizes(CPUPPCState *env) int kvm_arch_init_vcpu(CPUPPCState *cenv) { +PowerPCCPU *cpu = ppc_env_get_cpu(cenv); int ret; /* Gather server mmu info from KVM and update the CPU state */ @@ -386,7 +390,7 @@ int kvm_arch_init_vcpu(CPUPPCState *cenv) return ret; } -idle_timer = qemu_new_timer_ns(vm_clock, kvm_kick_env, cenv); +idle_timer = qemu_new_timer_ns(vm_clock, kvm_kick_cpu, cpu); /* Some targets support access to KVM's guest TLB. */ switch (cenv-mmu_model) { -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Using PCI config space to indicate config location
Michael S. Tsirkin m...@redhat.com writes: On Mon, Oct 08, 2012 at 12:51:25PM +1030, Rusty Russell wrote: Note before anyone gets confused; we were talking about using the PCI config space to indicate what BAR(s) the virtio stuff is in. An alternative would be to simply specify a new layout format in BAR1. One problem we are still left with is this: device specific config accesses are still non atomic. This is a problem for multibyte fields such as MAC address where MAC could change while we are accessing it. It's also a problem for related fields, eg. console width and height, or disk geometry. I was thinking about some backwards compatible way to solve this, but if we are willing to break compatiblity or use some mode switch, how about we give up on virtio config space completely, and do everything besides IO and ISR through guest memory? I think there's still a benefit in the simple publishing of information: I don't really want to add a control queue for the console. But inevitably, once-static information can change in later versions, and it's horrible to have config information plus a bit that says don't use this, use the control queue. Here's a table from a quick audit: Driver Config Device changesDriver writes... after init? net YY NN block YY YY console YY NN rng NN NN balloon YY YY scsiYN YN 9p YN NN For config space reads, I suggest the driver publish a generation count. For writes, the standard seems to be a commit latch. We could abuse the generation count for this: the driver writes to it to commit config changes. ie: /* Fields in VIRTIO_PCI_CAP_COMMON_CFG: */ struct virtio_pci_common_cfg { /* About the whole device. */ __le32 device_feature_select; /* read-write */ __le32 device_feature; /* read-only */ __le32 guest_feature_select;/* read-write */ __le32 guest_feature; /* read-only */ __le32 config_gen_and_latch;/* read-write */ __le16 msix_config; /* read-write */ __u8 device_status; /* read-write */ __u8 unused; /* About a specific virtqueue. */ __le16 queue_select;/* read-write */ __le16 queue_align; /* read-write, power of 2. */ __le16 queue_size; /* read-write, power of 2. */ __le16 queue_msix_vector;/* read-write */ __le64 queue_address; /* read-write: 0x == DNE. */ }; Thoughts? Rusty. PS. Let's make all the virtio-device config LE, too... -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [kvmarm] [PATCH v2 12/14] KVM: ARM: VFP userspace interface
Christoffer Dall c.d...@virtualopensystems.com writes: On Tue, Oct 9, 2012 at 2:11 PM, Peter Maydell peter.mayd...@linaro.org wrote: On 1 October 2012 10:11, Christoffer Dall c.d...@virtualopensystems.com wrote: From: Rusty Russell rusty.russ...@linaro.org ... +ARM 32-bit VFP control registers have the following id bit patterns: + 0x4002 0012 1 selector:3 + +ARM 64-bit FP registers have the following id bit patterns: + 0x4002 0012 0 selector:3 + In the other entries in this section, foo:3 indicates a 3 bit field. But I think for these two it's trying to indicate a 3 byte field. ... right you are, I'll modify this. Thanks! Rusty. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] vhost-blk: Add vhost-blk support v3
vhost-blk is an in-kernel virito-blk device accelerator. Due to lack of proper in-kernel AIO interface, this version converts guest's I/O request to bio and use submit_bio() to submit I/O directly. So this version any supports raw block device as guest's disk image, e.g. /dev/sda, /dev/ram0. We can add file based image support to vhost-blk once we have in-kernel AIO interface. There are some work in progress for in-kernel AIO interface from Dave Kleikamp and Zach Brown: http://marc.info/?l=linux-fsdevelm=133312234313122 Performance evaluation: - 1) LKVM Fio with libaio ioengine on Fusion IO device using kvm tool IOPS Before After Improvement seq-read 107 121 +13.0% seq-write 130 179 +37.6% rnd-read 102 122 +19.6% rnd-write 125 159 +27.0% 2) QEMU Fio with libaio ioengine on Fusion IO device using QEMU IOPS Before After Improvement seq-read 76 123 +61.8% seq-write 139 173 +24.4% rnd-read 73 120 +64.3% rnd-write 75 156 +108.0% Userspace bits: - 1) LKVM The latest vhost-blk userspace bits for kvm tool can be found here: g...@github.com:asias/linux-kvm.git blk.vhost-blk 2) QEMU The latest vhost-blk userspace prototype for QEMU can be found here: g...@github.com:asias/qemu.git blk.vhost-blk Changes in V3: - Sending REQ_FLUSH bio instead of vfs_fsync, thanks Christoph! - Check file passed by user is a raw block device file Signed-off-by: Asias He as...@redhat.com --- drivers/vhost/Kconfig | 1 + drivers/vhost/Kconfig.blk | 10 + drivers/vhost/Makefile| 2 + drivers/vhost/blk.c | 669 ++ drivers/vhost/blk.h | 8 + 5 files changed, 690 insertions(+) create mode 100644 drivers/vhost/Kconfig.blk create mode 100644 drivers/vhost/blk.c create mode 100644 drivers/vhost/blk.h diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig index 202bba6..acd8038 100644 --- a/drivers/vhost/Kconfig +++ b/drivers/vhost/Kconfig @@ -11,4 +11,5 @@ config VHOST_NET if STAGING source drivers/vhost/Kconfig.tcm +source drivers/vhost/Kconfig.blk endif diff --git a/drivers/vhost/Kconfig.blk b/drivers/vhost/Kconfig.blk new file mode 100644 index 000..ff8ab76 --- /dev/null +++ b/drivers/vhost/Kconfig.blk @@ -0,0 +1,10 @@ +config VHOST_BLK + tristate Host kernel accelerator for virtio blk (EXPERIMENTAL) + depends on BLOCK EXPERIMENTAL m + ---help--- + This kernel module can be loaded in host kernel to accelerate + guest block with virtio_blk. Not to be confused with virtio_blk + module itself which needs to be loaded in guest kernel. + + To compile this driver as a module, choose M here: the module will + be called vhost_blk. diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile index a27b053..1a8a4a5 100644 --- a/drivers/vhost/Makefile +++ b/drivers/vhost/Makefile @@ -2,3 +2,5 @@ obj-$(CONFIG_VHOST_NET) += vhost_net.o vhost_net-y := vhost.o net.o obj-$(CONFIG_TCM_VHOST) += tcm_vhost.o +obj-$(CONFIG_VHOST_BLK) += vhost_blk.o +vhost_blk-y := blk.o diff --git a/drivers/vhost/blk.c b/drivers/vhost/blk.c new file mode 100644 index 000..4a4e793 --- /dev/null +++ b/drivers/vhost/blk.c @@ -0,0 +1,669 @@ +/* + * Copyright (C) 2011 Taobao, Inc. + * Author: Liu Yuan tailai...@taobao.com + * + * Copyright (C) 2012 Red Hat, Inc. + * Author: Asias He as...@redhat.com + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + * virtio-blk server in host kernel. + */ + +#include linux/miscdevice.h +#include linux/module.h +#include linux/vhost.h +#include linux/virtio_blk.h +#include linux/mutex.h +#include linux/file.h +#include linux/kthread.h +#include linux/blkdev.h + +#include vhost.c +#include vhost.h +#include blk.h + +#define BLK_HDR0 + +static DEFINE_IDA(vhost_blk_index_ida); + +enum { + VHOST_BLK_VQ_REQ = 0, + VHOST_BLK_VQ_MAX = 1, +}; + +struct req_page_list { + struct page **pages; + int pages_nr; +}; + +struct vhost_blk_req { + struct llist_node llnode; + struct req_page_list *pl; + struct vhost_blk *blk; + + struct iovec *iov; + int iov_nr; + + struct bio **bio; + atomic_t bio_nr; + + sector_t sector; + int write; + u16 head; + long len; + + u8 *status; +}; + +struct vhost_blk { + struct task_struct *host_kick; + struct vhost_blk_req *reqs; + struct vhost_virtqueue vq; + struct llist_head llhead; + struct vhost_dev dev; + u16 reqs_nr; + int index; +}; + +static inline int iov_num_pages(struct iovec *iov) +{ + return (PAGE_ALIGN((unsigned long)iov-iov_base + iov-iov_len) - + ((unsigned long)iov-iov_base PAGE_MASK)) PAGE_SHIFT; +} + +static int vhost_blk_setup(struct vhost_blk *blk) +{ +
Using taskset , powertop and perf with qemu-kvm
Hello , I wish to run some cpu intensive tasks on a vm , use only one core specifically #0 of the 6 available ( Xeon 3670 ) and all the while record basic perf events like cpu-cycles , instructions etc. I am using taskset to define the cpu-affinity of the qemu process : taskset -p 0x1 `pgrep qemu` (assuming only 1 vm is running ) I can verify that the vm is now running only on core 0 using top. top also shows that the qemu process is using 99% of it's share of CPU time ( %CPU column in top ) upon running the tasks in vm. However what I fail to see is the increase in time spent of the core 0 in the C0 ( active ) state via powertop. It barely manages to cross 12%. If I run the same tasks ( FFT benchmark ) on the host on a different core ( say #1 ) and monitor it using powertop again , I can see that C0 % for core 1 is above 100% ( 104 ~ 107 ) So why does not the output of top ( qemu is using 99% of it's cpu time , which is spent on core 0 ) co-relate with powertop ( which says core 0 is idle for around 88% of time ) ? Also , powertop reports that core 2 is being used heavily ( 104 % ) and I suspect this is one which is executing the vm operations. But why ? There are no other cpu/memory intensive tasks running on the host. Next , I am using the perf kvm record function to save the stats. If I use all the 7 core counters available along-with -a option , perf generates ~2MB/second . Is there a way to save just the raw count of the events ? Also , if instead of -a , if I give -C 0 , perf just saves around 0.016MB/s as opposed to 1.894MB/s in case of -a option. If the vm is indeed using core 0 , then perf should have recorded the events. On a last note , does perf kvm automatically selects the vm to profile ( in case multiple vms are running ) on the basis of the guest.kallsyms and guest.modules files passed to it ? Thanks , Jaspal -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Using PCI config space to indicate config location
On Mon, Oct 08, 2012 at 12:51:25PM +1030, Rusty Russell wrote: (Topic updated, cc's trimmed). Anthony Liguori aligu...@us.ibm.com writes: Rusty Russell ru...@rustcorp.com.au writes: 4) The only significant change to the spec is that we use PCI capabilities, so we can have infinite feature bits. (see http://lists.linuxfoundation.org/pipermail/virtualization/2011-December/019198.html) We discussed this on IRC last night. I don't think PCI capabilites are a good mechanism to use... PCI capabilities are there to organize how the PCI config space is allocated to allow vendor extensions to co-exist with future PCI extensions. But we've never used the PCI config space within virtio-pci. We do everything in BAR0. I don't think there's any real advantage of using the config space vs. a BAR for virtio-pci. Note before anyone gets confused; we were talking about using the PCI config space to indicate what BAR(s) the virtio stuff is in. An alternative would be to simply specify a new layout format in BAR1. One problem we are still left with is this: device specific config accesses are still non atomic. This is a problem for multibyte fields such as MAC address where MAC could change while we are accessing it. I was thinking about some backwards compatible way to solve this, but if we are willing to break compatiblity or use some mode switch, how about we give up on virtio config space completely, and do everything besides IO and ISR through guest memory? -- MST -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] Using PCI config space to indicate config location
On Tue, Oct 09, 2012 at 10:26:12AM +1030, Rusty Russell wrote: Anthony Liguori aligu...@us.ibm.com writes: Gerd Hoffmann kra...@redhat.com writes: Hi, So we could have for virtio something like this: Capabilities: [??] virtio-regs: legacy: BAR=0 offset=0 virtio-pci: BAR=1 offset=1000 virtio-cfg: BAR=1 offset=1800 This would be a vendor specific PCI capability so lspci wouldn't automatically know how to parse it. Sure, would need a patch to actually parse+print the cap, /me was just trying to make my point clear in a simple way. 2) ISTR an argument about mapping the ISR register separately, for performance, but I can't find a reference to it. I think the rationale is that ISR really needs to be PIO but everything else doesn't. PIO is much faster on x86 because it doesn't require walking page tables or instruction emulation to handle the exit. Is this still a pressing issue? With MSI-X enabled ISR isn't needed, correct? Which would imply that pretty much only old guests without MSI-X support need this, and we don't need to worry that much when designing something new ... It wasn't that long ago that MSI-X wasn't supported.. I think we should continue to keep ISR as PIO as it is a fast path. No problem if we allow to have both legacy layout and new layout at the same time. Guests can continue to use ISR @ BAR0 in PIO space for existing virtio devices, even in case they want use mmio for other registers - all fine. New virtio devices can support MSI-X from day one and decide to not expose a legacy layout PIO bar. I think having BAR1 be an MMIO mirror of the registers + a BAR2 for virtio configuration space is probably not that bad of a solution. Well, we also want to clean up the registers, so how about: BAR0: legacy, as is. If you access this, don't use the others. BAR1: new format virtio-pci layout. If you use this, don't use BAR0. BAR2: virtio-cfg. If you use this, don't use BAR0. BAR3: ISR. If you use this, don't use BAR0. One problem here is there are only 3 64-bit BARs under PCI. There's no point to make IO BAR 64-bit on x86, but there might be for memory. I prefer the cases exclusive (ie. use one or the other) as a clear path to remove the legacy layout; and leaving the ISR in BAR0 leaves us with an ugly corner case in future (ISR is BAR0 + 19? WTF?). As to MMIO vs PIO, the BARs are self-describing, so we should explicitly endorse that and leave it to the devices. The detection is simple: if BAR1 has non-zero length, it's new-style, otherwise legacy. Thoughts? Rusty. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] Using PCI config space to indicate config location
Jamie Lokier ja...@shareable.org writes: Rusty Russell wrote: I don't think it'll be that bad; reset clears the device to unknown, bar0 moves it from unknown-legacy mode, bar1/2/3 changes it from unknown-modern mode, and anything else is bad (I prefer being strict so we catch bad implementations from the beginning). Will that work, if the guest with kernel that uses modern mode, kexecs to an older (but presumed reliable) kernel that only knows about legacy mode? I.e. will the replacement kernel, or (ideally) replacement driver on the rare occasion that is needed on a running kernel, be able to reset the device hard enough? Well, you need to reset the device, so yes. Cheers, Rusty. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] Using PCI config space to indicate config location
Gerd Hoffmann kra...@redhat.com writes: So how about this: (1) Add a vendor specific pci capability for new-style virtio. Specifies the pci bar used for new-style virtio registers. Guests can use it to figure whenever new-style virtio is supported and to map the correct bar (which will probably be bar 1 in most cases). This was closer to the original proposal[1], which I really liked (you can layout bars however you want). Anthony thought that vendor capabilities were a PCI-e feature, but it seems they're blessed in PCI 2.3. So let's return to that proposal, giving something like this: /* IDs for different capabilities. Must all exist. */ /* FIXME: Do we win from separating ISR, NOTIFY and COMMON? */ /* Common configuration */ #define VIRTIO_PCI_CAP_COMMON_CFG 1 /* Notifications */ #define VIRTIO_PCI_CAP_NOTIFY_CFG 2 /* ISR access */ #define VIRTIO_PCI_CAP_ISR_CFG 3 /* Device specific confiuration */ #define VIRTIO_PCI_CAP_DEVICE_CFG 4 /* This is the PCI capability header: */ struct virtio_pci_cap { u8 cap_vndr;/* Generic PCI field: PCI_CAP_ID_VNDR */ u8 cap_next;/* Generic PCI field: next ptr. */ u8 cap_len; /* Generic PCI field: sizeof(struct virtio_pci_cap). */ u8 cfg_type;/* One of the VIRTIO_PCI_CAP_*_CFG. */ u8 bar; /* Where to find it. */ u8 unused; __le16 offset; /* Offset within bar. */ __le32 length; /* Length. */ }; This means qemu can point the isr_cfg into the legacy area if it wants. In fact, it can put everything in BAR0 if it wants. Thoughts? Rusty. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Proposal for virtio standardization.
Cornelia Huck cornelia.h...@de.ibm.com writes: On Thu, 27 Sep 2012 09:59:33 +0930 Rusty Russell ru...@rustcorp.com.au wrote: 3) Various clarifications, formalizations and cleanups to the spec text, and possibly elimination of old deprecated features. 4) The only significant change to the spec is that we use PCI capabilities, so we can have infinite feature bits. (see http://lists.linuxfoundation.org/pipermail/virtualization/2011-December/019198.html) Infinite only applies to virtio-pci, no? Yes, you already have infinite feature bits for ccw, as does every bus we did since PCI. Sounds like a good idea. I'll be happy to review the spec with an eye to virtio-ccw. Thanks! Rusty. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] Using PCI config space to indicate config location
Anthony Liguori aligu...@us.ibm.com writes: Rusty Russell ru...@rustcorp.com.au writes: Anthony Liguori aligu...@us.ibm.com writes: We'll never remove legacy so we shouldn't plan on it. There are literally hundreds of thousands of VMs out there with the current virtio drivers installed in them. We'll be supporting them for a very, very long time :-) You will be supporting this for qemu on x86, sure. And PPC. Yeah, Ben was a bit early fixing the virtio bugs I guess. Oh well. As I think we're still in the growth phase for virtio, I prioritize future spec cleanliness pretty high. But I think you'll be surprised how fast this is deprecated: 1) Bigger queues for block devices (guest-specified ringsize) 2) Smaller rings for openbios (guest-specified alignment) 3) All-mmio mode (powerpc) 4) Whatever network features get numbers 31. We can do all of these things with incremental change to the existing layout. That's the only way what I'm suggesting is different. Now extend your proposal to add all those features we want. We add if()s to the driver that we need to keep forever. Let's just look at what your proposal get us: bool has_feature(struct virtio_pci_dev *dev, u32 num) { /* Do we need to set selector? */ if (dev-extended_features) { unsigned long selector_off = VIRTIO_PCI_HOST_FEATURES_SELECT; if (dev-using_bar0 !dev-msi_active) selector_off -= 32; writel(dev-config + selector_off, num / 32); num = 31; } else if (num 31) return false; return readl(dev-config + VIRTIO_PCI_HOST_FEATURES) (1 num); } vs two cases which independent with methods set at detection time: virtio_pci_legacy.c: bool has_feature(struct virtio_pci_dev *dev, u32 num) { if (num 31) return false; return readl(dev-config + VIRTIO_PCI_LEGACY_HOST_FEATURES) (1 num); } virtio_pci_new.c: bool has_feature(struct virtio_pci_dev *dev, u32 num) { writel(dev-config + VIRTIO_PCI_HOST_FEATURES_SELECT, num / 32); num = 31; return readl(dev-config + VIRTIO_PCI_HOST_FEATURES) (1 num); } You want to reorder all of the fields and have a driver flag day. But I strongly suspect we'll decide we need to do the same exercise again in 4 years when we now need to figure out how to take advantage of transactional memory or some other whiz-bang hardware feature. It's not a flag day, as it's backwards compatible. Sure, we might have a clean cut again. I'm not afraid to rewrite this code to make it cleaner; perhaps this is somewhere qemu could benefit from a similar approach. What I'm suggesting is: struct { __le32 host_features; /* read-only */ __le32 guest_features; /* read/write */ __le32 queue_pfn; /* read/write */ __le16 queue_size; /* read-only */ __le16 queue_sel; /* read/write */ __le16 queue_notify;/* read/write */ u8 status; /* read/write */ u8 isr; /* read-only, clear on read */ __le16 msi_config_vector; /* read/write */ __le16 msi_queue_vector;/* read/write */ __le32 host_feature_select; /* read/write */ __le32 guest_feature_select;/* read/write */ __le32 queue_pfn_hi;/* read/write */ }; With the additional semantic that the virtio-config space is overlayed on top of the register set in BAR0 unless the VIRTIO_PCI_F_SEPARATE_CONFIG feature is acknowledged. This feature acts as a latch and when set, removes the config space overlay. If the config space overlays the registers, the offset in BAR0 of the overlay depends on whether MSI is enabled or not in the PCI device. BAR1 is an MMIO mirror of BAR0 except that the config space is never overlayed in BAR1 regardless of VIRTIO_PCI_F_SEPARATE_CONFIG. Creating a horrible mess for new code in order to support old code. But I have to maintain the new code, and this is horrible. You mean, accesses to bar1/2/3 change it to modern mode. That's a pretty big side effect of a read. They don't need to, but I'd prefer they did to keep implementations from mixing old and new. See above. A guest could happily just use BAR1/BAR2 and completely ignore BAR0 provided that BAR1/BAR2 are present. But x86 guests want BAR0 because IO space is faster than MMIO. Right? So, not happily. It also means the guest drivers don't need separate code paths for legacy vs. modern. They can switch between the two by just changing a single pointer. This is wrong. Of course they have separate code paths, but now you've got lots of crossover, rather than two distinct ones. The implementation ends up being pretty trivial too. We can use the same MemoryRegion in QEMU for both PIO and MMIO. The kernel code stays the same. It just takes a couple if()s to handle
Re: Using taskset , powertop and perf with qemu-kvm
On Wed, Oct 10, 2012 at 01:22:22PM +0530, Jaspal wrote: Hello , I wish to run some cpu intensive tasks on a vm , use only one core specifically #0 of the 6 available ( Xeon 3670 ) and all the while record basic perf events like cpu-cycles , instructions etc. I am using taskset to define the cpu-affinity of the qemu process : taskset -p 0x1 `pgrep qemu` (assuming only 1 vm is running ) pgrep tells you only the pid of the io thread. vcpu runs in a separate thread. I can verify that the vm is now running only on core 0 using top. top also shows that the qemu process is using 99% of it's share of CPU time ( %CPU column in top ) upon running the tasks in vm. However what I fail to see is the increase in time spent of the core 0 in the C0 ( active ) state via powertop. It barely manages to cross 12%. If I run the same tasks ( FFT benchmark ) on the host on a different core ( say #1 ) and monitor it using powertop again , I can see that C0 % for core 1 is above 100% ( 104 ~ 107 ) So why does not the output of top ( qemu is using 99% of it's cpu time , which is spent on core 0 ) co-relate with powertop ( which says core 0 is idle for around 88% of time ) ? Also , powertop reports that core 2 is being used heavily ( 104 % ) and I suspect this is one which is executing the vm operations. But why ? There are no other cpu/memory intensive tasks running on the host. Next , I am using the perf kvm record function to save the stats. If I use all the 7 core counters available along-with -a option , perf generates ~2MB/second . Is there a way to save just the raw count of the events ? Also , if instead of -a , if I give -C 0 , perf just saves around 0.016MB/s as opposed to 1.894MB/s in case of -a option. If the vm is indeed using core 0 , then perf should have recorded the events. On a last note , does perf kvm automatically selects the vm to profile ( in case multiple vms are running ) on the basis of the guest.kallsyms and guest.modules files passed to it ? Thanks , Jaspal -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] Using PCI config space to indicate config location
On Wed, Oct 10, 2012 at 02:14:11PM +1030, Rusty Russell wrote: See above. A guest could happily just use BAR1/BAR2 and completely ignore BAR0 provided that BAR1/BAR2 are present. But x86 guests want BAR0 because IO space is faster than MMIO. Right? Or to be a bit more precise, ATM on x86 emulating IO instructions is faster than MMIO. IIUC this is since you get the address/data in registers and don't need to decode them. -- MST -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Using taskset , powertop and perf with qemu-kvm
On 10/10/2012 03:38 PM, Gleb Natapov wrote: On Wed, Oct 10, 2012 at 01:22:22PM +0530, Jaspal wrote: Hello , I wish to run some cpu intensive tasks on a vm , use only one core specifically #0 of the 6 available ( Xeon 3670 ) and all the while record basic perf events like cpu-cycles , instructions etc. I am using taskset to define the cpu-affinity of the qemu process : taskset -p 0x1 `pgrep qemu` (assuming only 1 vm is running ) pgrep tells you only the pid of the io thread. vcpu runs in a separate thread. I can verify that the vm is now running only on core 0 using top. top also shows that the qemu process is using 99% of it's share of CPU time ( %CPU column in top ) upon running the tasks in vm. However what I fail to see is the increase in time spent of the core 0 in the C0 ( active ) state via powertop. It barely manages to cross 12%. If I run the same tasks ( FFT benchmark ) on the host on a different core ( say #1 ) and monitor it using powertop again , I can see that C0 % for core 1 is above 100% ( 104 ~ 107 ) So why does not the output of top ( qemu is using 99% of it's cpu time , which is spent on core 0 ) co-relate with powertop ( which says core 0 is idle for around 88% of time ) ? Also , powertop reports that core 2 is being used heavily ( 104 % ) and I suspect this is one which is executing the vm operations. But why ? There are no other cpu/memory intensive tasks running on the host. Next , I am using the perf kvm record function to save the stats. If I use all the 7 core counters available along-with -a option , perf generates ~2MB/second . Is there a way to save just the raw count of the events ? Also , if instead of -a , if I give -C 0 , perf just saves around 0.016MB/s as opposed to 1.894MB/s in case of -a option. If the vm is indeed using core 0 , then perf should have recorded the events. On a last note , does perf kvm automatically selects the vm to profile ( in case multiple vms are running ) on the basis of the guest.kallsyms and guest.modules files passed to it ? Thanks , Jaspal -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- Gleb. Thanks. It works using the correct tid. -- Jaspal -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Enabling IA32_TSC_ADJUST for guest VM
On Tue, Oct 09, 2012 at 04:10:30PM +, Auld, Will wrote: I am just testing the second version of this patch. It addresses all the comments so far except Marcelo's issue with breaking the function compute_guest_tsc(). Lets try to merge the missing patch from Zachary first (that'll make it clear). I needed to put the call for updating the TSC_ADJUST_MSR in kvm_write_tsc() to ensure it is only called from user space. Other changes added to vmcs offset should not be tracked in TSC_ADJUST_MSR. Please have a separate, earlier patch making that explicit (by passing a bool to kvm_x86_ops-set_msr then to kvm_set_msr_common). that = whether msr write is guest initiated or not. I had some trouble with the order of initialization during live migration. TSC_ADJUST is initialized first but then wiped out by multiple initializations of tsc. The fix for this is to not update TSC_ADJUST if the vmcs offset is not actually changing with the tsc write. So, after migration outcome is that vmcs offset gets defined independent from the migrating value of TSC_ADJUST. I believe this is what we want to happen. Can you please be more explicit regarding wiped out by multiple initializations of tsc ? It is probably best to maintain TSC_ADJUST separately, in software, and then calculate TSC_OFFSET. Thanks, Will -Original Message- From: Avi Kivity [mailto:a...@redhat.com] Sent: Tuesday, October 09, 2012 5:12 AM To: Marcelo Tosatti Cc: Auld, Will; kvm@vger.kernel.org; Zhang, Xiantao Subject: Re: [PATCH] Enabling IA32_TSC_ADJUST for guest VM On 10/08/2012 07:30 PM, Marcelo Tosatti wrote: From Intel's manual: • If an execution of WRMSR to the IA32_TIME_STAMP_COUNTER MSR adds (or subtracts) value X from the TSC, the logical processor also adds (or subtracts) value X from the IA32_TSC_ADJUST MSR. This is not handled in the patch. To support migration, it will be necessary to differentiate between guest initiated and userspace-model initiated msr write. That is, only guest initiated TSC writes should affect the value of IA32_TSC_ADJUST MSR. Avi, any better idea? I think we need that anyway, since there are some read-only MSRs that need to be configured by the host (nvmx capabilities). So if we add that feature it will be useful elsewhere. I don't think it's possible to do it in any other way: Local offset value of the IA32_TSC for a logical processor. Reset value is Zero. A write to IA32_TSC will modify the local offset in IA32_TSC_ADJUST and the content of IA32_TSC, but does not affect the internal invariant TSC hardware. What we want to do is affect the internal invariant TSC hardware, so we can't do that through the normal means. btw, will tsc writes from userspace (after live migration) cause tsc skew? If so we should think how to model a guest-wide tsc. -- error compiling committee.c: too many arguments to function N?r??yb?X??ǧv?^?){.n?+h????ܨ}???Ơz?j:+v???zZ+??+zf???h???~i???z??w??)ߢf -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v6 05/16] target-i386: Use Hypervisor level in -machine pc,accel=kvm.
On 10/09/12 15:11, Marcelo Tosatti wrote: On Tue, Oct 09, 2012 at 03:09:17PM -0400, Don Slutz wrote: On 10/09/12 14:47, Marcelo Tosatti wrote: On Tue, Oct 09, 2012 at 03:27:26PM -0300, Marcelo Tosatti wrote: On Tue, Oct 09, 2012 at 02:18:05PM -0300, Marcelo Tosatti wrote: On Mon, Sep 24, 2012 at 10:32:07AM -0400, Don Slutz wrote: Also known as Paravirtualization level. This change is based on: Microsoft Hypervisor CPUID Leaves: http://msdn.microsoft.com/en-us/library/windows/hardware/ff542428%28v=vs.85%29.aspx Linux kernel change starts with: http://fixunix.com/kernel/538707-use-cpuid-communicate-hypervisor.html Also: http://lkml.indiana.edu/hypermail/linux/kernel/1205.0/00100.html VMware documention on CPUIDs (Mechanisms to determine if software is running in a VMware virtual machine): http://kb.vmware.com/selfservice/microsites/search.do?language=en_UScmd=displayKCexternalId=1009458 QEMU knows this is KVM_CPUID_SIGNATURE (0x4000). Signed-off-by: Don Slutz d...@cloudswitch.com --- target-i386/kvm.c |4 ++-- 1 files changed, 2 insertions(+), 2 deletions(-) diff --git a/target-i386/kvm.c b/target-i386/kvm.c index 895d848..8462c75 100644 --- a/target-i386/kvm.c +++ b/target-i386/kvm.c @@ -389,12 +389,12 @@ int kvm_arch_init_vcpu(CPUX86State *env) c = cpuid_data.entries[cpuid_i++]; memset(c, 0, sizeof(*c)); c-function = KVM_CPUID_SIGNATURE -if (!hyperv_enabled()) { +if (!env-cpuid_hv_level_set) { memcpy(signature, KVMKVMKVM\0\0\0, 12); c-eax = 0; } else { memcpy(signature, Microsoft Hv, 12); -c-eax = HYPERV_CPUID_MIN; +c-eax = env-cpuid_hv_level; This breaks hyperv_enabled() checks. Don, are you certain it is worthwhile to make this configurable? Can you explain why, under your scenario, it is worthwhile? Because these are separate problems: - Fake VMWare hypervisor (which seems to be your main goal). - Make CPUID HV leafs configurable via command line. Err, meant via properties. Point is, why have VMWare CPUID configuration as data, if there are reasons to believe code is a better fit (code as in current Hyper-V implementation). Nevermind, its the right thing to do. Just separate the patchset please: 1) Create object properties. 2) Export VMWare CPUID via properties. 3) Convert Hyper-V. Be careful to make sure Hyper-V's current options are functional in 3). Did you mean 3 patch sets (or more)? Or just a different order? -Don Slutz Different order. Patches should be logically related (think of what information the reviewer needs). Please write changelogs for every patch. Using this order causes support for Hyper-V to stop working in the middle of the patch set. How about this order: 1) Create object properties. 2) Convert Hyper-V to set the new properties. 3) Change kvm.c to use the new properties. 4) Export VMWare CPUID via properties. 5) Change accel=tcg to use the new properties. -Don Slutz -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v6 06/16] target-i386: Use Hypervisor level in -machine pc,accel=tcg.
On 10/09/12 15:01, Marcelo Tosatti wrote: On Mon, Sep 24, 2012 at 10:32:08AM -0400, Don Slutz wrote: Also known as Paravirtualization level. This change is based on: Microsoft Hypervisor CPUID Leaves: http://msdn.microsoft.com/en-us/library/windows/hardware/ff542428%28v=vs.85%29.aspx Linux kernel change starts with: http://fixunix.com/kernel/538707-use-cpuid-communicate-hypervisor.html Also: http://lkml.indiana.edu/hypermail/linux/kernel/1205.0/00100.html VMware documention on CPUIDs (Mechanisms to determine if software is running in a VMware virtual machine): http://kb.vmware.com/selfservice/microsites/search.do?language=en_UScmd=displayKCexternalId=1009458 QEMU knows this as KVM_CPUID_SIGNATURE (0x4000) in kvm on linux. This does not provide vendor support in tcg yet. Signed-off-by: Don Slutz d...@cloudswitch.com --- target-i386/cpu.c | 27 +++ 1 files changed, 27 insertions(+), 0 deletions(-) diff --git a/target-i386/cpu.c b/target-i386/cpu.c index 48bdaf9..920278b 100644 --- a/target-i386/cpu.c +++ b/target-i386/cpu.c @@ -1651,6 +1651,21 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count, index = env-cpuid_xlevel; } } +} else if (index 0x4000) { +if (env-cpuid_hv_level_set) { +uint32_t real_level = env-cpuid_hv_level; + +/* Handle Hypervisor CPUIDs */ +if (real_level 0x4000) { +real_level = 0x4000; +} +if (index real_level) { +index = real_level; +} +} else { +if (index env-cpuid_level) +index = env-cpuid_level; +} Whats the purpose of this checks? They control what happens for CPUID requests outside the defined ranges. Very similar to code just above this. Please provide changelogs for each patch. In progress. } else { if (index env-cpuid_level) index = env-cpuid_level; @@ -1789,6 +1804,18 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count, *edx = 0; } break; +case 0x4000: +*eax = env-cpuid_hv_level; +*ebx = 0; +*ecx = 0; +*edx = 0; +break; +case 0x4001: +*eax = env-cpuid_kvm_features; +*ebx = 0; +*ecx = 0; +*edx = 0; +break; case 0x8000: *eax = env-cpuid_xlevel; *ebx = env-cpuid_vendor1; -- 1.7.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -Don Slutz -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] Using PCI config space to indicate config location
Rusty Russell ru...@rustcorp.com.au writes: Gerd Hoffmann kra...@redhat.com writes: So how about this: (1) Add a vendor specific pci capability for new-style virtio. Specifies the pci bar used for new-style virtio registers. Guests can use it to figure whenever new-style virtio is supported and to map the correct bar (which will probably be bar 1 in most cases). This was closer to the original proposal[1], which I really liked (you can layout bars however you want). Anthony thought that vendor capabilities were a PCI-e feature, but it seems they're blessed in PCI 2.3. 2.3 was standardized in 2002. Are we confident that vendor extensions play nice with pre-2.3 OSes like Win2k, WinXP, etc? I still think it's a bad idea to rely on something so new in something as fundamental as virtio-pci unless we have to. Regards, Anthony Liguori So let's return to that proposal, giving something like this: /* IDs for different capabilities. Must all exist. */ /* FIXME: Do we win from separating ISR, NOTIFY and COMMON? */ /* Common configuration */ #define VIRTIO_PCI_CAP_COMMON_CFG 1 /* Notifications */ #define VIRTIO_PCI_CAP_NOTIFY_CFG 2 /* ISR access */ #define VIRTIO_PCI_CAP_ISR_CFG3 /* Device specific confiuration */ #define VIRTIO_PCI_CAP_DEVICE_CFG 4 /* This is the PCI capability header: */ struct virtio_pci_cap { u8 cap_vndr;/* Generic PCI field: PCI_CAP_ID_VNDR */ u8 cap_next;/* Generic PCI field: next ptr. */ u8 cap_len; /* Generic PCI field: sizeof(struct virtio_pci_cap). */ u8 cfg_type;/* One of the VIRTIO_PCI_CAP_*_CFG. */ u8 bar; /* Where to find it. */ u8 unused; __le16 offset; /* Offset within bar. */ __le32 length; /* Length. */ }; This means qemu can point the isr_cfg into the legacy area if it wants. In fact, it can put everything in BAR0 if it wants. Thoughts? Rusty. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] Using PCI config space to indicate config location
On Wed, Oct 10, 2012 at 08:36:12AM -0500, Anthony Liguori wrote: Rusty Russell ru...@rustcorp.com.au writes: Gerd Hoffmann kra...@redhat.com writes: So how about this: (1) Add a vendor specific pci capability for new-style virtio. Specifies the pci bar used for new-style virtio registers. Guests can use it to figure whenever new-style virtio is supported and to map the correct bar (which will probably be bar 1 in most cases). This was closer to the original proposal[1], which I really liked (you can layout bars however you want). Anthony thought that vendor capabilities were a PCI-e feature, but it seems they're blessed in PCI 2.3. 2.3 was standardized in 2002. Are we confident that vendor extensions play nice with pre-2.3 OSes like Win2k, WinXP, etc? I still think it's a bad idea to rely on something so new in something as fundamental as virtio-pci unless we have to. Regards, Anthony Liguori Pre 2.3 it was the case that *all* space outside the capability linked list, and any capability not recognized by space, was considered vendor specific. So there's no problem really. So let's return to that proposal, giving something like this: /* IDs for different capabilities. Must all exist. */ /* FIXME: Do we win from separating ISR, NOTIFY and COMMON? */ /* Common configuration */ #define VIRTIO_PCI_CAP_COMMON_CFG 1 /* Notifications */ #define VIRTIO_PCI_CAP_NOTIFY_CFG 2 /* ISR access */ #define VIRTIO_PCI_CAP_ISR_CFG 3 /* Device specific confiuration */ #define VIRTIO_PCI_CAP_DEVICE_CFG 4 /* This is the PCI capability header: */ struct virtio_pci_cap { u8 cap_vndr;/* Generic PCI field: PCI_CAP_ID_VNDR */ u8 cap_next;/* Generic PCI field: next ptr. */ u8 cap_len; /* Generic PCI field: sizeof(struct virtio_pci_cap). */ u8 cfg_type;/* One of the VIRTIO_PCI_CAP_*_CFG. */ u8 bar; /* Where to find it. */ u8 unused; __le16 offset; /* Offset within bar. */ __le32 length; /* Length. */ }; This means qemu can point the isr_cfg into the legacy area if it wants. In fact, it can put everything in BAR0 if it wants. Thoughts? Rusty. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v6 05/16] target-i386: Use Hypervisor level in -machine pc,accel=kvm.
On Wed, Oct 10, 2012 at 09:03:20AM -0400, Don Slutz wrote: Did you mean 3 patch sets (or more)? Or just a different order? -Don Slutz Different order. Patches should be logically related (think of what information the reviewer needs). Please write changelogs for every patch. Using this order causes support for Hyper-V to stop working in the middle of the patch set. How about this order: 1) Create object properties. 2) Convert Hyper-V to set the new properties. 3) Change kvm.c to use the new properties. 4) Export VMWare CPUID via properties. 5) Change accel=tcg to use the new properties. -Don Slutz Fine, as long as change from item A) does not leak to item B). -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH RFC 1/2] kvm: Handle undercommitted guest case in PLE handler
I ran 'perf sched map' on the dbench workload for medium and large VMs, and I thought I would share some of the results. I think it helps to visualize what's going on regarding the yielding. These files are png bitmaps, generated from processing output from 'perf sched map' (and perf data generated from 'perf sched record'). The Y axis is the host cpus, each row being 10 pixels high. For these tests, there are 80 host cpus, so the total height is 800 pixels. The X axis is time (in microseconds), with each pixel representing 1 microsecond. Each bitmap plots 30,000 microseconds. The bitmaps are quite wide obviously, and zooming in/out while viewing is recommended. Each row (each host cpu) is assigned a color based on what thread is running. vCPUs of the same VM are assigned a common color (like red, blue, magenta, etc), and each vCPU has a unique brightness for that color. There are a maximum of 12 assignable colors, so in any VMs 12 revert to vCPU color of gray. I would use more colors, but it becomes harder to distinguish one color from another. The white color represents missing data from perf, and black color represents any thread which is not a vCPU. For the following tests, VMs were pinned to host NUMA nodes and to specific cpus to help with consistency and operate within the constraints of the last test (gang scheduler). Here is a good example of PLE. These are 10-way VMs, 16 of them (as described above only 12 of the VMs have a color, rest are gray). https://docs.google.com/open?id=0B6tfUNlZ-14wdmFqUmE5QjJHMFU If you zoom out and look at the whole bitmap, you may notice the 4ms intervals of the scheduler. They are pretty well aligned across all cpus. Normally, for cpu bound workloads, we would expect to see each thread to run for 4 ms, then something else getting to run, and so on. That is mostly true in this test. We have 2x over-commit and we generally see the switching of threads at 4ms. One thing to note is that not all vCPU threads for the same VM run at exactly the same time, and that is expected and the whole reason for lock-holder preemption. Now, if you zoom in on the bitmap, you should notice within the 4ms intervals there is some task switching going on. This is most likely because of the yield_to initiated by the PLE handler. In this case there is not that much yielding to do. It's quite clean, and the performance is quite good. Below is an example of PLE, but this time with 20-way VMs, 8 of them. CPU over-commit is still 2x. https://docs.google.com/open?id=0B6tfUNlZ-14wdmFqUmE5QjJHMFU This one looks quite different. In short, it's a mess. The switching between tasks can be lower than 10 microseconds. It basically never recovers. There is constant yielding all the time. Below is again 8 x 20-way VMs, but this time I tried out Nikunj's gang scheduling patches. While I am not recommending gang scheduling, I think it's a good data point. The performance is 3.88x the PLE result. https://docs.google.com/open?id=0B6tfUNlZ-14wWXdscWcwNTVEY3M Note that the task switching intervals of 4ms are quite obvious again, and this time all vCPUs from same VM run at the same time. It represents the best possible outcome. Anyway, I thought the bitmaps might help better visualize what's going on. -Andrew -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] x86, add hypervisor name to dump_stack()
Debugging crash, panics, stack trace WARN_ONs, etc., from both virtual and bare-metal boots can get difficult very quickly. While there are ways to decipher the output and determine if the output is from a virtual guest, the in-kernel hypervisors now have a single registration point and set x86_hyper. We can use this to output a single extra line on virtual machines that indicates the hypervisor type. Signed-off-by: Prarit Bhargava pra...@redhat.com Cc: Avi Kivity a...@redhat.com Cc: Gleb Natapov g...@redhat.com Cc: Alex Williamson alex.william...@redhat.com Cc: Marcelo Tostatti mtosa...@redhat.com Cc: Ingo Molnar mi...@redhat.com Cc: kvm@vger.kernel.org Cc: x...@kernel.org --- arch/x86/kernel/dumpstack.c |3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index ae42418b..75a635e 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -17,6 +17,7 @@ #include linux/sysfs.h #include asm/stacktrace.h +#include asm/hypervisor.h int panic_on_unrecovered_nmi; @@ -193,6 +194,8 @@ void dump_stack(void) init_utsname()-release, (int)strcspn(init_utsname()-version, ), init_utsname()-version); + if (x86_hyper x86_hyper-name) + printk(Hypervisor: %s\n, x86_hyper-name); show_trace(NULL, NULL, stack, bp); } EXPORT_SYMBOL(dump_stack); -- 1.7.9.3 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v4 1/5] KVM: MMU: fix release noslot pfn
On Sun, Oct 07, 2012 at 08:01:34PM +0800, Xiao Guangrong wrote: We can not directly call kvm_release_pfn_clean to release the pfn since we can meet noslot pfn which is used to cache mmio info into spte Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com --- arch/x86/kvm/mmu.c |3 +-- virt/kvm/kvm_main.c |4 +--- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d289fee..6f85fe0 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2497,8 +2497,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, } } - if (!is_error_pfn(pfn)) - kvm_release_pfn_clean(pfn); + kvm_release_pfn_clean(pfn); } static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index cc3f6dc..b65ec97 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1322,9 +1322,7 @@ EXPORT_SYMBOL_GPL(kvm_release_page_clean); void kvm_release_pfn_clean(pfn_t pfn) { - WARN_ON(is_error_pfn(pfn)); - - if (!kvm_is_mmio_pfn(pfn)) + if (!is_error_pfn(pfn) !kvm_is_mmio_pfn(pfn)) put_page(pfn_to_page(pfn)); } EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); -- 1.7.7.6 Why does is_error_pfn() return true for mmio spte? Its not an error, after all. Please kill is_invalid_pfn and use - is_error_pfn for checking for errors (mmio spte is not an error pfn, its a special pfn) - add explicit is_noslot_pfn checks where necessary in the code (say to avoid interpreting a noslot_pfn's pfn address bits). (should have noticed this earlier, sorry). -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v6 03/16] target-i386: Add cpu object access routines for Hypervisor level.
On 10/09/12 15:13, Don Slutz wrote: On 10/09/12 12:25, Marcelo Tosatti wrote: On Mon, Sep 24, 2012 at 10:32:05AM -0400, Don Slutz wrote: These are modeled after x86_cpuid_get_xlevel and x86_cpuid_set_xlevel. Signed-off-by: Don Slutz d...@cloudswitch.com --- target-i386/cpu.c | 29 + 1 files changed, 29 insertions(+), 0 deletions(-) diff --git a/target-i386/cpu.c b/target-i386/cpu.c index 25ca986..451de12 100644 --- a/target-i386/cpu.c +++ b/target-i386/cpu.c @@ -1166,6 +1166,32 @@ static void x86_cpuid_set_tsc_freq(Object *obj, Visitor *v, void *opaque, cpu-env.tsc_khz = value / 1000; } +static void x86_cpuid_get_hv_level(Object *obj, Visitor *v, void *opaque, +const char *name, Error **errp) +{ +X86CPU *cpu = X86_CPU(obj); + +visit_type_uint32(v, cpu-env.cpuid_hv_level, name, errp); +} + +static void x86_cpuid_set_hv_level(Object *obj, Visitor *v, void *opaque, +const char *name, Error **errp) +{ +X86CPU *cpu = X86_CPU(obj); +uint32_t value; + +visit_type_uint32(v, value, name, errp); +if (error_is_set(errp)) { +return; +} + +if (value != 0 value 0x4000) { +value += 0x4000; +} Whats the purpose of this? Adds ambiguity. Will add more info in this commit message. -Don -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Not clear on how to add info in the commit message. This is direct copy with adjustment from x86_cpuid_set_xlevel(): if (value 0x8000) { value += 0x8000; } (Pending patch: http://comments.gmane.org/gmane.comp.emulators.qemu/172703 adds this) The adjustment is that 0 is a legal value. See http://lkml.indiana.edu/hypermail/linux/kernel/1205.0/00100.html This does mean that just like xlevel=1 and xlevel=0x8001 are the same; hypervisor-level=1 and hypervisor-level=0x401 are the same. If this is not wanted, I have no issue with removing it. -Don Slutz -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC v2 5/6] ARM KVM: save and load VFP registers from kernel
Add support for saving and restoring VFP register state from the kernel. This includes a check that the KVM-created CPU has full VFP support (as the TCG Cortex-A15 model always does), since for the moment ARM QEMU doesn't have any way to tweak optional features on created CPUs. Signed-off-by: Peter Maydell peter.mayd...@linaro.org --- target-arm/kvm.c | 78 +++--- 1 file changed, 75 insertions(+), 3 deletions(-) diff --git a/target-arm/kvm.c b/target-arm/kvm.c index fee60e1..f17e7fd 100644 --- a/target-arm/kvm.c +++ b/target-arm/kvm.c @@ -39,10 +39,28 @@ int kvm_arch_init(KVMState *s) int kvm_arch_init_vcpu(CPUARMState *env) { struct kvm_vcpu_init init; +int ret; +uint64_t v; +struct kvm_one_reg r; init.target = KVM_ARM_TARGET_CORTEX_A15; memset(init.features, 0, sizeof(init.features)); -return kvm_vcpu_ioctl(env, KVM_ARM_VCPU_INIT, init); +ret = kvm_vcpu_ioctl(env, KVM_ARM_VCPU_INIT, init); +if (ret) { +return ret; +} +/* Query the kernel to make sure it supports 32 VFP + * registers: QEMU's cortex-a15 CPU is always a + * VFP-D32 core. The simplest way to do this is just + * to attempt to read register d31. + */ +r.id = KVM_REG_ARM | KVM_REG_SIZE_U64 | KVM_REG_ARM_VFP | 31; +r.addr = (uintptr_t)(v); +ret = kvm_vcpu_ioctl(env, KVM_GET_ONE_REG, r); +if (ret == ENOENT) { +return EINVAL; +} +return ret; } struct reg { @@ -68,6 +86,13 @@ struct reg { offsetof(CPUARMState, QEMUFIELD) \ } +#define VFPSYSREG(R) \ +{ \ +KVM_REG_ARM | KVM_REG_SIZE_U32 | KVM_REG_ARM_VFP | \ +KVM_REG_ARM_VFP_##R, \ +offsetof(CPUARMState, vfp.xregs[ARM_VFP_##R]) \ +} + const struct reg regs[] = { /* R0_usr .. R14_usr */ COREREG(usr_regs[0], regs[0]), @@ -115,6 +140,13 @@ const struct reg regs[] = { CP15REG(1, 0, 0, 0, cp15.c1_sys), /* SCTLR */ CP15REG(2, 0, 0, 2, cp15.c2_control), /* TTBCR */ CP15REG(3, 0, 0, 0, cp15.c3), /* DACR */ +/* VFP system registers */ +VFPSYSREG(FPSID), +VFPSYSREG(MVFR1), +VFPSYSREG(MVFR0), +VFPSYSREG(FPEXC), +VFPSYSREG(FPINST), +VFPSYSREG(FPINST2), }; int kvm_arch_put_registers(CPUARMState *env, int level) @@ -122,7 +154,7 @@ int kvm_arch_put_registers(CPUARMState *env, int level) struct kvm_one_reg r; int mode, bn; int ret, i; -uint32_t cpsr; +uint32_t cpsr, fpscr; uint64_t ttbr; /* Make sure the banked regs are properly set */ @@ -173,6 +205,26 @@ int kvm_arch_put_registers(CPUARMState *env, int level) (2 KVM_REG_ARM_CRM_SHIFT) | (1 KVM_REG_ARM_OPC1_SHIFT); r.addr = (uintptr_t)(ttbr); ret = kvm_vcpu_ioctl(env, KVM_SET_ONE_REG, r); +if (ret) { +return ret; +} + +/* VFP registers */ +r.id = KVM_REG_ARM | KVM_REG_SIZE_U64 | KVM_REG_ARM_VFP; +for (i = 0; i 32; i++) { +r.addr = (uintptr_t)(env-vfp.regs[i]); +ret = kvm_vcpu_ioctl(env, KVM_SET_ONE_REG, r); +if (ret) { +return ret; +} +r.id++; +} + +r.id = KVM_REG_ARM | KVM_REG_SIZE_U32 | KVM_REG_ARM_VFP | +KVM_REG_ARM_VFP_FPSCR; +fpscr = vfp_get_fpscr(env); +r.addr = (uintptr_t)fpscr; +ret = kvm_vcpu_ioctl(env, KVM_SET_ONE_REG, r); return ret; } @@ -182,7 +234,7 @@ int kvm_arch_get_registers(CPUARMState *env) struct kvm_one_reg r; int mode, bn; int ret, i; -uint32_t cpsr; +uint32_t cpsr, fpscr; uint64_t ttbr; for (i = 0; i ARRAY_SIZE(regs); i++) { @@ -247,6 +299,26 @@ int kvm_arch_get_registers(CPUARMState *env) env-cp15.c2_mask = ~(((uint32_t)0xu) env-cp15.c2_control); env-cp15.c2_base_mask = ~((uint32_t)0x3fffu env-cp15.c2_control); +/* VFP registers */ +r.id = KVM_REG_ARM | KVM_REG_SIZE_U64 | KVM_REG_ARM_VFP; +for (i = 0; i 32; i++) { +r.addr = (uintptr_t)(env-vfp.regs[i]); +ret = kvm_vcpu_ioctl(env, KVM_GET_ONE_REG, r); +if (ret) { +return ret; +} +r.id++; +} + +r.id = KVM_REG_ARM | KVM_REG_SIZE_U32 | KVM_REG_ARM_VFP | +KVM_REG_ARM_VFP_FPSCR; +r.addr = (uintptr_t)fpscr; +ret = kvm_vcpu_ioctl(env, KVM_GET_ONE_REG, r); +if (ret) { +return ret; +} +vfp_set_fpscr(env, fpscr); + return 0; } -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC v2 2/6] ARM: KVM: Add support for KVM on ARM architecture
From: Christoffer Dall cd...@cs.columbia.edu Add basic support for KVM on ARM architecture. Signed-off-by: Christoffer Dall cd...@cs.columbia.edu [PMM: Minor tweaks and code cleanup, switch to ONE_REG] Signed-off-by: Peter Maydell peter.mayd...@linaro.org --- hw/arm_pic.c | 28 target-arm/Makefile.objs |1 + target-arm/cpu.h |1 + target-arm/helper.c |2 +- target-arm/kvm.c | 328 ++ 5 files changed, 359 insertions(+), 1 deletion(-) create mode 100644 target-arm/kvm.c diff --git a/hw/arm_pic.c b/hw/arm_pic.c index ffb4d41..72272dc 100644 --- a/hw/arm_pic.c +++ b/hw/arm_pic.c @@ -9,6 +9,7 @@ #include hw.h #include arm-misc.h +#include kvm.h /* Input 0 is IRQ and input 1 is FIQ. */ static void arm_pic_cpu_handler(void *opaque, int irq, int level) @@ -34,7 +35,34 @@ static void arm_pic_cpu_handler(void *opaque, int irq, int level) } } +#ifdef CONFIG_KVM +static void kvm_arm_pic_cpu_handler(void *opaque, int irq, int level) +{ +ARMCPU *cpu = opaque; +CPUARMState *env = cpu-env; +int kvm_irq = KVM_ARM_IRQ_TYPE_CPU KVM_ARM_IRQ_TYPE_SHIFT; + +switch (irq) { +case ARM_PIC_CPU_IRQ: +kvm_irq |= KVM_ARM_IRQ_CPU_IRQ; +break; +case ARM_PIC_CPU_FIQ: +kvm_irq |= KVM_ARM_IRQ_CPU_FIQ; +break; +default: +hw_error(kvm_arm_pic_cpu_handler: Bad interrupt line %d\n, irq); +} +kvm_irq |= env-cpu_index KVM_ARM_IRQ_VCPU_SHIFT; +kvm_set_irq(kvm_state, kvm_irq, level ? 1 : 0); +} +#endif + qemu_irq *arm_pic_init_cpu(ARMCPU *cpu) { +#ifdef CONFIG_KVM +if (kvm_enabled()) { +return qemu_allocate_irqs(kvm_arm_pic_cpu_handler, cpu, 2); +} +#endif return qemu_allocate_irqs(arm_pic_cpu_handler, cpu, 2); } diff --git a/target-arm/Makefile.objs b/target-arm/Makefile.objs index b6f1a9e..d89b57c 100644 --- a/target-arm/Makefile.objs +++ b/target-arm/Makefile.objs @@ -1,4 +1,5 @@ obj-y += arm-semi.o obj-$(CONFIG_SOFTMMU) += machine.o +obj-$(CONFIG_KVM) += kvm.o obj-y += translate.o op_helper.o helper.o cpu.o obj-y += neon_helper.o iwmmxt_helper.o diff --git a/target-arm/cpu.h b/target-arm/cpu.h index ff4de10..8c7e574 100644 --- a/target-arm/cpu.h +++ b/target-arm/cpu.h @@ -236,6 +236,7 @@ ARMCPU *cpu_arm_init(const char *cpu_model); void arm_translate_init(void); int cpu_arm_exec(CPUARMState *s); void do_interrupt(CPUARMState *); +int bank_number(CPUARMState *env, int mode); void switch_mode(CPUARMState *, int); uint32_t do_arm_semihosting(CPUARMState *env); diff --git a/target-arm/helper.c b/target-arm/helper.c index 58340bd..f62f3da 100644 --- a/target-arm/helper.c +++ b/target-arm/helper.c @@ -1619,7 +1619,7 @@ uint32_t HELPER(get_r13_banked)(CPUARMState *env, uint32_t mode) #else /* Map CPU modes onto saved register banks. */ -static inline int bank_number(CPUARMState *env, int mode) +int bank_number(CPUARMState *env, int mode) { switch (mode) { case ARM_CPU_MODE_USR: diff --git a/target-arm/kvm.c b/target-arm/kvm.c new file mode 100644 index 000..fee60e1 --- /dev/null +++ b/target-arm/kvm.c @@ -0,0 +1,328 @@ +/* + * ARM implementation of KVM hooks + * + * Copyright Christoffer Dall 2009-2010 + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include stdio.h +#include sys/types.h +#include sys/ioctl.h +#include sys/mman.h + +#include linux/kvm.h + +#include qemu-common.h +#include qemu-timer.h +#include sysemu.h +#include kvm.h +#include cpu.h +#include device_tree.h +#include hw/arm-misc.h + +const KVMCapabilityInfo kvm_arch_required_capabilities[] = { +KVM_CAP_LAST_INFO +}; + +int kvm_arch_init(KVMState *s) +{ +/* For ARM interrupt delivery is always asynchronous, + * whether we are using an in-kernel VGIC or not. + */ +kvm_async_interrupts_allowed = true; +return 0; +} + +int kvm_arch_init_vcpu(CPUARMState *env) +{ +struct kvm_vcpu_init init; + +init.target = KVM_ARM_TARGET_CORTEX_A15; +memset(init.features, 0, sizeof(init.features)); +return kvm_vcpu_ioctl(env, KVM_ARM_VCPU_INIT, init); +} + +struct reg { +uint64_t id; +int offset; +}; + +#define COREREG(KERNELNAME, QEMUFIELD) \ +{\ +KVM_REG_ARM | KVM_REG_SIZE_U32 | \ +KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(KERNELNAME), \ +offsetof(CPUARMState, QEMUFIELD) \ +} + +#define CP15REG(CRN, CRM, OPC1, OPC2, QEMUFIELD) \ +{\ +KVM_REG_ARM | KVM_REG_SIZE_U32 | \ +(15 KVM_REG_ARM_COPROC_SHIFT) | \ +((CRN) KVM_REG_ARM_32_CRN_SHIFT) |\ +((CRM) KVM_REG_ARM_CRM_SHIFT) | \ +((OPC1) KVM_REG_ARM_OPC1_SHIFT) | \ +((OPC2)