[RFC v2 4/6] hw/kvm/arm_gic: Implement support for KVM in-kernel ARM GIC

2012-10-10 Thread Peter Maydell
Implement support for using the KVM in-kernel GIC for ARM.

Signed-off-by: Peter Maydell peter.mayd...@linaro.org
---
 hw/a15mpcore.c   |8 ++-
 hw/arm/Makefile.objs |1 +
 hw/kvm/arm_gic.c |  162 ++
 3 files changed, 170 insertions(+), 1 deletion(-)
 create mode 100644 hw/kvm/arm_gic.c

diff --git a/hw/a15mpcore.c b/hw/a15mpcore.c
index fc0a02a..a37fc61 100644
--- a/hw/a15mpcore.c
+++ b/hw/a15mpcore.c
@@ -19,6 +19,7 @@
  */
 
 #include sysbus.h
+#include kvm.h
 
 /* A15MP private memory region.  */
 
@@ -41,7 +42,12 @@ static int a15mp_priv_init(SysBusDevice *dev)
 A15MPPrivState *s = FROM_SYSBUS(A15MPPrivState, dev);
 SysBusDevice *busdev;
 
-s-gic = qdev_create(NULL, arm_gic);
+if (kvm_irqchip_in_kernel()) {
+s-gic = qdev_create(NULL, kvm-arm_gic);
+} else {
+s-gic = qdev_create(NULL, arm_gic);
+}
+
 qdev_prop_set_uint32(s-gic, num-cpu, s-num_cpu);
 qdev_prop_set_uint32(s-gic, num-irq, s-num_irq);
 qdev_prop_set_uint32(s-gic, revision, 2);
diff --git a/hw/arm/Makefile.objs b/hw/arm/Makefile.objs
index 6d049e7..38b10a8 100644
--- a/hw/arm/Makefile.objs
+++ b/hw/arm/Makefile.objs
@@ -31,5 +31,6 @@ obj-y += collie.o
 obj-y += imx_serial.o imx_ccm.o imx_timer.o imx_avic.o
 obj-y += kzm.o
 obj-$(CONFIG_FDT) += ../device_tree.o
+obj-$(CONFIG_KVM) += kvm/arm_gic.o
 
 obj-y := $(addprefix ../,$(obj-y))
diff --git a/hw/kvm/arm_gic.c b/hw/kvm/arm_gic.c
new file mode 100644
index 000..3ec538d
--- /dev/null
+++ b/hw/kvm/arm_gic.c
@@ -0,0 +1,162 @@
+/*
+ * ARM Generic Interrupt Controller using KVM in-kernel support
+ *
+ * Copyright (c) 2012 Linaro Limited
+ * Written by Peter Maydell
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see http://www.gnu.org/licenses/.
+ */
+
+#include hw/sysbus.h
+#include kvm.h
+#include hw/arm_gic_internal.h
+
+#define TYPE_KVM_ARM_GIC kvm-arm_gic
+#define KVM_ARM_GIC(obj) \
+ OBJECT_CHECK(gic_state, (obj), TYPE_KVM_ARM_GIC)
+#define KVM_ARM_GIC_CLASS(klass) \
+ OBJECT_CLASS_CHECK(KVMARMGICClass, (klass), TYPE_KVM_ARM_GIC)
+#define KVM_ARM_GIC_GET_CLASS(obj) \
+ OBJECT_GET_CLASS(KVMARMGICClass, (obj), TYPE_KVM_ARM_GIC)
+
+typedef struct KVMARMGICClass {
+ARMGICCommonClass parent_class;
+int (*parent_init)(SysBusDevice *dev);
+void (*parent_reset)(DeviceState *dev);
+} KVMARMGICClass;
+
+static void kvm_arm_gic_set_irq(void *opaque, int irq, int level)
+{
+/* Meaning of the 'irq' parameter:
+ *  [0..N-1] : external interrupts
+ *  [N..N+31] : PPI (internal) interrupts for CPU 0
+ *  [N+32..N+63] : PPI (internal interrupts for CPU 1
+ *  ...
+ * Convert this to the kernel's desired encoding, which
+ * has separate fields in the irq number for type,
+ * CPU number and interrupt number.
+ */
+gic_state *s = (gic_state *)opaque;
+int kvm_irq, irqtype, cpu;
+
+if (irq  (s-num_irq - GIC_INTERNAL)) {
+/* External interrupt. The kernel numbers these like the GIC
+ * hardware, with external interrupt IDs starting after the
+ * internal ones.
+ */
+irqtype = KVM_ARM_IRQ_TYPE_SPI;
+cpu = 0;
+irq += GIC_INTERNAL;
+} else {
+/* Internal interrupt: decode into (cpu, interrupt id) */
+irqtype = KVM_ARM_IRQ_TYPE_PPI;
+irq -= (s-num_irq - GIC_INTERNAL);
+cpu = irq / GIC_INTERNAL;
+irq %= GIC_INTERNAL;
+}
+kvm_irq = (irqtype  KVM_ARM_IRQ_TYPE_SHIFT)
+| (cpu  KVM_ARM_IRQ_VCPU_SHIFT) | irq;
+
+kvm_set_irq(kvm_state, kvm_irq, !!level);
+}
+
+static void kvm_arm_gic_put(gic_state *s)
+{
+/* TODO: there isn't currently a kernel interface to set the GIC state */
+}
+
+static void kvm_arm_gic_get(gic_state *s)
+{
+/* TODO: there isn't currently a kernel interface to get the GIC state */
+}
+
+static void kvm_arm_gic_reset(DeviceState *dev)
+{
+gic_state *s = ARM_GIC_COMMON(dev);
+KVMARMGICClass *kgc = KVM_ARM_GIC_GET_CLASS(s);
+kgc-parent_reset(dev);
+kvm_arm_gic_put(s);
+}
+
+static int kvm_arm_gic_init(SysBusDevice *dev)
+{
+/* Device instance init function for the GIC sysbus device */
+int i;
+gic_state *s = FROM_SYSBUS(gic_state, dev);
+KVMARMGICClass *kgc = KVM_ARM_GIC_GET_CLASS(s);
+
+kgc-parent_init(dev);
+
+i = s-num_irq - GIC_INTERNAL;
+/* For the GIC, also expose incoming GPIO 

[RFC v2 3/6] hw/arm_gic: Add presave/postload hooks

2012-10-10 Thread Peter Maydell
Add presave/postload hooks to the ARM GIC common base class.
These will be used by the KVM in-kernel GIC subclass to sync
state between kernel and userspace when migrating.

Signed-off-by: Peter Maydell peter.mayd...@linaro.org
---
 hw/arm_gic_common.c   |   10 ++
 hw/arm_gic_internal.h |2 ++
 2 files changed, 12 insertions(+)

diff --git a/hw/arm_gic_common.c b/hw/arm_gic_common.c
index 360e782..d972755 100644
--- a/hw/arm_gic_common.c
+++ b/hw/arm_gic_common.c
@@ -23,9 +23,14 @@
 static void gic_save(QEMUFile *f, void *opaque)
 {
 gic_state *s = (gic_state *)opaque;
+ARMGICCommonClass *c = ARM_GIC_COMMON_GET_CLASS(s);
 int i;
 int j;
 
+if (c-pre_save) {
+c-pre_save(s);
+}
+
 qemu_put_be32(f, s-enabled);
 for (i = 0; i  s-num_cpu; i++) {
 qemu_put_be32(f, s-cpu_enabled[i]);
@@ -57,6 +62,7 @@ static void gic_save(QEMUFile *f, void *opaque)
 static int gic_load(QEMUFile *f, void *opaque, int version_id)
 {
 gic_state *s = (gic_state *)opaque;
+ARMGICCommonClass *c = ARM_GIC_COMMON_GET_CLASS(s);
 int i;
 int j;
 
@@ -91,6 +97,10 @@ static int gic_load(QEMUFile *f, void *opaque, int 
version_id)
 s-irq_state[i].trigger = qemu_get_byte(f);
 }
 
+if (c-post_load) {
+c-post_load(s);
+}
+
 return 0;
 }
 
diff --git a/hw/arm_gic_internal.h b/hw/arm_gic_internal.h
index db4fad5..183dca6 100644
--- a/hw/arm_gic_internal.h
+++ b/hw/arm_gic_internal.h
@@ -118,6 +118,8 @@ void gic_init_irqs_and_distributor(gic_state *s, int 
num_irq);
 
 typedef struct ARMGICCommonClass {
 SysBusDeviceClass parent_class;
+void (*pre_save)(gic_state *s);
+void (*post_load)(gic_state *s);
 } ARMGICCommonClass;
 
 #define TYPE_ARM_GIC arm_gic
-- 
1.7.9.5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC v2 0/6] QEMU: Support KVM on ARM

2012-10-10 Thread Peter Maydell
This is a v2 RFC of the QEMU patches to support KVM for
ARM on Cortex-A15 hardware. It's intended for use with
the kernel tree at
 git://github.com/virtualopensystems/linux-kvm-arm.git kvm-arm-v12

(ie the 'v2' kernel patches which Christoffer sent out at
the start of the month).

Patch review appreciated -- I think this is more or less
ready to go in once the kernel patches are accepted (assuming
no kernel ABI changes, of course). (There are still a few
things it would be nice to do better, like driving cp15
access from the cp_regs hashtable rather than hardcoding
a small list. If anybody feels any of the todo-type stuff
is a blocker to these patches being committed that would
be good to know too :-))

Git tree if preferred:
 git://git.linaro.org/people/pmaydell/qemu-arm.git kvm-arm-v12

Peter Maydell (6):
  linux-headers: Add ARM KVM headers (not for upstream)
  ARM: KVM: Add support for KVM on ARM architecture
  hw/arm_gic: Add presave/postload hooks
  hw/kvm/arm_gic: Implement support for KVM in-kernel ARM GIC
  ARM KVM: save and load VFP registers from kernel
  configure: Enable KVM on ARM

 configure|2 +-
 hw/a15mpcore.c   |8 +-
 hw/arm/Makefile.objs |1 +
 hw/arm_gic_common.c  |   10 +
 hw/arm_gic_internal.h|2 +
 hw/arm_pic.c |   28 +++
 hw/kvm/arm_gic.c |  162 ++
 linux-headers/asm-arm/kvm.h  |  131 +++
 linux-headers/asm-arm/kvm_para.h |1 +
 linux-headers/asm-generic/kvm_para.h |5 +
 linux-headers/asm-x86/kvm.h  |1 +
 linux-headers/linux/kvm.h|   16 +-
 target-arm/Makefile.objs |1 +
 target-arm/cpu.h |1 +
 target-arm/helper.c  |2 +-
 target-arm/kvm.c |  400 ++
 16 files changed, 765 insertions(+), 6 deletions(-)
 create mode 100644 hw/kvm/arm_gic.c
 create mode 100644 linux-headers/asm-arm/kvm.h
 create mode 100644 linux-headers/asm-arm/kvm_para.h
 create mode 100644 linux-headers/asm-generic/kvm_para.h
 create mode 100644 target-arm/kvm.c

-- 
1.7.9.5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC v2 6/6] configure: Enable KVM on ARM

2012-10-10 Thread Peter Maydell
Enable KVM on ARM hosts, now that all the necessary components
for it exist.

Signed-off-by: Peter Maydell peter.mayd...@linaro.org
---
 configure |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configure b/configure
index c4a7837..f87bcd6 100755
--- a/configure
+++ b/configure
@@ -3886,7 +3886,7 @@ case $target_arch2 in
 echo CONFIG_NO_XEN=y  $config_target_mak
 esac
 case $target_arch2 in
-  i386|x86_64|ppcemb|ppc|ppc64|s390x)
+  arm|i386|x86_64|ppcemb|ppc|ppc64|s390x)
 # Make sure the target and host cpus are compatible
 if test $kvm = yes -a $target_softmmu = yes -a \
   \( $target_arch2 = $cpu -o \
-- 
1.7.9.5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC v2 1/6] linux-headers: Add ARM KVM headers (not for upstream)

2012-10-10 Thread Peter Maydell
This commit adds the ARM KVM headers. This is not to go to QEMU
upstream -- the correct path there is that the KVM code will be
committed to a mainline upstream kernel, and then upstream QEMU
can do a bulk header update from the upstream kernel, which will
allow us to drop this temporary commit.

This is the result of running update-headers on Christoffer's
kvm-arm-v12 branch (commit d151d7a). It includes the new
interrupt delivery ABI, the switch to ONE_REG for core
and coprocessor registers, and the VFP register definitions.

It currently includes some minor x86 header changes which
hopefully will have made it into QEMU upstream by the time
we submit this for merging.
---
 linux-headers/asm-arm/kvm.h  |  131 ++
 linux-headers/asm-arm/kvm_para.h |1 +
 linux-headers/asm-generic/kvm_para.h |5 ++
 linux-headers/asm-x86/kvm.h  |1 +
 linux-headers/linux/kvm.h|   16 -
 5 files changed, 151 insertions(+), 3 deletions(-)
 create mode 100644 linux-headers/asm-arm/kvm.h
 create mode 100644 linux-headers/asm-arm/kvm_para.h
 create mode 100644 linux-headers/asm-generic/kvm_para.h

diff --git a/linux-headers/asm-arm/kvm.h b/linux-headers/asm-arm/kvm.h
new file mode 100644
index 000..b6eaf0c
--- /dev/null
+++ b/linux-headers/asm-arm/kvm.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright (C) 2012 - Virtual Open Systems and Columbia University
+ * Author: Christoffer Dall c.d...@virtualopensystems.com
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#ifndef __ARM_KVM_H__
+#define __ARM_KVM_H__
+
+#include asm/types.h
+
+#define __KVM_HAVE_GUEST_DEBUG
+#define __KVM_HAVE_IRQ_LINE
+
+#define KVM_REG_SIZE(id)   \
+   (1U  (((id)  KVM_REG_SIZE_MASK)  KVM_REG_SIZE_SHIFT))
+
+struct kvm_regs {
+   __u32 usr_regs[15]; /* R0_usr - R14_usr */
+   __u32 svc_regs[3];  /* SP_svc, LR_svc, SPSR_svc */
+   __u32 abt_regs[3];  /* SP_abt, LR_abt, SPSR_abt */
+   __u32 und_regs[3];  /* SP_und, LR_und, SPSR_und */
+   __u32 irq_regs[3];  /* SP_irq, LR_irq, SPSR_irq */
+   __u32 fiq_regs[8];  /* R8_fiq - R14_fiq, SPSR_fiq */
+   __u32 pc;   /* The program counter (r15) */
+   __u32 cpsr; /* The guest CPSR */
+};
+
+/* Supported Processor Types */
+#define KVM_ARM_TARGET_CORTEX_A15  0
+#define KVM_ARM_NUM_TARGETS1
+
+struct kvm_vcpu_init {
+   __u32 target;
+   __u32 features[7];
+};
+
+struct kvm_sregs {
+};
+
+struct kvm_fpu {
+};
+
+struct kvm_guest_debug_arch {
+};
+
+struct kvm_debug_exit_arch {
+};
+
+struct kvm_sync_regs {
+};
+
+struct kvm_arch_memory_slot {
+};
+
+/* For KVM_VCPU_GET_REG_LIST. */
+struct kvm_reg_list {
+   __u64 n; /* number of regs */
+   __u64 reg[0];
+};
+
+/* If you need to interpret the index values, here is the key: */
+#define KVM_REG_ARM_COPROC_MASK0x0FFF
+#define KVM_REG_ARM_COPROC_SHIFT   16
+#define KVM_REG_ARM_32_OPC2_MASK   0x0007
+#define KVM_REG_ARM_32_OPC2_SHIFT  0
+#define KVM_REG_ARM_OPC1_MASK  0x0078
+#define KVM_REG_ARM_OPC1_SHIFT 3
+#define KVM_REG_ARM_CRM_MASK   0x0780
+#define KVM_REG_ARM_CRM_SHIFT  7
+#define KVM_REG_ARM_32_CRN_MASK0x7800
+#define KVM_REG_ARM_32_CRN_SHIFT   11
+
+/* Normal registers are mapped as coprocessor 16. */
+#define KVM_REG_ARM_CORE   (0x0010  KVM_REG_ARM_COPROC_SHIFT)
+#define KVM_REG_ARM_CORE_REG(name) (offsetof(struct kvm_regs, name) / 4)
+
+/* Some registers need more space to represent values. */
+#define KVM_REG_ARM_DEMUX  (0x0011  KVM_REG_ARM_COPROC_SHIFT)
+#define KVM_REG_ARM_DEMUX_ID_MASK  0xFF00
+#define KVM_REG_ARM_DEMUX_ID_SHIFT 8
+#define KVM_REG_ARM_DEMUX_ID_CCSIDR(0x00  KVM_REG_ARM_DEMUX_ID_SHIFT)
+#define KVM_REG_ARM_DEMUX_VAL_MASK 0x00FF
+#define KVM_REG_ARM_DEMUX_VAL_SHIFT0
+
+/* VFP registers: we could overload CP10 like ARM does, but that's ugly. */
+#define KVM_REG_ARM_VFP(0x0012  
KVM_REG_ARM_COPROC_SHIFT)
+#define KVM_REG_ARM_VFP_MASK   0x
+#define KVM_REG_ARM_VFP_BASE_REG   0x0
+#define KVM_REG_ARM_VFP_FPSID  0x1000
+#define 

Re: [PATCH v6 03/16] target-i386: Add cpu object access routines for Hypervisor level.

2012-10-10 Thread Andreas Färber
Am 10.10.2012 17:22, schrieb Don Slutz:
 On 10/09/12 15:13, Don Slutz wrote:
 On 10/09/12 12:25, Marcelo Tosatti wrote:
 On Mon, Sep 24, 2012 at 10:32:05AM -0400, Don Slutz wrote:
 +static void x86_cpuid_set_hv_level(Object *obj, Visitor *v, void
 *opaque,
 +const char *name, Error **errp)
 +{
 +X86CPU *cpu = X86_CPU(obj);
 +uint32_t value;
 +
 +visit_type_uint32(v, value, name, errp);
 +if (error_is_set(errp)) {
 +return;
 +}
 +
 +if (value != 0  value  0x4000) {
 +value += 0x4000;
 +}
 Whats the purpose of this? Adds ambiguity.
[...]
 This is direct copy with adjustment from x86_cpuid_set_xlevel():
 
 if (value  0x8000) {
 value += 0x8000;
 }
 
 (Pending patch:
 http://comments.gmane.org/gmane.comp.emulators.qemu/172703 adds this)

(Any pending patch can be changed ;))

 The adjustment is that 0 is a legal value. See
 http://lkml.indiana.edu/hypermail/linux/kernel/1205.0/00100.html
 
 This does mean that just like xlevel=1 and xlevel=0x8001 are the
 same; hypervisor-level=1 and hypervisor-level=0x401 are the same. 
 If this is not wanted, I have no issue with removing it.

I have no strong opinion either way, but if there's only one call site,
I'd prefer to apply these fixups to user input before setting the
property and to have the property setter error out on invalid values. I
consider that cleaner than silently fixing up values inside the setter.

Regards,
Andreas

-- 
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany
GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer; HRB 16746 AG Nürnberg
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[GIT PULL] VFIO fixes for 3.7-rc1

2012-10-10 Thread Alex Williamson
Hi Linus,

The following changes since commit 2474542f64432398f503373f53bdf620491bcfa8:

  Merge tag 'for-3.7-rc1' of git://gitorious.org/linux-pwm/linux-pwm 
(2012-10-10 20:15:24 +0900)

are available in the git repository at:


  git://github.com/awilliam/linux-vfio.git tags/vfio-for-v3.7-rc1

for you to fetch changes up to 899649b7d4ead76c19e39251ca886eebe3f811a8:

  vfio: Fix PCI INTx disable consistency (2012-10-10 09:10:32 -0600)


vfio fixes for v3.7-rc1

This includes a fix for PCI BAR mmaps after recent mm changes, fixing an
interrupt race, and fixing a consistency bug in interrupt state when
switching interrupt modes.


Alex Williamson (3):
  vfio: Fix PCI mmap after b3b9c293
  vfio: Move PCI INTx eventfd setting earlier
  vfio: Fix PCI INTx disable consistency

 drivers/vfio/pci/vfio_pci.c   |  7 +++
 drivers/vfio/pci/vfio_pci_intrs.c | 18 +++---
 2 files changed, 18 insertions(+), 7 deletions(-)


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] s390/kvm: Add documentation for KVM_S390_INTERRUPT.

2012-10-10 Thread Marcelo Tosatti
On Tue, Oct 02, 2012 at 04:25:37PM +0200, Christian Borntraeger wrote:
 From: Cornelia Huck cornelia.h...@de.ibm.com
 
 Signed-off-by: Cornelia Huck cornelia.h...@de.ibm.com
 Signed-off-by: Martin Schwidefsky schwidef...@de.ibm.com
 Signed-off-by: Christian Borntraeger borntrae...@de.ibm.com
 ---
  Documentation/virtual/kvm/api.txt | 33 +
  1 file changed, 33 insertions(+)
 
 diff --git a/Documentation/virtual/kvm/api.txt 
 b/Documentation/virtual/kvm/api.txt
 index 36befa7..c58 100644
 --- a/Documentation/virtual/kvm/api.txt
 +++ b/Documentation/virtual/kvm/api.txt
 @@ -1984,6 +1984,39 @@ return the hash table order in the parameter.  (If the 
 guest is using
  the virtualized real-mode area (VRMA) facility, the kernel will
  re-create the VMRA HPTEs on the next KVM_RUN of any vcpu.)
  
 +4.77 KVM_S390_INTERRUPT
 +
 +Capability: basic
 +Archtectures: s390

typo.

 +Type: vm ioctl, vcpu ioctl
 +Parameters: struct kvm_s390_interrupt (in)
 +Returns: 0 on success, -1 on error
 +
 +Allows to inject an interrupt to the guest. Interrupts can be floating
 +(vm ioctl) or per cpu (vcpu ioctl), depending on the interrupt type.
 +
 +Interrupt parameters are passed via kvm_s390_interrupt:
 +
 +struct kvm_s390_interrupt {
 + __u32 type;
 + __u32 parm;
 + __u64 parm64;
 +};

No need for a reserved area for extensibility?

 +
 +type can be one of the following:
 +
 +KVM_S390_SIGP_STOP (vcpu) - sigp restart
 +KVM_S390_PROGRAM_INT (vcpu) - program check; code in parm
 +KVM_S390_SIGP_SET_PREFIX (vcpu) - sigp set prefix; prefix address in parm
 +KVM_S390_RESTART (vcpu) - restart
 +KVM_S390_INT_VIRTIO (vm) - virtio external interrupt; external interrupt
 +parameters in parm and parm64
 +KVM_S390_INT_SERVICE (vm) - sclp external interrupt; sclp parameter in parm
 +KVM_S390_INT_EMERGENCY (vcpu) - sigp emergency; source cpu in parm
 +KVM_S390_INT_EXTERNAL_CALL (vcpu) - sigp external call; source cpu in parm
 +
 +Note that the vcpu ioctl is asynchronous to vpcu execution.

typo


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v4 5/5] KVM: MMU: introduce FNAME(prefetch_gpte)

2012-10-10 Thread Marcelo Tosatti
On Sun, Oct 07, 2012 at 08:05:11PM +0800, Xiao Guangrong wrote:
 The only difference between FNAME(update_pte) and FNAME(pte_prefetch)
 is that the former is allowed to prefetch gfn from dirty logged slot,
 so introduce a common function to prefetch spte
 
 Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
 ---
  arch/x86/kvm/paging_tmpl.h |   55 +++
  1 files changed, 24 insertions(+), 31 deletions(-)
 
 diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
 index 36a80ed..f887e4c 100644
 --- a/arch/x86/kvm/paging_tmpl.h
 +++ b/arch/x86/kvm/paging_tmpl.h
 @@ -305,31 +305,43 @@ static int FNAME(walk_addr_nested)(struct guest_walker 
 *walker,
   addr, access);
  }
 
 -static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 -   u64 *spte, const void *pte)
 +static bool
 +FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 +  u64 *spte, pt_element_t gpte, bool no_dirty_log)
  {
 - pt_element_t gpte;
   unsigned pte_access;
 + gfn_t gfn;
   pfn_t pfn;
 
 - gpte = *(const pt_element_t *)pte;
   if (prefetch_invalid_gpte(vcpu, sp, spte, gpte))
 - return;
 + return false;
 
   pgprintk(%s: gpte %llx spte %p\n, __func__, (u64)gpte, spte);
 +
 + gfn = gpte_to_gfn(gpte);
   pte_access = sp-role.access  gpte_access(vcpu, gpte);
   protect_clean_gpte(pte_access, gpte);
 - pfn = gfn_to_pfn_atomic(vcpu-kvm, gpte_to_gfn(gpte));
 + pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
 + no_dirty_log  (pte_access  ACC_WRITE_MASK));

Is this a bugfix?


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] s390/kvm: Add documentation for KVM_S390_INTERRUPT.

2012-10-10 Thread Alexander Graf

On 10.10.2012, at 18:27, Marcelo Tosatti wrote:

 On Tue, Oct 02, 2012 at 04:25:37PM +0200, Christian Borntraeger wrote:
 From: Cornelia Huck cornelia.h...@de.ibm.com
 
 Signed-off-by: Cornelia Huck cornelia.h...@de.ibm.com
 Signed-off-by: Martin Schwidefsky schwidef...@de.ibm.com
 Signed-off-by: Christian Borntraeger borntrae...@de.ibm.com
 ---
 Documentation/virtual/kvm/api.txt | 33 +
 1 file changed, 33 insertions(+)
 
 diff --git a/Documentation/virtual/kvm/api.txt 
 b/Documentation/virtual/kvm/api.txt
 index 36befa7..c58 100644
 --- a/Documentation/virtual/kvm/api.txt
 +++ b/Documentation/virtual/kvm/api.txt
 @@ -1984,6 +1984,39 @@ return the hash table order in the parameter.  (If 
 the guest is using
 the virtualized real-mode area (VRMA) facility, the kernel will
 re-create the VMRA HPTEs on the next KVM_RUN of any vcpu.)
 
 +4.77 KVM_S390_INTERRUPT
 +
 +Capability: basic
 +Archtectures: s390
 
 typo.
 
 +Type: vm ioctl, vcpu ioctl
 +Parameters: struct kvm_s390_interrupt (in)
 +Returns: 0 on success, -1 on error
 +
 +Allows to inject an interrupt to the guest. Interrupts can be floating
 +(vm ioctl) or per cpu (vcpu ioctl), depending on the interrupt type.
 +
 +Interrupt parameters are passed via kvm_s390_interrupt:
 +
 +struct kvm_s390_interrupt {
 +__u32 type;
 +__u32 parm;
 +__u64 parm64;
 +};
 
 No need for a reserved area for extensibility?

This is documentation for an existing ioctl, no?
And yes, a reserved area would've been nice ;). But it's too late for that one 
now.


Alex

 
 +
 +type can be one of the following:
 +
 +KVM_S390_SIGP_STOP (vcpu) - sigp restart
 +KVM_S390_PROGRAM_INT (vcpu) - program check; code in parm
 +KVM_S390_SIGP_SET_PREFIX (vcpu) - sigp set prefix; prefix address in parm
 +KVM_S390_RESTART (vcpu) - restart
 +KVM_S390_INT_VIRTIO (vm) - virtio external interrupt; external interrupt
 +   parameters in parm and parm64
 +KVM_S390_INT_SERVICE (vm) - sclp external interrupt; sclp parameter in parm
 +KVM_S390_INT_EMERGENCY (vcpu) - sigp emergency; source cpu in parm
 +KVM_S390_INT_EXTERNAL_CALL (vcpu) - sigp external call; source cpu in parm
 +
 +Note that the vcpu ioctl is asynchronous to vpcu execution.
 
 typo
 
 

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] s390/kvm: Add documentation for KVM_S390_INTERRUPT.

2012-10-10 Thread Cornelia Huck
On Wed, 10 Oct 2012 13:27:09 -0300
Marcelo Tosatti mtosa...@redhat.com wrote:

  +Archtectures: s390
 
 typo.

  +Note that the vcpu ioctl is asynchronous to vpcu execution.
 
 typo
 
 

Would you like a respin with the typos fixed?

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] s390/kvm: Add documentation for KVM_S390_INTERRUPT.

2012-10-10 Thread Cornelia Huck
On Wed, 10 Oct 2012 18:32:14 +0200
Alexander Graf ag...@suse.de wrote:

 
 On 10.10.2012, at 18:27, Marcelo Tosatti wrote:
 
  On Tue, Oct 02, 2012 at 04:25:37PM +0200, Christian Borntraeger wrote:
  From: Cornelia Huck cornelia.h...@de.ibm.com
  
  Signed-off-by: Cornelia Huck cornelia.h...@de.ibm.com
  Signed-off-by: Martin Schwidefsky schwidef...@de.ibm.com
  Signed-off-by: Christian Borntraeger borntrae...@de.ibm.com
  ---
  Documentation/virtual/kvm/api.txt | 33 +
  1 file changed, 33 insertions(+)
  
  diff --git a/Documentation/virtual/kvm/api.txt 
  b/Documentation/virtual/kvm/api.txt
  index 36befa7..c58 100644
  --- a/Documentation/virtual/kvm/api.txt
  +++ b/Documentation/virtual/kvm/api.txt
  @@ -1984,6 +1984,39 @@ return the hash table order in the parameter.  (If 
  the guest is using
  the virtualized real-mode area (VRMA) facility, the kernel will
  re-create the VMRA HPTEs on the next KVM_RUN of any vcpu.)
  
  +4.77 KVM_S390_INTERRUPT
  +
  +Capability: basic
  +Archtectures: s390
  
  typo.
  
  +Type: vm ioctl, vcpu ioctl
  +Parameters: struct kvm_s390_interrupt (in)
  +Returns: 0 on success, -1 on error
  +
  +Allows to inject an interrupt to the guest. Interrupts can be floating
  +(vm ioctl) or per cpu (vcpu ioctl), depending on the interrupt type.
  +
  +Interrupt parameters are passed via kvm_s390_interrupt:
  +
  +struct kvm_s390_interrupt {
  +  __u32 type;
  +  __u32 parm;
  +  __u64 parm64;
  +};
  
  No need for a reserved area for extensibility?
 
 This is documentation for an existing ioctl, no?
 And yes, a reserved area would've been nice ;). But it's too late for that 
 one now.

Indeed. We'll have to keep this for backward compatibility and introduce
a new mechanism for larger interrupt structures.

 
 
 Alex
 
  
  +
  +type can be one of the following:
  +
  +KVM_S390_SIGP_STOP (vcpu) - sigp restart
  +KVM_S390_PROGRAM_INT (vcpu) - program check; code in parm
  +KVM_S390_SIGP_SET_PREFIX (vcpu) - sigp set prefix; prefix address in parm
  +KVM_S390_RESTART (vcpu) - restart
  +KVM_S390_INT_VIRTIO (vm) - virtio external interrupt; external interrupt
  + parameters in parm and parm64
  +KVM_S390_INT_SERVICE (vm) - sclp external interrupt; sclp parameter in 
  parm
  +KVM_S390_INT_EMERGENCY (vcpu) - sigp emergency; source cpu in parm
  +KVM_S390_INT_EXTERNAL_CALL (vcpu) - sigp external call; source cpu in parm
  +
  +Note that the vcpu ioctl is asynchronous to vpcu execution.
  
  typo
  
  
 

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [RFC v2 4/6] hw/kvm/arm_gic: Implement support for KVM in-kernel ARM GIC

2012-10-10 Thread Andreas Färber
Am 10.10.2012 17:07, schrieb Peter Maydell:
 Implement support for using the KVM in-kernel GIC for ARM.
 
 Signed-off-by: Peter Maydell peter.mayd...@linaro.org
 ---
  hw/a15mpcore.c   |8 ++-
  hw/arm/Makefile.objs |1 +
  hw/kvm/arm_gic.c |  162 
 ++
  3 files changed, 170 insertions(+), 1 deletion(-)
  create mode 100644 hw/kvm/arm_gic.c
 
 diff --git a/hw/a15mpcore.c b/hw/a15mpcore.c
 index fc0a02a..a37fc61 100644
 --- a/hw/a15mpcore.c
 +++ b/hw/a15mpcore.c
 @@ -19,6 +19,7 @@
   */
  
  #include sysbus.h
 +#include kvm.h
  
  /* A15MP private memory region.  */
  
 @@ -41,7 +42,12 @@ static int a15mp_priv_init(SysBusDevice *dev)
  A15MPPrivState *s = FROM_SYSBUS(A15MPPrivState, dev);
  SysBusDevice *busdev;
  
 -s-gic = qdev_create(NULL, arm_gic);
 +if (kvm_irqchip_in_kernel()) {
 +s-gic = qdev_create(NULL, kvm-arm_gic);
 +} else {
 +s-gic = qdev_create(NULL, arm_gic);
 +}

You could follow x86 more closely by just selecting apic_type. (Then if
we drop/change qdev_create() at some point it's one change less.)

 +
  qdev_prop_set_uint32(s-gic, num-cpu, s-num_cpu);
  qdev_prop_set_uint32(s-gic, num-irq, s-num_irq);
  qdev_prop_set_uint32(s-gic, revision, 2);
[...]
 diff --git a/hw/kvm/arm_gic.c b/hw/kvm/arm_gic.c
 new file mode 100644
 index 000..3ec538d
 --- /dev/null
 +++ b/hw/kvm/arm_gic.c
 @@ -0,0 +1,162 @@
 +/*
 + * ARM Generic Interrupt Controller using KVM in-kernel support
 + *
 + * Copyright (c) 2012 Linaro Limited
 + * Written by Peter Maydell
 + *
 + * This program is free software; you can redistribute it and/or modify
 + * it under the terms of the GNU General Public License as published by
 + * the Free Software Foundation, either version 2 of the License, or
 + * (at your option) any later version.
 + *
 + * This program is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 + * GNU General Public License for more details.
 + *
 + * You should have received a copy of the GNU General Public License along
 + * with this program; if not, see http://www.gnu.org/licenses/.
 + */
 +
 +#include hw/sysbus.h
 +#include kvm.h
 +#include hw/arm_gic_internal.h
 +
 +#define TYPE_KVM_ARM_GIC kvm-arm_gic

kvm-arm-gic?

 +#define KVM_ARM_GIC(obj) \
 + OBJECT_CHECK(gic_state, (obj), TYPE_KVM_ARM_GIC)
 +#define KVM_ARM_GIC_CLASS(klass) \
 + OBJECT_CLASS_CHECK(KVMARMGICClass, (klass), TYPE_KVM_ARM_GIC)
 +#define KVM_ARM_GIC_GET_CLASS(obj) \
 + OBJECT_GET_CLASS(KVMARMGICClass, (obj), TYPE_KVM_ARM_GIC)
 +
 +typedef struct KVMARMGICClass {
 +ARMGICCommonClass parent_class;
 +int (*parent_init)(SysBusDevice *dev);
 +void (*parent_reset)(DeviceState *dev);
 +} KVMARMGICClass;
 +
 +static void kvm_arm_gic_set_irq(void *opaque, int irq, int level)
 +{
 +/* Meaning of the 'irq' parameter:
 + *  [0..N-1] : external interrupts
 + *  [N..N+31] : PPI (internal) interrupts for CPU 0
 + *  [N+32..N+63] : PPI (internal interrupts for CPU 1
 + *  ...
 + * Convert this to the kernel's desired encoding, which
 + * has separate fields in the irq number for type,
 + * CPU number and interrupt number.
 + */
 +gic_state *s = (gic_state *)opaque;

You don't happen to have time to clean up gic_state - GICState before
spreading its use? ;)

[...]
 +static TypeInfo arm_gic_info = {

static const

 +.name = TYPE_KVM_ARM_GIC,
 +.parent = TYPE_ARM_GIC_COMMON,
 +.instance_size = sizeof(gic_state),
 +.class_init = kvm_arm_gic_class_init,
 +.class_size = sizeof(KVMARMGICClass),
 +};
 +
 +static void arm_gic_register_types(void)

kvm_arm_gic_register_types?

 +{
 +type_register_static(arm_gic_info);
 +}
 +
 +type_init(arm_gic_register_types)
 

Andreas

-- 
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany
GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer; HRB 16746 AG Nürnberg
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [RFC v2 3/6] hw/arm_gic: Add presave/postload hooks

2012-10-10 Thread Andreas Färber
Am 10.10.2012 17:07, schrieb Peter Maydell:
 Add presave/postload hooks to the ARM GIC common base class.
 These will be used by the KVM in-kernel GIC subclass to sync
 state between kernel and userspace when migrating.
 
 Signed-off-by: Peter Maydell peter.mayd...@linaro.org

Reviewed-by: Andreas Färber afaer...@suse.de

/-F

-- 
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany
GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer; HRB 16746 AG Nürnberg
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [RFC v2 4/6] hw/kvm/arm_gic: Implement support for KVM in-kernel ARM GIC

2012-10-10 Thread Peter Maydell
On 10 October 2012 18:23, Andreas Färber afaer...@suse.de wrote:
 Am 10.10.2012 17:07, schrieb Peter Maydell:
 Implement support for using the KVM in-kernel GIC for ARM.

 Signed-off-by: Peter Maydell peter.mayd...@linaro.org
 ---
  hw/a15mpcore.c   |8 ++-
  hw/arm/Makefile.objs |1 +
  hw/kvm/arm_gic.c |  162 
 ++
  3 files changed, 170 insertions(+), 1 deletion(-)
  create mode 100644 hw/kvm/arm_gic.c

 diff --git a/hw/a15mpcore.c b/hw/a15mpcore.c
 index fc0a02a..a37fc61 100644
 --- a/hw/a15mpcore.c
 +++ b/hw/a15mpcore.c
 @@ -19,6 +19,7 @@
   */

  #include sysbus.h
 +#include kvm.h

  /* A15MP private memory region.  */

 @@ -41,7 +42,12 @@ static int a15mp_priv_init(SysBusDevice *dev)
  A15MPPrivState *s = FROM_SYSBUS(A15MPPrivState, dev);
  SysBusDevice *busdev;

 -s-gic = qdev_create(NULL, arm_gic);
 +if (kvm_irqchip_in_kernel()) {
 +s-gic = qdev_create(NULL, kvm-arm_gic);
 +} else {
 +s-gic = qdev_create(NULL, arm_gic);
 +}

 You could follow x86 more closely by just selecting apic_type. (Then if
 we drop/change qdev_create() at some point it's one change less.)

Good idea.


 +
  qdev_prop_set_uint32(s-gic, num-cpu, s-num_cpu);
  qdev_prop_set_uint32(s-gic, num-irq, s-num_irq);
  qdev_prop_set_uint32(s-gic, revision, 2);
 [...]
 diff --git a/hw/kvm/arm_gic.c b/hw/kvm/arm_gic.c
 new file mode 100644
 index 000..3ec538d
 --- /dev/null
 +++ b/hw/kvm/arm_gic.c
 @@ -0,0 +1,162 @@
 +/*
 + * ARM Generic Interrupt Controller using KVM in-kernel support
 + *
 + * Copyright (c) 2012 Linaro Limited
 + * Written by Peter Maydell
 + *
 + * This program is free software; you can redistribute it and/or modify
 + * it under the terms of the GNU General Public License as published by
 + * the Free Software Foundation, either version 2 of the License, or
 + * (at your option) any later version.
 + *
 + * This program is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 + * GNU General Public License for more details.
 + *
 + * You should have received a copy of the GNU General Public License along
 + * with this program; if not, see http://www.gnu.org/licenses/.
 + */
 +
 +#include hw/sysbus.h
 +#include kvm.h
 +#include hw/arm_gic_internal.h
 +
 +#define TYPE_KVM_ARM_GIC kvm-arm_gic

 kvm-arm-gic?

I was trying to follow arm_gic but yes, I guess it's better
to go with the recommended style (which is all-hyphens, right?)

 +#define KVM_ARM_GIC(obj) \
 + OBJECT_CHECK(gic_state, (obj), TYPE_KVM_ARM_GIC)
 +#define KVM_ARM_GIC_CLASS(klass) \
 + OBJECT_CLASS_CHECK(KVMARMGICClass, (klass), TYPE_KVM_ARM_GIC)
 +#define KVM_ARM_GIC_GET_CLASS(obj) \
 + OBJECT_GET_CLASS(KVMARMGICClass, (obj), TYPE_KVM_ARM_GIC)
 +
 +typedef struct KVMARMGICClass {
 +ARMGICCommonClass parent_class;
 +int (*parent_init)(SysBusDevice *dev);
 +void (*parent_reset)(DeviceState *dev);
 +} KVMARMGICClass;
 +
 +static void kvm_arm_gic_set_irq(void *opaque, int irq, int level)
 +{
 +/* Meaning of the 'irq' parameter:
 + *  [0..N-1] : external interrupts
 + *  [N..N+31] : PPI (internal) interrupts for CPU 0
 + *  [N+32..N+63] : PPI (internal interrupts for CPU 1
 + *  ...
 + * Convert this to the kernel's desired encoding, which
 + * has separate fields in the irq number for type,
 + * CPU number and interrupt number.
 + */
 +gic_state *s = (gic_state *)opaque;

 You don't happen to have time to clean up gic_state - GICState before
 spreading its use? ;)

I can submit a patch if you like...

 [...]
 +static TypeInfo arm_gic_info = {

 static const

Yep.

 +.name = TYPE_KVM_ARM_GIC,
 +.parent = TYPE_ARM_GIC_COMMON,
 +.instance_size = sizeof(gic_state),
 +.class_init = kvm_arm_gic_class_init,
 +.class_size = sizeof(KVMARMGICClass),
 +};
 +
 +static void arm_gic_register_types(void)

 kvm_arm_gic_register_types?

Oops, yes, cut-n-pasto.

-- PMM
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC 1/2] kvm: Handle undercommitted guest case in PLE handler

2012-10-10 Thread Raghavendra K T

On 10/10/2012 07:54 PM, Andrew Theurer wrote:

I ran 'perf sched map' on the dbench workload for medium and large VMs,
and I thought I would share some of the results.  I think it helps to
visualize what's going on regarding the yielding.

These files are png bitmaps, generated from processing output from 'perf
sched map' (and perf data generated from 'perf sched record').  The Y
axis is the host cpus, each row being 10 pixels high.  For these tests,
there are 80 host cpus, so the total height is 800 pixels.  The X axis
is time (in microseconds), with each pixel representing 1 microsecond.
Each bitmap plots 30,000 microseconds.  The bitmaps are quite wide
obviously, and zooming in/out while viewing is recommended.

Each row (each host cpu) is assigned a color based on what thread is
running.  vCPUs of the same VM are assigned a common color (like red,
blue, magenta, etc), and each vCPU has a unique brightness for that
color.  There are a maximum of 12 assignable colors, so in any VMs 12
revert to vCPU color of gray. I would use more colors, but it becomes
harder to distinguish one color from another.  The white color
represents missing data from perf, and black color represents any thread
which is not a vCPU.

For the following tests, VMs were pinned to host NUMA nodes and to
specific cpus to help with consistency and operate within the
constraints of the last test (gang scheduler).

Here is a good example of PLE.  These are 10-way VMs, 16 of them (as
described above only 12 of the VMs have a color, rest are gray).

https://docs.google.com/open?id=0B6tfUNlZ-14wdmFqUmE5QjJHMFU


This looks very nice to visualize what is happening. Beginning of the 
graph looks little messy but later it is clear.




If you zoom out and look at the whole bitmap, you may notice the 4ms
intervals of the scheduler.  They are pretty well aligned across all
cpus.  Normally, for cpu bound workloads, we would expect to see each
thread to run for 4 ms, then something else getting to run, and so on.
That is mostly true in this test.  We have 2x over-commit and we
generally see the switching of threads at 4ms.  One thing to note is
that not all vCPU threads for the same VM run at exactly the same time,
and that is expected and the whole reason for lock-holder preemption.
Now, if you zoom in on the bitmap, you should notice within the 4ms
intervals there is some task switching going on.  This is most likely
because of the yield_to initiated by the PLE handler.  In this case
there is not that much yielding to do.   It's quite clean, and the
performance is quite good.

Below is an example of PLE, but this time with 20-way VMs, 8 of them.
CPU over-commit is still 2x.

https://docs.google.com/open?id=0B6tfUNlZ-14wdmFqUmE5QjJHMFU


I think this link still 10x16. Could you paste the link again?



This one looks quite different.  In short, it's a mess.  The switching
between tasks can be lower than 10 microseconds.  It basically never
recovers.  There is constant yielding all the time.

Below is again 8 x 20-way VMs, but this time I tried out Nikunj's gang
scheduling patches.  While I am not recommending gang scheduling, I
think it's a good data point.  The performance is 3.88x the PLE result.

https://docs.google.com/open?id=0B6tfUNlZ-14wWXdscWcwNTVEY3M

Note that the task switching intervals of 4ms are quite obvious again,
and this time all vCPUs from same VM run at the same time.  It
represents the best possible outcome.


Anyway, I thought the bitmaps might help better visualize what's going
on.

-Andrew






--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC 1/2] kvm: Handle undercommitted guest case in PLE handler

2012-10-10 Thread Raghavendra K T

On 10/10/2012 08:29 AM, Andrew Theurer wrote:

On Wed, 2012-10-10 at 00:21 +0530, Raghavendra K T wrote:

* Avi Kivity a...@redhat.com [2012-10-04 17:00:28]:


On 10/04/2012 03:07 PM, Peter Zijlstra wrote:

On Thu, 2012-10-04 at 14:41 +0200, Avi Kivity wrote:


Again the numbers are ridiculously high for arch_local_irq_restore.
Maybe there's a bad perf/kvm interaction when we're injecting an
interrupt, I can't believe we're spending 84% of the time running the
popf instruction.


Smells like a software fallback that doesn't do NMI, hrtimer based
sampling typically hits popf where we re-enable interrupts.


Good nose, that's probably it.  Raghavendra, can you ensure that the PMU
is properly exposed?  'dmesg' in the guest will tell.  If it isn't, -cpu
host will expose it (and a good idea anyway to get best performance).



Hi Avi, you are right. SandyBridge machine result was not proper.
I cleaned up the services, enabled PMU, re-ran all the test again.

Here is the summary:
We do get good benefit by increasing ple window. Though we don't
see good benefit for kernbench and sysbench, for ebizzy, we get huge
improvement for 1x scenario. (almost 2/3rd of ple disabled case).

Let me know if you think we can increase the default ple_window
itself to 16k.

I am experimenting with V2 version of undercommit improvement(this) patch
series, But I think if you wish  to go for increase of
default ple_window, then we would have to measure the benefit of patches
when ple_window = 16k.

I can respin the whole series including this default ple_window change.

I also have the perf kvm top result for both ebizzy and kernbench.
I think they are in expected lines now.

Improvements


16 core PLE machine with 16 vcpu guest

base = 3.6.0-rc5 + ple handler optimization patches
base_pleopt_16k = base + ple_window = 16k
base_pleopt_32k = base + ple_window = 32k
base_pleopt_nople = base + ple_gap = 0
kernbench, hackbench, sysbench (time in sec lower is better)
ebizzy (rec/sec higher is better)

% improvements w.r.t base (ple_window = 4k)
---+---+-+---+
|base_pleopt_16k| base_pleopt_32k | base_pleopt_nople |
---+---+-+---+
kernbench_1x   |  0.42371  |  1.15164|   0.09320 |
kernbench_2x   | -1.40981  | -17.48282   |  -570.77053   |
---+---+-+---+
sysbench_1x| -0.92367  | 0.24241 | -0.27027  |
sysbench_2x| -2.22706  |-0.30896 | -1.27573  |
sysbench_3x| -0.75509  | 0.09444 | -2.97756  |
---+---+-+---+
ebizzy_1x  | 54.99976  | 67.29460|  74.14076 |
ebizzy_2x  | -8.83386  |-27.38403| -96.22066 |
---+---+-+---+

perf kvm top observation for kernbench and ebizzy (nople, 4k, 32k window)



Is the perf data for 1x overcommit?


Yes, 16vcpu guest on 16 core




pleopt   ple_gap=0

ebizzy : 18131 records/s
63.78%  [guest.kernel]  [g] _raw_spin_lock_irqsave
 5.65%  [guest.kernel]  [g] smp_call_function_many
 3.12%  [guest.kernel]  [g] clear_page
 3.02%  [guest.kernel]  [g] down_read_trylock
 1.85%  [guest.kernel]  [g] async_page_fault
 1.81%  [guest.kernel]  [g] up_read
 1.76%  [guest.kernel]  [g] native_apic_mem_write
 1.70%  [guest.kernel]  [g] find_vma


Does 'perf kvm top' not give host samples at the same time?  Would be
nice to see the host overhead as a function of varying ple window.  I
would expect that to be the major difference between 4/16/32k window
sizes.


No, I did something like this
perf kvm  --guestvmlinux ./vmlinux.guest top -g  -U -d 3. Yes that is a
good idea.

(I am getting some segfaults with perf top, I think it is already fixed
but yet to see the patch that fixes)





A big concern I have (if this is 1x overcommit) for ebizzy is that it
has just terrible scalability to begin with.  I do not think we should
try to optimize such a bad workload.



I think my way of running dbench has some flaw, so I went to ebizzy.
Could you let me know how you generally run dbench?

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC 1/2] kvm: Handle undercommitted guest case in PLE handler

2012-10-10 Thread David Ahern

On 10/10/12 11:54 AM, Raghavendra K T wrote:

No, I did something like this
perf kvm  --guestvmlinux ./vmlinux.guest top -g  -U -d 3. Yes that is a
good idea.

(I am getting some segfaults with perf top, I think it is already fixed
but yet to see the patch that fixes)


What version of perf:  perf --version


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] virt/kvm: change kvm_assign_device() to print return value when iommu_attach_device() fails

2012-10-10 Thread Marcelo Tosatti
On Mon, Oct 08, 2012 at 06:36:11PM -0600, Shuah Khan wrote:
 Change existing kernel error message to include return value from
 iommu_attach_device() when it fails. This will help debug device
 assignment failures more effectively.
 
 Signed-off-by: Shuah Khan shuah.k...@hp.com
 ---
  virt/kvm/iommu.c |6 +-
  1 file changed, 1 insertion(+), 5 deletions(-)
 
 diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
 index 037cb67..18e1e30 100644
 --- a/virt/kvm/iommu.c
 +++ b/virt/kvm/iommu.c
 @@ -168,11 +168,7 @@ int kvm_assign_device(struct kvm *kvm,
  
   r = iommu_attach_device(domain, pdev-dev);
   if (r) {
 - printk(KERN_ERR assign device %x:%x:%x.%x failed,
 - pci_domain_nr(pdev-bus),
 - pdev-bus-number,
 - PCI_SLOT(pdev-devfn),
 - PCI_FUNC(pdev-devfn));
 + dev_err(pdev-dev, kvm assign device failed ret %d, r);
   return r;
   }

Why removal of domain,bus,slot,func from the message?

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/3] KVM: Add KVM_VCPU_GET_REG_LIST/KVM_CAP_REG_LIST.

2012-10-10 Thread Marcelo Tosatti
On Wed, Sep 05, 2012 at 05:28:46PM +0930, Rusty Russell wrote:
 This is a generic interface to find out what you can use
 KVM_GET_ONE_REG/KVM_SET_ONE_REG on.  Archs need to define
 KVM_HAVE_REG_LIST and then kvm_arch_num_regs() and
 kvm_arch_copy_reg_indices() functions.
 
 It's inspired by KVM_GET_MSR_INDEX_LIST, except it's a per-vcpu ioctl,
 and uses 64-bit indices.
 
 Signed-off-by: Rusty Russell rusty.russ...@linaro.org
 ---
  Documentation/virtual/kvm/api.txt |   20 
  include/linux/kvm.h   |   12 
  include/linux/kvm_host.h  |5 -
  virt/kvm/kvm_main.c   |   20 
  4 files changed, 56 insertions(+), 1 deletion(-)
 
 diff --git a/Documentation/virtual/kvm/api.txt 
 b/Documentation/virtual/kvm/api.txt
 index b91bfd4..f30c3d0 100644
 --- a/Documentation/virtual/kvm/api.txt
 +++ b/Documentation/virtual/kvm/api.txt
 @@ -1985,6 +1985,26 @@ the virtualized real-mode area (VRMA) facility, the 
 kernel will
  re-create the VMRA HPTEs on the next KVM_RUN of any vcpu.)
  
  
 +4.76 KVM_VCPU_GET_REG_LIST
 +
 +Capability: KVM_CAP_REG_LIST
 +Architectures: all
 +Type: vcpu ioctl
 +Parameters: struct kvm_reg_list (in/out)
 +Returns: 0 on success; -1 on error
 +Errors:
 +  E2BIG: the reg index list is too big to fit in the array specified by
 + the user (the number required will be written into n).
 +
 +struct kvm_reg_list {
 + __u64 n; /* number of registers in reg[] */
 + __u64 reg[0];
 +};

 diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
 index 169a001..453fe93 100644
 --- a/virt/kvm/kvm_main.c
 +++ b/virt/kvm/kvm_main.c
 @@ -2082,6 +2082,23 @@ out_free2:
   break;
   }
  #endif
 +#ifdef KVM_HAVE_REG_LIST
 + case KVM_VCPU_GET_REG_LIST: {
 + struct kvm_reg_list __user *user_list = argp;
 + struct kvm_reg_list reg_list;
 + unsigned n;
 +
 + if (copy_from_user(reg_list, user_list, sizeof reg_list))
 + return -EFAULT;
 + n = reg_list.n;
 + reg_list.n = kvm_arch_num_regs(vcpu);

The code does not actually support more than 2^32 registers, does it?
Why __u64 n ?


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC 1/2] kvm: Handle undercommitted guest case in PLE handler

2012-10-10 Thread Raghavendra K T

On 10/10/2012 11:33 PM, David Ahern wrote:

On 10/10/12 11:54 AM, Raghavendra K T wrote:

No, I did something like this
perf kvm  --guestvmlinux ./vmlinux.guest top -g  -U -d 3. Yes that is a
good idea.

(I am getting some segfaults with perf top, I think it is already fixed
but yet to see the patch that fixes)


What version of perf:  perf --version



perf version 2.6.32-279.el6.x86_64.debug

(I searched that it is fixed in 288. could not dig-out actual patch
though)

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] virt/kvm: change kvm_assign_device() to print return value when iommu_attach_device() fails

2012-10-10 Thread Shuah Khan
On Wed, Oct 10, 2012 at 10:51 AM, Marcelo Tosatti mtosa...@redhat.com wrote:
 On Mon, Oct 08, 2012 at 06:36:11PM -0600, Shuah Khan wrote:
 Change existing kernel error message to include return value from
 iommu_attach_device() when it fails. This will help debug device
 assignment failures more effectively.

 Signed-off-by: Shuah Khan shuah.k...@hp.com
 ---
  virt/kvm/iommu.c |6 +-
  1 file changed, 1 insertion(+), 5 deletions(-)

 diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
 index 037cb67..18e1e30 100644
 --- a/virt/kvm/iommu.c
 +++ b/virt/kvm/iommu.c
 @@ -168,11 +168,7 @@ int kvm_assign_device(struct kvm *kvm,

   r = iommu_attach_device(domain, pdev-dev);
   if (r) {
 - printk(KERN_ERR assign device %x:%x:%x.%x failed,
 - pci_domain_nr(pdev-bus),
 - pdev-bus-number,
 - PCI_SLOT(pdev-devfn),
 - PCI_FUNC(pdev-devfn));
 + dev_err(pdev-dev, kvm assign device failed ret %d, r);
   return r;
   }

 Why removal of domain,bus,slot,func from the message?


dev_err() includes that information like this:

pci :00:1d.0: assign device failed ret 0

Without the removal of domain,bus,slot,func from the message:

It will look like:

pci :00:1d.0: assign device 0:0:1d.0 failed ret 0

Initially I left it in there and after testing, realized it is
duplicate information and decided to remove it.

-- Shuah
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] s390/kvm: Add documentation for KVM_S390_INTERRUPT.

2012-10-10 Thread Marcelo Tosatti
On Wed, Oct 10, 2012 at 06:37:57PM +0200, Cornelia Huck wrote:
 On Wed, 10 Oct 2012 13:27:09 -0300
 Marcelo Tosatti mtosa...@redhat.com wrote:
 
   +Archtectures: s390
  
  typo.
 
   +Note that the vcpu ioctl is asynchronous to vpcu execution.
  
  typo
  
  
 
 Would you like a respin with the typos fixed?

No.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] virt/kvm: change kvm_assign_device() to print return value when iommu_attach_device() fails

2012-10-10 Thread Marcelo Tosatti
On Mon, Oct 08, 2012 at 06:36:11PM -0600, Shuah Khan wrote:
 Change existing kernel error message to include return value from
 iommu_attach_device() when it fails. This will help debug device
 assignment failures more effectively.
 
 Signed-off-by: Shuah Khan shuah.k...@hp.com

Applied, thanks.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 00/14] KVM/ARM Implementation

2012-10-10 Thread Marcelo Tosatti
On Mon, Oct 01, 2012 at 05:09:59AM -0400, Christoffer Dall wrote:
 The following series implements KVM support for ARM processors,
 specifically on the Cortex A-15 platform.  We feel this is ready to be
 merged.
 
 Work is done in collaboration between Columbia University, Virtual Open
 Systems and ARM/Linaro.
 
 The patch series applies to Linux 3.6 with a number of merges:
  1. git://git.kernel.org/pub/scm/linux/kernel/git/maz/arm-platforms.git
 branch: hyp-mode-boot-next (e5a04cb0b4a)
  2. git://git.kernel.org/pub/scm/linux/kernel/git/maz/arm-platforms.git
 branch: timers-next (437814c44c)
  3. git://git.kernel.org/pub/scm/virt/kvm/kvm.git
 branch: next (1e08ec4a)
 
 This is Version 12 of the patch series, the first 10 versions were
 reviewed on the KVM/ARM and KVM mailing lists. Changes can also be
 pulled from:
 git://github.com/virtualopensystems/linux-kvm-arm.git
 branch: kvm-arm-v12
 branch: kvm-arm-v12-vgic
 branch: kvm-arm-v12-vgic-timers
 
 A non-flattened edition of the patch series, which can always be merged,
 can be found at:
  git://github.com/virtualopensystems/linux-kvm-arm.git kvm-arm-master
 
 This patch series requires QEMU compatibility.  Use the branch
  git://github.com/virtualopensystems/qemu.git kvm-arm
 
 Following this patch series, which implements core KVM support are two
 other patch series implementing Virtual Generic Interrupt Controller
 (VGIC) support and Architected Generic Timers.  All three patch series
 should be applied for full QEMU compatibility.
 
 The implementation is broken up into a logical set of patches, the first
 are preparatory patches:
   1. ARM: Add page table defines for KVM
   3. ARM: Section based HYP idmaps
   3. ARM: Factor out cpuid implementor and part_number fields
 
 The main implementation is broken up into separate patches, the first
 containing a skeleton of files, makefile changes, the basic user space
 interface and KVM architecture specific stubs.  Subsequent patches
 implement parts of the system as listed:
   4. Skeleton and reset hooks
   5. Hypervisor initialization
   6. Memory virtualization setup (hyp mode mappings and 2nd stage)
   7. Inject IRQs and FIQs from userspace
   8. World-switch implementation and Hyp exception vectors
   9. Emulation framework and coproc emulation
  10. Coproc user space API
  11. Demux multiplexed coproc registers
  12. User spac API to get/set VFP registers
  13. Handle guest user memory aborts
  14. Handle guest MMIO aborts
 
 Testing:
  Tested on FAST Models and Versatile Express test-chip2.  Tested by
  running three simultaenous VMs, all running SMP, on an SMP host, each
  VM running hackbench and cyclictest and with extreme memory pressure
  applied to the host with swapping enabled to provoke page eviction.
  Also tested KSM merging and GCC inside VMs.  Fully boots both Ubuntu
  (user space Thumb-2) and Debian (user space ARM) guests.
 
 For a guide on how to set up a testing environment and try out these
 patches, see:
  http://www.virtualopensystems.com/media/pdf/kvm-arm-guide.pdf
 
 Changes since v11:
  - Memory setup and page table defines reworked
  - We do not export unused perf bitfields anymore
  - No module support anymore and following cleanup
  - Hide vcpu register accessors
  - Fix unmap range mmu notifier race condition
  - Factored out A15 coprocs in separate file
  - Factored out world-switch assembly macros to separate file
  - Add dmux of multiplexed coprocs to user space
  - Add VFP get/set interface to user space
  - Addressed various cleanup comments from reviewers
 
 Changes since v10:
  - Boot in Hyp mode and user HVC to initialize HVBAR
  - Support VGIC
  - Support Arch timers
  - Support Thumb-2 mmio instruction decoding
  - Transition to GET_ONE/SET_ONE register API
  - Added KVM_VCPU_GET_REG_LIST
  - New interrupt injection API
  - Don't pin guest pages anymore
  - Fix race condition in page fault handler
  - Cleanup guest instruction copying.
  - Fix race when copying SMP guest instructions
  - Inject data/prefetch aborts when guest does something strange
 
 Changes since v9:
  - Addressed reviewer comments (see mailing list archive)
  - Limit the user of .arch_extensiion sec/virt for compilers that need them
  - VFP/Neon Support (Antonios Motakis)
  - Run exit handling under preemption and still handle guest cache ops
  - Add support for IO mapping at Hyp level (VGIC prep)
  - Add support for IO mapping at Guest level (VGIC prep)
  - Remove backdoor call to irq_svc
  - Complete rework of CP15 handling and register reset (Rusty Russell)
  - Don't use HSTR for anything else than CR 15
  - New ioctl to set emulation target core (only A15 supported for now)
  - Support KVM_GET_MSRS / KVM_SET_MSRS
  - Add page accounting and page table eviction
  - Change pgd lock to spinlock and fix sleeping in atomic bugs
  - Check kvm_condition_valid for HVC traps of undefs
  - Added a naive implementation of kvm_unmap_hva_range
 
 

Re: [PATCH 2/3] s390/kvm: Add documentation for KVM_S390_INTERRUPT.

2012-10-10 Thread Marcelo Tosatti
On Wed, Oct 10, 2012 at 03:43:10PM -0300, Marcelo Tosatti wrote:
 On Wed, Oct 10, 2012 at 06:37:57PM +0200, Cornelia Huck wrote:
  On Wed, 10 Oct 2012 13:27:09 -0300
  Marcelo Tosatti mtosa...@redhat.com wrote:
  
+Archtectures: s390
   
   typo.
  
+Note that the vcpu ioctl is asynchronous to vpcu execution.
   
   typo
   
   
  
  Would you like a respin with the typos fixed?
 
 No.

Applied, thanks.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC 1/2] kvm: Handle undercommitted guest case in PLE handler

2012-10-10 Thread Andrew Theurer
On Wed, 2012-10-10 at 23:13 +0530, Raghavendra K T wrote:
 On 10/10/2012 07:54 PM, Andrew Theurer wrote:
  I ran 'perf sched map' on the dbench workload for medium and large VMs,
  and I thought I would share some of the results.  I think it helps to
  visualize what's going on regarding the yielding.
 
  These files are png bitmaps, generated from processing output from 'perf
  sched map' (and perf data generated from 'perf sched record').  The Y
  axis is the host cpus, each row being 10 pixels high.  For these tests,
  there are 80 host cpus, so the total height is 800 pixels.  The X axis
  is time (in microseconds), with each pixel representing 1 microsecond.
  Each bitmap plots 30,000 microseconds.  The bitmaps are quite wide
  obviously, and zooming in/out while viewing is recommended.
 
  Each row (each host cpu) is assigned a color based on what thread is
  running.  vCPUs of the same VM are assigned a common color (like red,
  blue, magenta, etc), and each vCPU has a unique brightness for that
  color.  There are a maximum of 12 assignable colors, so in any VMs 12
  revert to vCPU color of gray. I would use more colors, but it becomes
  harder to distinguish one color from another.  The white color
  represents missing data from perf, and black color represents any thread
  which is not a vCPU.
 
  For the following tests, VMs were pinned to host NUMA nodes and to
  specific cpus to help with consistency and operate within the
  constraints of the last test (gang scheduler).
 
  Here is a good example of PLE.  These are 10-way VMs, 16 of them (as
  described above only 12 of the VMs have a color, rest are gray).
 
  https://docs.google.com/open?id=0B6tfUNlZ-14wdmFqUmE5QjJHMFU
 
 This looks very nice to visualize what is happening. Beginning of the 
 graph looks little messy but later it is clear.
 
 
  If you zoom out and look at the whole bitmap, you may notice the 4ms
  intervals of the scheduler.  They are pretty well aligned across all
  cpus.  Normally, for cpu bound workloads, we would expect to see each
  thread to run for 4 ms, then something else getting to run, and so on.
  That is mostly true in this test.  We have 2x over-commit and we
  generally see the switching of threads at 4ms.  One thing to note is
  that not all vCPU threads for the same VM run at exactly the same time,
  and that is expected and the whole reason for lock-holder preemption.
  Now, if you zoom in on the bitmap, you should notice within the 4ms
  intervals there is some task switching going on.  This is most likely
  because of the yield_to initiated by the PLE handler.  In this case
  there is not that much yielding to do.   It's quite clean, and the
  performance is quite good.
 
  Below is an example of PLE, but this time with 20-way VMs, 8 of them.
  CPU over-commit is still 2x.
 
  https://docs.google.com/open?id=0B6tfUNlZ-14wdmFqUmE5QjJHMFU
 
 I think this link still 10x16. Could you paste the link again?

Oops
https://docs.google.com/open?id=0B6tfUNlZ-14wSGtYYzZtRTcyVjQ

 
 
  This one looks quite different.  In short, it's a mess.  The switching
  between tasks can be lower than 10 microseconds.  It basically never
  recovers.  There is constant yielding all the time.
 
  Below is again 8 x 20-way VMs, but this time I tried out Nikunj's gang
  scheduling patches.  While I am not recommending gang scheduling, I
  think it's a good data point.  The performance is 3.88x the PLE result.
 
  https://docs.google.com/open?id=0B6tfUNlZ-14wWXdscWcwNTVEY3M
 
  Note that the task switching intervals of 4ms are quite obvious again,
  and this time all vCPUs from same VM run at the same time.  It
  represents the best possible outcome.
 
 
  Anyway, I thought the bitmaps might help better visualize what's going
  on.
 
  -Andrew
 
 
 
 
 


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC 1/2] kvm: Handle undercommitted guest case in PLE handler

2012-10-10 Thread Andrew Theurer
On Wed, 2012-10-10 at 23:24 +0530, Raghavendra K T wrote:
 On 10/10/2012 08:29 AM, Andrew Theurer wrote:
  On Wed, 2012-10-10 at 00:21 +0530, Raghavendra K T wrote:
  * Avi Kivity a...@redhat.com [2012-10-04 17:00:28]:
 
  On 10/04/2012 03:07 PM, Peter Zijlstra wrote:
  On Thu, 2012-10-04 at 14:41 +0200, Avi Kivity wrote:
 
  Again the numbers are ridiculously high for arch_local_irq_restore.
  Maybe there's a bad perf/kvm interaction when we're injecting an
  interrupt, I can't believe we're spending 84% of the time running the
  popf instruction.
 
  Smells like a software fallback that doesn't do NMI, hrtimer based
  sampling typically hits popf where we re-enable interrupts.
 
  Good nose, that's probably it.  Raghavendra, can you ensure that the PMU
  is properly exposed?  'dmesg' in the guest will tell.  If it isn't, -cpu
  host will expose it (and a good idea anyway to get best performance).
 
 
  Hi Avi, you are right. SandyBridge machine result was not proper.
  I cleaned up the services, enabled PMU, re-ran all the test again.
 
  Here is the summary:
  We do get good benefit by increasing ple window. Though we don't
  see good benefit for kernbench and sysbench, for ebizzy, we get huge
  improvement for 1x scenario. (almost 2/3rd of ple disabled case).
 
  Let me know if you think we can increase the default ple_window
  itself to 16k.
 
  I am experimenting with V2 version of undercommit improvement(this) patch
  series, But I think if you wish  to go for increase of
  default ple_window, then we would have to measure the benefit of patches
  when ple_window = 16k.
 
  I can respin the whole series including this default ple_window change.
 
  I also have the perf kvm top result for both ebizzy and kernbench.
  I think they are in expected lines now.
 
  Improvements
  
 
  16 core PLE machine with 16 vcpu guest
 
  base = 3.6.0-rc5 + ple handler optimization patches
  base_pleopt_16k = base + ple_window = 16k
  base_pleopt_32k = base + ple_window = 32k
  base_pleopt_nople = base + ple_gap = 0
  kernbench, hackbench, sysbench (time in sec lower is better)
  ebizzy (rec/sec higher is better)
 
  % improvements w.r.t base (ple_window = 4k)
  ---+---+-+---+
  |base_pleopt_16k| base_pleopt_32k | base_pleopt_nople |
  ---+---+-+---+
  kernbench_1x   |  0.42371  |  1.15164|   0.09320 |
  kernbench_2x   | -1.40981  | -17.48282   |  -570.77053   |
  ---+---+-+---+
  sysbench_1x| -0.92367  | 0.24241 | -0.27027  |
  sysbench_2x| -2.22706  |-0.30896 | -1.27573  |
  sysbench_3x| -0.75509  | 0.09444 | -2.97756  |
  ---+---+-+---+
  ebizzy_1x  | 54.99976  | 67.29460|  74.14076 |
  ebizzy_2x  | -8.83386  |-27.38403| -96.22066 |
  ---+---+-+---+
 
  perf kvm top observation for kernbench and ebizzy (nople, 4k, 32k window)
  
 
  Is the perf data for 1x overcommit?
 
 Yes, 16vcpu guest on 16 core
 
 
  pleopt   ple_gap=0
  
  ebizzy : 18131 records/s
  63.78%  [guest.kernel]  [g] _raw_spin_lock_irqsave
   5.65%  [guest.kernel]  [g] smp_call_function_many
   3.12%  [guest.kernel]  [g] clear_page
   3.02%  [guest.kernel]  [g] down_read_trylock
   1.85%  [guest.kernel]  [g] async_page_fault
   1.81%  [guest.kernel]  [g] up_read
   1.76%  [guest.kernel]  [g] native_apic_mem_write
   1.70%  [guest.kernel]  [g] find_vma
 
  Does 'perf kvm top' not give host samples at the same time?  Would be
  nice to see the host overhead as a function of varying ple window.  I
  would expect that to be the major difference between 4/16/32k window
  sizes.
 
 No, I did something like this
 perf kvm  --guestvmlinux ./vmlinux.guest top -g  -U -d 3. Yes that is a
 good idea.
 
 (I am getting some segfaults with perf top, I think it is already fixed
 but yet to see the patch that fixes)
 
 
 
 
  A big concern I have (if this is 1x overcommit) for ebizzy is that it
  has just terrible scalability to begin with.  I do not think we should
  try to optimize such a bad workload.
 
 
 I think my way of running dbench has some flaw, so I went to ebizzy.
 Could you let me know how you generally run dbench?

I mount a tmpfs and then specify that mount for dbench to run on.  This
eliminates all IO.  I use a 300 second run time and number of threads is
equal to number of vcpus.  All of the VMs of course need to have a
synchronized start.

I would also make sure you are using a recent kernel for dbench, where
the dcache scalability is much improved.  Without any 

Guest status - Windows 8 developer preview

2012-10-10 Thread Josh Heidenreich
Hey,

I just installed Windows 8 developer preview on our libvirt/kvm setup
here at work, for IE10 testing.

Host:
 - Debian 6.0.4
 - 2.6.32-5-amd64
 - AMD FX(tm)-6100 Six-Core Processor (3.3 Ghz)
 - 4 gig ram

Guest is set up to use 1 gig of ram, it's the 32-bit version installed.

Everything appears to work correctly. We access it using VNC which has
been working fine. Performance is acceptable, although we might think of
bumping up the allocated guest ram. CPU usage appears a bit on the high
side, I'm monitoring the situation.

Cheers,
Josh


-- 
leet programming master
www.karmabunny.com.au 

talk // 08 7120 7100
post // po box 188, stepney sa 5069
street // the church, 6 george street, stepney sa 5069



--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH] Enabling IA32_TSC_ADJUST for guest VM

2012-10-10 Thread Auld, Will
Marcelo,

You are suggesting that I:
Kvm_host.h:

Struct kvm_x86_ops {

...
change
Int (*set_msr)(struct kvm_vcpu * vcpu, u32 mrs_index, u64 data);
to
Int (*set_msr)(struct kvm_vcpu * vcpu, u32 mrs_index, u64 data, bool 
from_guest);
...
};

and so on down the line to set_msr_common(), kvm_write_tsc()... in a separate 
patch before other related patches?

As far as the initialization after live migration, I will provide some output 
with explanation once I am able to again. At the moment, I have hosed my system 
and need to figure out what's wrong and fix it first. 

Thanks,

Will

-Original Message-
From: Marcelo Tosatti [mailto:mtosa...@redhat.com] 
Sent: Wednesday, October 10, 2012 5:53 AM
To: Auld, Will
Cc: Avi Kivity; kvm@vger.kernel.org; Zhang, Xiantao; Liu, Jinsong
Subject: Re: [PATCH] Enabling IA32_TSC_ADJUST for guest VM

On Tue, Oct 09, 2012 at 04:10:30PM +, Auld, Will wrote:
 I am just testing the second version of this patch. It addresses all the 
 comments so far except Marcelo's issue with breaking the function 
 compute_guest_tsc(). 

Lets try to merge the missing patch from Zachary first (that'll make it clear).

 
 I needed to put the call for updating the TSC_ADJUST_MSR in kvm_write_tsc() 
 to ensure it is only called from user space. Other changes added to vmcs 
 offset should not be tracked in TSC_ADJUST_MSR. 

Please have a separate, earlier patch making that explicit (by passing a bool 
to kvm_x86_ops-set_msr then to kvm_set_msr_common). that = whether msr write 
is guest initiated or not.

 I had some trouble with the order of initialization during live migration. 
 TSC_ADJUST is initialized first but then wiped out by multiple 
 initializations of tsc. The fix for this is to not update TSC_ADJUST if the 
 vmcs offset is not actually changing with the tsc write. So, after migration 
 outcome is that vmcs offset gets defined independent from the migrating value 
 of TSC_ADJUST. I believe this is what we want to happen.

Can you please be more explicit regarding wiped out by multiple 
initializations of tsc ? 

It is probably best to maintain TSC_ADJUST separately, in software, and then 
calculate TSC_OFFSET.

 Thanks,
 
 Will
 
 -Original Message-
 From: Avi Kivity [mailto:a...@redhat.com]
 Sent: Tuesday, October 09, 2012 5:12 AM
 To: Marcelo Tosatti
 Cc: Auld, Will; kvm@vger.kernel.org; Zhang, Xiantao
 Subject: Re: [PATCH] Enabling IA32_TSC_ADJUST for guest VM
 
 On 10/08/2012 07:30 PM, Marcelo Tosatti wrote:
  
  From Intel's manual:
  
  • If an execution of WRMSR to the IA32_TIME_STAMP_COUNTER MSR adds 
  (or
  subtracts) value X from the TSC,
  the logical processor also adds (or subtracts) value X from the 
  IA32_TSC_ADJUST MSR.
  
  This is not handled in the patch. 
  
  To support migration, it will be necessary to differentiate between 
  guest initiated and userspace-model initiated msr write. That is, 
  only guest initiated TSC writes should affect the value of 
  IA32_TSC_ADJUST MSR.
  
  Avi, any better idea?
  
 
 I think we need that anyway, since there are some read-only MSRs that need to 
 be configured by the host (nvmx capabilities).  So if we add that feature it 
 will be useful elsewhere.  I don't think it's possible to do it in any other 
 way:
 
 Local offset value of the IA32_TSC for a logical processor. Reset value is 
 Zero. A write to IA32_TSC will modify the local offset in IA32_TSC_ADJUST and 
 the content of IA32_TSC, but does not affect the internal invariant TSC 
 hardware.
 
 What we want to do is affect the internal invariant TSC hardware, so we can't 
 do that through the normal means.
 
 btw, will tsc writes from userspace (after live migration) cause tsc skew?  
 If so we should think how to model a guest-wide tsc.
 
 --
 error compiling committee.c: too many arguments to function 
 N?r??yb?X??ǧv?^?)޺{.n?+h????ܨ}???Ơz?j:+v??? 
 zZ+??+zf???h???~i???z??w??)ߢf


Re: [Qemu-devel] Using PCI config space to indicate config location

2012-10-10 Thread Rusty Russell
Rusty Russell ru...@rustcorp.com.au writes:
 Jamie Lokier ja...@shareable.org writes:

 Rusty Russell wrote:
 I don't think it'll be that bad; reset clears the device to unknown,
 bar0 moves it from unknown-legacy mode, bar1/2/3 changes it from
 unknown-modern mode, and anything else is bad (I prefer being strict so
 we catch bad implementations from the beginning).

 Will that work, if the guest with kernel that uses modern mode, kexecs
 to an older (but presumed reliable) kernel that only knows about legacy mode?

 I.e. will the replacement kernel, or (ideally) replacement driver on
 the rare occasion that is needed on a running kernel, be able to reset
 the device hard enough?

 Well, you need to reset the device, so yes.

MST said something which made me think harder about this case.

Either there needs to be a way to tell what mode the device is in, or
legacy reset has to work, even in modern mode.  The latter is
preferable, since it allows an older kernel to do the reset.

Now, since qemu would almost certainly have to add code to stop that
working, it'll probably be fine.  But I'd really like to add a strict
mode to qemu virtio which does extra sanity checking for driver
authors, and that might break this.  That's OK.

Thanks,
Rusty.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] Using PCI config space to indicate config location

2012-10-10 Thread Rusty Russell
Michael S. Tsirkin m...@redhat.com writes:
 On Wed, Oct 10, 2012 at 08:36:12AM -0500, Anthony Liguori wrote:
 Rusty Russell ru...@rustcorp.com.au writes:
 
  Gerd Hoffmann kra...@redhat.com writes:
  So how about this:
 
  (1) Add a vendor specific pci capability for new-style virtio.
  Specifies the pci bar used for new-style virtio registers.
  Guests can use it to figure whenever new-style virtio is
  supported and to map the correct bar (which will probably
  be bar 1 in most cases).
 
  This was closer to the original proposal[1], which I really liked (you
  can layout bars however you want).  Anthony thought that vendor
  capabilities were a PCI-e feature, but it seems they're blessed in PCI
  2.3.
 
 2.3 was standardized in 2002.  Are we confident that vendor extensions
 play nice with pre-2.3 OSes like Win2k, WinXP, etc?

2.2 (1998) had the capability IDs linked list, indicated by bit 4 in the
status register, but reserved ids 7 and above.  ID 9 (vendor specific)
was added in 2.3; it should be ignored, but will require testing of
course, like any change.

2.1 didn't have the capability ID linked list at all; bit 4 in the
status register was reserved.

QEMU's pci.c has capability support, and sets the capabiliy status bit
and list pointer when the driver requests (eg. the eepro100).

 Pre 2.3 it was the case that *all* space outside
 the capability linked list, and any capability not
 recognized by space, was considered vendor specific.
 So there's no problem really.

Oh in theory, sure.  In practice, almost certainly.  But this needs to
be proved with actual testing.

Cheers,
Rusty.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/3] virtio-net: inline header support

2012-10-10 Thread Rusty Russell
Paolo Bonzini pbonz...@redhat.com writes:
 Il 09/10/2012 06:59, Rusty Russell ha scritto:
 Paolo Bonzini pbonz...@redhat.com writes:
 Il 05/10/2012 07:43, Rusty Russell ha scritto:
 That's good.  But virtio_blk's scsi command is insoluble AFAICT.  As I
 said to Anthony, the best rules are always and never, so I'd really
 rather not have to grandfather that in.

 It is, but we can add a rule that if the (transport) flag
 VIRTIO_RING_F_ANY_HEADER_SG is set, the cdb field is always 32 bytes in
 virtio-blk.
 
 Could we do that?  It's the cmd length I'm concerned about; is it always
 32 in practice for some reason?

 It is always 32 or less except in very obscure cases that are pretty
 much confined to iSCSI.  We don't care about the obscure cases, and the
 extra bytes don't hurt.

 BTW, 32 is the default cdb_size used by virtio-scsi.

 Currently qemu does:
 
 struct sg_io_hdr hdr;
 memset(hdr, 0, sizeof(struct sg_io_hdr));
 hdr.interface_id = 'S';
 hdr.cmd_len = req-elem.out_sg[1].iov_len;
 hdr.cmdp = req-elem.out_sg[1].iov_base;
 hdr.dxfer_len = 0;
 
 If it's a command which expects more output data, there's no way to
 guess where the boundary is between that command and the data.

 Yep, so I understood the problem right.

OK.  Well, Anthony wants qemu to be robust in this regard, so I am
tempted to rework all the qemu drivers to handle arbitrary layouts.
They could use a good audit anyway.

This would become a glaring exception, but I'm tempted to fix it to 32
bytes at the same time as we get the new pci layout (ie. for the virtio
1.0 spec).  The Linux driver would carefully be backwards compatible, of
course, and the spec would document why.

Cheers,
Rusty.

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 6/7] spapr: Pass PowerPCCPU to spapr_hypercall()

2012-10-10 Thread Andreas Färber
Needed for changing the hypercall handlers' argument type to PowerPCCPU.

Signed-off-by: Andreas Färber afaer...@suse.de
---
 hw/spapr.c   |2 +-
 hw/spapr.h   |2 +-
 hw/spapr_hcall.c |4 +++-
 target-ppc/kvm.c |3 ++-
 4 Dateien geändert, 7 Zeilen hinzugefügt(+), 4 Zeilen entfernt(-)

diff --git a/hw/spapr.c b/hw/spapr.c
index 914d60c..ac1f7de 100644
--- a/hw/spapr.c
+++ b/hw/spapr.c
@@ -578,7 +578,7 @@ static void emulate_spapr_hypercall(PowerPCCPU *cpu)
 hcall_dprintf(Hypercall made with MSR[PR]=1\n);
 env-gpr[3] = H_PRIVILEGE;
 } else {
-env-gpr[3] = spapr_hypercall(env, env-gpr[3], env-gpr[4]);
+env-gpr[3] = spapr_hypercall(cpu, env-gpr[3], env-gpr[4]);
 }
 }
 
diff --git a/hw/spapr.h b/hw/spapr.h
index e984e3f..feea401 100644
--- a/hw/spapr.h
+++ b/hw/spapr.h
@@ -288,7 +288,7 @@ typedef target_ulong (*spapr_hcall_fn)(CPUPPCState *env, 
sPAPREnvironment *spapr
target_ulong *args);
 
 void spapr_register_hypercall(target_ulong opcode, spapr_hcall_fn fn);
-target_ulong spapr_hypercall(CPUPPCState *env, target_ulong opcode,
+target_ulong spapr_hypercall(PowerPCCPU *cpu, target_ulong opcode,
  target_ulong *args);
 
 int spapr_allocate_irq(int hint, bool lsi);
diff --git a/hw/spapr_hcall.c b/hw/spapr_hcall.c
index 194d9c2..1594f30 100644
--- a/hw/spapr_hcall.c
+++ b/hw/spapr_hcall.c
@@ -678,9 +678,11 @@ void spapr_register_hypercall(target_ulong opcode, 
spapr_hcall_fn fn)
 *slot = fn;
 }
 
-target_ulong spapr_hypercall(CPUPPCState *env, target_ulong opcode,
+target_ulong spapr_hypercall(PowerPCCPU *cpu, target_ulong opcode,
  target_ulong *args)
 {
+CPUPPCState *env = cpu-env;
+
 if ((opcode = MAX_HCALL_OPCODE)
  ((opcode  0x3) == 0)) {
 spapr_hcall_fn fn = papr_hypercall_table[opcode / 4];
diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
index d7d8e8f..9d381ce 100644
--- a/target-ppc/kvm.c
+++ b/target-ppc/kvm.c
@@ -818,7 +818,8 @@ int kvm_arch_handle_exit(CPUPPCState *env, struct kvm_run 
*run)
 #ifdef CONFIG_PSERIES
 case KVM_EXIT_PAPR_HCALL:
 dprintf(handle PAPR hypercall\n);
-run-papr_hcall.ret = spapr_hypercall(env, run-papr_hcall.nr,
+run-papr_hcall.ret = spapr_hypercall(ppc_env_get_cpu(env),
+  run-papr_hcall.nr,
   run-papr_hcall.args);
 ret = 0;
 break;
-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 3/7] target-ppc: Rename kvm_kick_{env = cpu} and pass PowerPCCPU

2012-10-10 Thread Andreas Färber
Needed for changing qemu_cpu_kick() argument type to CPUState.

Signed-off-by: Andreas Färber afaer...@suse.de
---
 target-ppc/kvm.c |8 ++--
 1 Datei geändert, 6 Zeilen hinzugefügt(+), 2 Zeilen entfernt(-)

diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
index 5cbe98a..d7d8e8f 100644
--- a/target-ppc/kvm.c
+++ b/target-ppc/kvm.c
@@ -73,8 +73,11 @@ static int cap_hior;
  */
 static QEMUTimer *idle_timer;
 
-static void kvm_kick_env(void *env)
+static void kvm_kick_cpu(void *opaque)
 {
+PowerPCCPU *cpu = opaque;
+CPUPPCState *env = cpu-env;
+
 qemu_cpu_kick(env);
 }
 
@@ -375,6 +378,7 @@ static inline void kvm_fixup_page_sizes(CPUPPCState *env)
 
 int kvm_arch_init_vcpu(CPUPPCState *cenv)
 {
+PowerPCCPU *cpu = ppc_env_get_cpu(cenv);
 int ret;
 
 /* Gather server mmu info from KVM and update the CPU state */
@@ -386,7 +390,7 @@ int kvm_arch_init_vcpu(CPUPPCState *cenv)
 return ret;
 }
 
-idle_timer = qemu_new_timer_ns(vm_clock, kvm_kick_env, cenv);
+idle_timer = qemu_new_timer_ns(vm_clock, kvm_kick_cpu, cpu);
 
 /* Some targets support access to KVM's guest TLB. */
 switch (cenv-mmu_model) {
-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Using PCI config space to indicate config location

2012-10-10 Thread Rusty Russell
Michael S. Tsirkin m...@redhat.com writes:
 On Mon, Oct 08, 2012 at 12:51:25PM +1030, Rusty Russell wrote:
 Note before anyone gets confused; we were talking about using the PCI
 config space to indicate what BAR(s) the virtio stuff is in.  An
 alternative would be to simply specify a new layout format in BAR1.

 One problem we are still left with is this: device specific
 config accesses are still non atomic.
 This is a problem for multibyte fields such as MAC address
 where MAC could change while we are accessing it.

It's also a problem for related fields, eg. console width and height, or
disk geometry.

 I was thinking about some backwards compatible way to solve this, but if
 we are willing to break compatiblity or use some mode switch, how about
 we give up on virtio config space completely, and do everything besides
 IO and ISR through guest memory?

I think there's still a benefit in the simple publishing of information:
I don't really want to add a control queue for the console.  But
inevitably, once-static information can change in later versions, and
it's horrible to have config information plus a bit that says don't use
this, use the control queue.

Here's a table from a quick audit:

Driver  Config   Device changesDriver writes... after init?
net YY NN
block   YY YY
console YY NN
rng NN NN
balloon YY YY
scsiYN YN
9p  YN NN

For config space reads, I suggest the driver publish a generation count.
For writes, the standard seems to be a commit latch.  We could abuse the
generation count for this: the driver writes to it to commit config
changes.

ie:
/* Fields in VIRTIO_PCI_CAP_COMMON_CFG: */
struct virtio_pci_common_cfg {
/* About the whole device. */
__le32 device_feature_select;   /* read-write */
__le32 device_feature;  /* read-only */
__le32 guest_feature_select;/* read-write */
__le32 guest_feature;   /* read-only */
__le32 config_gen_and_latch;/* read-write */
__le16 msix_config; /* read-write */
__u8 device_status; /* read-write */
__u8 unused;

/* About a specific virtqueue. */
__le16 queue_select;/* read-write */
__le16 queue_align; /* read-write, power of 2. */
__le16 queue_size;  /* read-write, power of 2. */
__le16 queue_msix_vector;/* read-write */
__le64 queue_address;   /* read-write: 0x == DNE. */
};

Thoughts?
Rusty.
PS.  Let's make all the virtio-device config LE, too...
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [kvmarm] [PATCH v2 12/14] KVM: ARM: VFP userspace interface

2012-10-10 Thread Rusty Russell
Christoffer Dall c.d...@virtualopensystems.com writes:
 On Tue, Oct 9, 2012 at 2:11 PM, Peter Maydell peter.mayd...@linaro.org 
 wrote:
 On 1 October 2012 10:11, Christoffer Dall c.d...@virtualopensystems.com 
 wrote:
 From: Rusty Russell rusty.russ...@linaro.org
...
 +ARM 32-bit VFP control registers have the following id bit patterns:
 +  0x4002  0012 1 selector:3
 +
 +ARM 64-bit FP registers have the following id bit patterns:
 +  0x4002  0012 0 selector:3
 +

 In the other entries in this section, foo:3 indicates a 3 bit field.
 But I think for these two it's trying to indicate a 3 byte field.
...
 right you are, I'll modify this.

Thanks!
Rusty.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] vhost-blk: Add vhost-blk support v3

2012-10-10 Thread Asias He
vhost-blk is an in-kernel virito-blk device accelerator.

Due to lack of proper in-kernel AIO interface, this version converts
guest's I/O request to bio and use submit_bio() to submit I/O directly.
So this version any supports raw block device as guest's disk image,
e.g. /dev/sda, /dev/ram0. We can add file based image support to
vhost-blk once we have in-kernel AIO interface. There are some work in
progress for in-kernel AIO interface from Dave Kleikamp and Zach Brown:

   http://marc.info/?l=linux-fsdevelm=133312234313122

Performance evaluation:
-
1) LKVM
Fio with libaio ioengine on Fusion IO device using kvm tool
IOPS   Before   After   Improvement
seq-read   107  121 +13.0%
seq-write  130  179 +37.6%
rnd-read   102  122 +19.6%
rnd-write  125  159 +27.0%

2) QEMU
Fio with libaio ioengine on Fusion IO device using QEMU
IOPS   Before   After   Improvement
seq-read   76   123 +61.8%
seq-write  139  173 +24.4%
rnd-read   73   120 +64.3%
rnd-write  75   156 +108.0%

Userspace bits:
-
1) LKVM
The latest vhost-blk userspace bits for kvm tool can be found here:
g...@github.com:asias/linux-kvm.git blk.vhost-blk

2) QEMU
The latest vhost-blk userspace prototype for QEMU can be found here:
g...@github.com:asias/qemu.git blk.vhost-blk

Changes in V3:
- Sending REQ_FLUSH bio instead of vfs_fsync, thanks Christoph!
- Check file passed by user is a raw block device file

Signed-off-by: Asias He as...@redhat.com
---
 drivers/vhost/Kconfig |   1 +
 drivers/vhost/Kconfig.blk |  10 +
 drivers/vhost/Makefile|   2 +
 drivers/vhost/blk.c   | 669 ++
 drivers/vhost/blk.h   |   8 +
 5 files changed, 690 insertions(+)
 create mode 100644 drivers/vhost/Kconfig.blk
 create mode 100644 drivers/vhost/blk.c
 create mode 100644 drivers/vhost/blk.h

diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index 202bba6..acd8038 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -11,4 +11,5 @@ config VHOST_NET
 
 if STAGING
 source drivers/vhost/Kconfig.tcm
+source drivers/vhost/Kconfig.blk
 endif
diff --git a/drivers/vhost/Kconfig.blk b/drivers/vhost/Kconfig.blk
new file mode 100644
index 000..ff8ab76
--- /dev/null
+++ b/drivers/vhost/Kconfig.blk
@@ -0,0 +1,10 @@
+config VHOST_BLK
+   tristate Host kernel accelerator for virtio blk (EXPERIMENTAL)
+   depends on BLOCK   EXPERIMENTAL  m
+   ---help---
+ This kernel module can be loaded in host kernel to accelerate
+ guest block with virtio_blk. Not to be confused with virtio_blk
+ module itself which needs to be loaded in guest kernel.
+
+ To compile this driver as a module, choose M here: the module will
+ be called vhost_blk.
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index a27b053..1a8a4a5 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -2,3 +2,5 @@ obj-$(CONFIG_VHOST_NET) += vhost_net.o
 vhost_net-y := vhost.o net.o
 
 obj-$(CONFIG_TCM_VHOST) += tcm_vhost.o
+obj-$(CONFIG_VHOST_BLK) += vhost_blk.o
+vhost_blk-y := blk.o
diff --git a/drivers/vhost/blk.c b/drivers/vhost/blk.c
new file mode 100644
index 000..4a4e793
--- /dev/null
+++ b/drivers/vhost/blk.c
@@ -0,0 +1,669 @@
+/*
+ * Copyright (C) 2011 Taobao, Inc.
+ * Author: Liu Yuan tailai...@taobao.com
+ *
+ * Copyright (C) 2012 Red Hat, Inc.
+ * Author: Asias He as...@redhat.com
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ * virtio-blk server in host kernel.
+ */
+
+#include linux/miscdevice.h
+#include linux/module.h
+#include linux/vhost.h
+#include linux/virtio_blk.h
+#include linux/mutex.h
+#include linux/file.h
+#include linux/kthread.h
+#include linux/blkdev.h
+
+#include vhost.c
+#include vhost.h
+#include blk.h
+
+#define BLK_HDR0
+
+static DEFINE_IDA(vhost_blk_index_ida);
+
+enum {
+   VHOST_BLK_VQ_REQ = 0,
+   VHOST_BLK_VQ_MAX = 1,
+};
+
+struct req_page_list {
+   struct page **pages;
+   int pages_nr;
+};
+
+struct vhost_blk_req {
+   struct llist_node llnode;
+   struct req_page_list *pl;
+   struct vhost_blk *blk;
+
+   struct iovec *iov;
+   int iov_nr;
+
+   struct bio **bio;
+   atomic_t bio_nr;
+
+   sector_t sector;
+   int write;
+   u16 head;
+   long len;
+
+   u8 *status;
+};
+
+struct vhost_blk {
+   struct task_struct *host_kick;
+   struct vhost_blk_req *reqs;
+   struct vhost_virtqueue vq;
+   struct llist_head llhead;
+   struct vhost_dev dev;
+   u16 reqs_nr;
+   int index;
+};
+
+static inline int iov_num_pages(struct iovec *iov)
+{
+   return (PAGE_ALIGN((unsigned long)iov-iov_base + iov-iov_len) -
+  ((unsigned long)iov-iov_base  PAGE_MASK))  PAGE_SHIFT;
+}
+
+static int vhost_blk_setup(struct vhost_blk *blk)
+{
+  

Using taskset , powertop and perf with qemu-kvm

2012-10-10 Thread Jaspal

Hello ,

I wish to run some cpu intensive tasks on a vm , use only one core 
specifically #0 of the 6 available ( Xeon 3670 ) and all the while 
record basic perf events like cpu-cycles , instructions etc.


I am using taskset to define the cpu-affinity of the qemu process :
taskset -p 0x1 `pgrep qemu` (assuming only 1 vm is running )

I can verify that the vm is now running only on core 0 using top. top 
also shows that the qemu process is using 99% of it's share of CPU time 
( %CPU column in top ) upon running the tasks in vm. However what I fail 
to see is the increase in time spent of the core 0 in the C0 ( active ) 
state via powertop. It barely manages to cross 12%. If I run the same 
tasks ( FFT benchmark ) on the host on a different core ( say #1 ) and 
monitor it using powertop again , I can see that  C0 % for core 1 is 
above 100% ( 104 ~ 107 )
So why does not the output of top ( qemu is using 99% of it's cpu time , 
which is spent on core 0 )  co-relate with powertop ( which says core 0 
is idle for around 88% of time ) ?
Also , powertop reports that core 2 is being used heavily ( 104 % ) and 
I suspect this is one which is executing the vm operations. But why ?

There are no other cpu/memory intensive tasks running on the host.

Next , I am using the perf kvm record function to save the stats. If I 
use all the 7 core counters available along-with -a option , perf 
generates ~2MB/second . Is there a way to save just the raw count of the 
events ?
Also , if instead of -a , if I give -C 0 , perf just saves around 
0.016MB/s as opposed to 1.894MB/s in case of -a option. If the vm is 
indeed using core 0 , then perf should have recorded the events.


On a last note , does perf kvm automatically selects the vm to profile ( 
in case multiple vms are running ) on the basis of the guest.kallsyms 
and guest.modules files passed to it ?


Thanks ,
Jaspal
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Using PCI config space to indicate config location

2012-10-10 Thread Michael S. Tsirkin
On Mon, Oct 08, 2012 at 12:51:25PM +1030, Rusty Russell wrote:
 (Topic updated, cc's trimmed).
 
 Anthony Liguori aligu...@us.ibm.com writes:
  Rusty Russell ru...@rustcorp.com.au writes:
  4) The only significant change to the spec is that we use PCI
 capabilities, so we can have infinite feature bits.
 (see 
  http://lists.linuxfoundation.org/pipermail/virtualization/2011-December/019198.html)
 
  We discussed this on IRC last night.  I don't think PCI capabilites are
  a good mechanism to use...
 
  PCI capabilities are there to organize how the PCI config space is
  allocated to allow vendor extensions to co-exist with future PCI
  extensions.
 
  But we've never used the PCI config space within virtio-pci.  We do
  everything in BAR0.  I don't think there's any real advantage of using
  the config space vs. a BAR for virtio-pci.
 
 Note before anyone gets confused; we were talking about using the PCI
 config space to indicate what BAR(s) the virtio stuff is in.  An
 alternative would be to simply specify a new layout format in BAR1.

One problem we are still left with is this: device specific
config accesses are still non atomic.
This is a problem for multibyte fields such as MAC address
where MAC could change while we are accessing it.

I was thinking about some backwards compatible way to solve this, but if
we are willing to break compatiblity or use some mode switch, how about
we give up on virtio config space completely, and do everything besides
IO and ISR through guest memory?

-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] Using PCI config space to indicate config location

2012-10-10 Thread Michael S. Tsirkin
On Tue, Oct 09, 2012 at 10:26:12AM +1030, Rusty Russell wrote:
 Anthony Liguori aligu...@us.ibm.com writes:
  Gerd Hoffmann kra...@redhat.com writes:
 
Hi,
 
  So we could have for virtio something like this:
 
  Capabilities: [??] virtio-regs:
  legacy: BAR=0 offset=0
  virtio-pci: BAR=1 offset=1000
  virtio-cfg: BAR=1 offset=1800
  
  This would be a vendor specific PCI capability so lspci wouldn't
  automatically know how to parse it.
 
  Sure, would need a patch to actually parse+print the cap,
  /me was just trying to make my point clear in a simple way.
 
  2) ISTR an argument about mapping the ISR register separately, for
 performance, but I can't find a reference to it.
 
  I think the rationale is that ISR really needs to be PIO but everything
  else doesn't.  PIO is much faster on x86 because it doesn't require
  walking page tables or instruction emulation to handle the exit.
 
  Is this still a pressing issue?  With MSI-X enabled ISR isn't needed,
  correct?  Which would imply that pretty much only old guests without
  MSI-X support need this, and we don't need to worry that much when
  designing something new ...
  
  It wasn't that long ago that MSI-X wasn't supported..  I think we should
  continue to keep ISR as PIO as it is a fast path.
 
  No problem if we allow to have both legacy layout and new layout at the
  same time.  Guests can continue to use ISR @ BAR0 in PIO space for
  existing virtio devices, even in case they want use mmio for other
  registers - all fine.
 
  New virtio devices can support MSI-X from day one and decide to not
  expose a legacy layout PIO bar.
 
  I think having BAR1 be an MMIO mirror of the registers + a BAR2 for
  virtio configuration space is probably not that bad of a solution.
 
 Well, we also want to clean up the registers, so how about:
 
 BAR0: legacy, as is.  If you access this, don't use the others.
 BAR1: new format virtio-pci layout.  If you use this, don't use BAR0.
 BAR2: virtio-cfg.  If you use this, don't use BAR0.
 BAR3: ISR. If you use this, don't use BAR0.

One problem here is there are only 3 64-bit BARs under PCI.
There's no point to make IO BAR 64-bit on x86, but there might
be for memory.

 I prefer the cases exclusive (ie. use one or the other) as a clear path
 to remove the legacy layout; and leaving the ISR in BAR0 leaves us with
 an ugly corner case in future (ISR is BAR0 + 19?  WTF?).
 
 As to MMIO vs PIO, the BARs are self-describing, so we should explicitly
 endorse that and leave it to the devices.
 
 The detection is simple: if BAR1 has non-zero length, it's new-style,
 otherwise legacy.
 
 Thoughts?
 Rusty.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] Using PCI config space to indicate config location

2012-10-10 Thread Rusty Russell
Jamie Lokier ja...@shareable.org writes:

 Rusty Russell wrote:
 I don't think it'll be that bad; reset clears the device to unknown,
 bar0 moves it from unknown-legacy mode, bar1/2/3 changes it from
 unknown-modern mode, and anything else is bad (I prefer being strict so
 we catch bad implementations from the beginning).

 Will that work, if the guest with kernel that uses modern mode, kexecs
 to an older (but presumed reliable) kernel that only knows about legacy mode?

 I.e. will the replacement kernel, or (ideally) replacement driver on
 the rare occasion that is needed on a running kernel, be able to reset
 the device hard enough?

Well, you need to reset the device, so yes.

Cheers,
Rusty.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] Using PCI config space to indicate config location

2012-10-10 Thread Rusty Russell
Gerd Hoffmann kra...@redhat.com writes:
 So how about this:

 (1) Add a vendor specific pci capability for new-style virtio.
 Specifies the pci bar used for new-style virtio registers.
 Guests can use it to figure whenever new-style virtio is
 supported and to map the correct bar (which will probably
 be bar 1 in most cases).

This was closer to the original proposal[1], which I really liked (you
can layout bars however you want).  Anthony thought that vendor
capabilities were a PCI-e feature, but it seems they're blessed in PCI
2.3.

So let's return to that proposal, giving something like this:

/* IDs for different capabilities.  Must all exist. */
/* FIXME: Do we win from separating ISR, NOTIFY and COMMON? */
/* Common configuration */
#define VIRTIO_PCI_CAP_COMMON_CFG   1
/* Notifications */
#define VIRTIO_PCI_CAP_NOTIFY_CFG   2
/* ISR access */
#define VIRTIO_PCI_CAP_ISR_CFG  3
/* Device specific confiuration */
#define VIRTIO_PCI_CAP_DEVICE_CFG   4

/* This is the PCI capability header: */
struct virtio_pci_cap {
u8 cap_vndr;/* Generic PCI field: PCI_CAP_ID_VNDR */
u8 cap_next;/* Generic PCI field: next ptr. */
u8 cap_len; /* Generic PCI field: sizeof(struct virtio_pci_cap). */
u8 cfg_type;/* One of the VIRTIO_PCI_CAP_*_CFG. */
u8 bar; /* Where to find it. */
u8 unused;
__le16 offset;  /* Offset within bar. */
__le32 length;  /* Length. */
};

This means qemu can point the isr_cfg into the legacy area if it wants.
In fact, it can put everything in BAR0 if it wants.

Thoughts?
Rusty.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Proposal for virtio standardization.

2012-10-10 Thread Rusty Russell
Cornelia Huck cornelia.h...@de.ibm.com writes:
 On Thu, 27 Sep 2012 09:59:33 +0930
 Rusty Russell ru...@rustcorp.com.au wrote:
 3) Various clarifications, formalizations and cleanups to the spec text,
and possibly elimination of old deprecated features.
 
 4) The only significant change to the spec is that we use PCI
capabilities, so we can have infinite feature bits.
(see
 http://lists.linuxfoundation.org/pipermail/virtualization/2011-December/019198.html)

 Infinite only applies to virtio-pci, no?

Yes, you already have infinite feature bits for ccw, as does every bus
we did since PCI.

 Sounds like a good idea. I'll be happy to review the spec with an eye
 to virtio-ccw.

Thanks!
Rusty.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] Using PCI config space to indicate config location

2012-10-10 Thread Rusty Russell
Anthony Liguori aligu...@us.ibm.com writes:
 Rusty Russell ru...@rustcorp.com.au writes:

 Anthony Liguori aligu...@us.ibm.com writes:
 We'll never remove legacy so we shouldn't plan on it.  There are
 literally hundreds of thousands of VMs out there with the current virtio
 drivers installed in them.  We'll be supporting them for a very, very
 long time :-)

 You will be supporting this for qemu on x86, sure.

 And PPC.

Yeah, Ben was a bit early fixing the virtio bugs I guess.  Oh well.

 As I think we're
 still in the growth phase for virtio, I prioritize future spec
 cleanliness pretty high.

 But I think you'll be surprised how fast this is deprecated:
 1) Bigger queues for block devices (guest-specified ringsize)
 2) Smaller rings for openbios (guest-specified alignment)
 3) All-mmio mode (powerpc)
 4) Whatever network features get numbers  31.

 We can do all of these things with incremental change to the existing
 layout.  That's the only way what I'm suggesting is different.

Now extend your proposal to add all those features we want.  We add
if()s to the driver that we need to keep forever.  Let's just look at
what your proposal get us:

bool has_feature(struct virtio_pci_dev *dev, u32 num)
{
/* Do we need to set selector? */
if (dev-extended_features) {
unsigned long selector_off = VIRTIO_PCI_HOST_FEATURES_SELECT;
if (dev-using_bar0  !dev-msi_active)
selector_off -= 32;
writel(dev-config + selector_off, num / 32);
num = 31;
} else if (num  31)
return false;
return readl(dev-config + VIRTIO_PCI_HOST_FEATURES)  (1  num);
}

vs two cases which independent with methods set at detection time:

virtio_pci_legacy.c:
bool has_feature(struct virtio_pci_dev *dev, u32 num)
{
if (num  31)
return false;
return readl(dev-config + VIRTIO_PCI_LEGACY_HOST_FEATURES)  (1  
num);
}

virtio_pci_new.c:
bool has_feature(struct virtio_pci_dev *dev, u32 num)
{
writel(dev-config + VIRTIO_PCI_HOST_FEATURES_SELECT, num / 32);
num = 31;
return readl(dev-config + VIRTIO_PCI_HOST_FEATURES)  (1  num);
}

 You want to reorder all of the fields and have a driver flag day.  But I
 strongly suspect we'll decide we need to do the same exercise again in 4
 years when we now need to figure out how to take advantage of
 transactional memory or some other whiz-bang hardware feature.

It's not a flag day, as it's backwards compatible.

Sure, we might have a clean cut again.  I'm not afraid to rewrite this
code to make it cleaner; perhaps this is somewhere qemu could benefit
from a similar approach.

 What I'm suggesting is:

 struct {
 __le32 host_features;   /* read-only */
 __le32 guest_features;  /* read/write */
 __le32 queue_pfn;   /* read/write */
 __le16 queue_size;  /* read-only */
 __le16 queue_sel;   /* read/write */
 __le16 queue_notify;/* read/write */
 u8 status;  /* read/write */
 u8 isr; /* read-only, clear on read */
 __le16 msi_config_vector;   /* read/write */
 __le16 msi_queue_vector;/* read/write */
 __le32 host_feature_select; /* read/write */
 __le32 guest_feature_select;/* read/write */
 __le32 queue_pfn_hi;/* read/write */
 };

 With the additional semantic that the virtio-config space is overlayed
 on top of the register set in BAR0 unless the
 VIRTIO_PCI_F_SEPARATE_CONFIG feature is acknowledged.  This feature
 acts as a latch and when set, removes the config space overlay.

 If the config space overlays the registers, the offset in BAR0 of the
 overlay depends on whether MSI is enabled or not in the PCI device.

 BAR1 is an MMIO mirror of BAR0 except that the config space is never
 overlayed in BAR1 regardless of VIRTIO_PCI_F_SEPARATE_CONFIG.

Creating a horrible mess for new code in order to support old code.
But I have to maintain the new code, and this is horrible.

 You mean, accesses to bar1/2/3 change it to modern mode.  That's a
 pretty big side effect of a read.

They don't need to, but I'd prefer they did to keep implementations
from mixing old and new.

 See above.  A guest could happily just use BAR1/BAR2 and completely
 ignore BAR0 provided that BAR1/BAR2 are present.

But x86 guests want BAR0 because IO space is faster than MMIO.  Right?
So, not happily.

 It also means the guest drivers don't need separate code paths for
 legacy vs. modern.  They can switch between the two by just changing
 a single pointer.

This is wrong.  Of course they have separate code paths, but now you've
got lots of crossover, rather than two distinct ones.

 The implementation ends up being pretty trivial too.  We can use the
 same MemoryRegion in QEMU for both PIO and MMIO.  The kernel code stays
 the same.  It just takes a couple if()s to handle 

Re: Using taskset , powertop and perf with qemu-kvm

2012-10-10 Thread Gleb Natapov
On Wed, Oct 10, 2012 at 01:22:22PM +0530, Jaspal wrote:
 Hello ,
 
 I wish to run some cpu intensive tasks on a vm , use only one core
 specifically #0 of the 6 available ( Xeon 3670 ) and all the while
 record basic perf events like cpu-cycles , instructions etc.
 
 I am using taskset to define the cpu-affinity of the qemu process :
 taskset -p 0x1 `pgrep qemu` (assuming only 1 vm is running )
 
pgrep tells you only the pid of the io thread. vcpu runs in a separate
thread.

 I can verify that the vm is now running only on core 0 using top.
 top also shows that the qemu process is using 99% of it's share of
 CPU time ( %CPU column in top ) upon running the tasks in vm.
 However what I fail to see is the increase in time spent of the core
 0 in the C0 ( active ) state via powertop. It barely manages to
 cross 12%. If I run the same tasks ( FFT benchmark ) on the host on
 a different core ( say #1 ) and monitor it using powertop again , I
 can see that  C0 % for core 1 is above 100% ( 104 ~ 107 )
 So why does not the output of top ( qemu is using 99% of it's cpu
 time , which is spent on core 0 )  co-relate with powertop ( which
 says core 0 is idle for around 88% of time ) ?
 Also , powertop reports that core 2 is being used heavily ( 104 % )
 and I suspect this is one which is executing the vm operations. But
 why ?
 There are no other cpu/memory intensive tasks running on the host.
 
 Next , I am using the perf kvm record function to save the stats. If
 I use all the 7 core counters available along-with -a option , perf
 generates ~2MB/second . Is there a way to save just the raw count of
 the events ?
 Also , if instead of -a , if I give -C 0 , perf just saves around
 0.016MB/s as opposed to 1.894MB/s in case of -a option. If the vm is
 indeed using core 0 , then perf should have recorded the events.
 
 On a last note , does perf kvm automatically selects the vm to
 profile ( in case multiple vms are running ) on the basis of the
 guest.kallsyms and guest.modules files passed to it ?
 
 Thanks ,
 Jaspal
 --
 To unsubscribe from this list: send the line unsubscribe kvm in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] Using PCI config space to indicate config location

2012-10-10 Thread Michael S. Tsirkin
On Wed, Oct 10, 2012 at 02:14:11PM +1030, Rusty Russell wrote:
  See above.  A guest could happily just use BAR1/BAR2 and completely
  ignore BAR0 provided that BAR1/BAR2 are present.
 
 But x86 guests want BAR0 because IO space is faster than MMIO.  Right?

Or to be a bit more precise, ATM on x86 emulating IO instructions is
faster than MMIO. IIUC this is since you get the address/data in
registers and don't need to decode them.

-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Using taskset , powertop and perf with qemu-kvm

2012-10-10 Thread Jaspal

On 10/10/2012 03:38 PM, Gleb Natapov wrote:

On Wed, Oct 10, 2012 at 01:22:22PM +0530, Jaspal wrote:

Hello ,

I wish to run some cpu intensive tasks on a vm , use only one core
specifically #0 of the 6 available ( Xeon 3670 ) and all the while
record basic perf events like cpu-cycles , instructions etc.

I am using taskset to define the cpu-affinity of the qemu process :
taskset -p 0x1 `pgrep qemu` (assuming only 1 vm is running )


pgrep tells you only the pid of the io thread. vcpu runs in a separate
thread.


I can verify that the vm is now running only on core 0 using top.
top also shows that the qemu process is using 99% of it's share of
CPU time ( %CPU column in top ) upon running the tasks in vm.
However what I fail to see is the increase in time spent of the core
0 in the C0 ( active ) state via powertop. It barely manages to
cross 12%. If I run the same tasks ( FFT benchmark ) on the host on
a different core ( say #1 ) and monitor it using powertop again , I
can see that  C0 % for core 1 is above 100% ( 104 ~ 107 )
So why does not the output of top ( qemu is using 99% of it's cpu
time , which is spent on core 0 )  co-relate with powertop ( which
says core 0 is idle for around 88% of time ) ?
Also , powertop reports that core 2 is being used heavily ( 104 % )
and I suspect this is one which is executing the vm operations. But
why ?
There are no other cpu/memory intensive tasks running on the host.

Next , I am using the perf kvm record function to save the stats. If
I use all the 7 core counters available along-with -a option , perf
generates ~2MB/second . Is there a way to save just the raw count of
the events ?
Also , if instead of -a , if I give -C 0 , perf just saves around
0.016MB/s as opposed to 1.894MB/s in case of -a option. If the vm is
indeed using core 0 , then perf should have recorded the events.

On a last note , does perf kvm automatically selects the vm to
profile ( in case multiple vms are running ) on the basis of the
guest.kallsyms and guest.modules files passed to it ?

Thanks ,
Jaspal
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
Gleb.

Thanks. It works using the correct tid.

--
Jaspal
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Enabling IA32_TSC_ADJUST for guest VM

2012-10-10 Thread Marcelo Tosatti
On Tue, Oct 09, 2012 at 04:10:30PM +, Auld, Will wrote:
 I am just testing the second version of this patch. It addresses all the 
 comments so far except Marcelo's issue with breaking the function 
 compute_guest_tsc(). 

Lets try to merge the missing patch from Zachary first (that'll make it
clear).

 
 I needed to put the call for updating the TSC_ADJUST_MSR in kvm_write_tsc() 
 to ensure it is only called from user space. Other changes added to vmcs 
 offset should not be tracked in TSC_ADJUST_MSR. 

Please have a separate, earlier patch making that explicit (by passing a
bool to kvm_x86_ops-set_msr then to kvm_set_msr_common). that =
whether msr write is guest initiated or not.

 I had some trouble with the order of initialization during live migration. 
 TSC_ADJUST is initialized first but then wiped out by multiple 
 initializations of tsc. The fix for this is to not update TSC_ADJUST if the 
 vmcs offset is not actually changing with the tsc write. So, after migration 
 outcome is that vmcs offset gets defined independent from the migrating value 
 of TSC_ADJUST. I believe this is what we want to happen.

Can you please be more explicit regarding wiped out by multiple
initializations of tsc ? 

It is probably best to maintain TSC_ADJUST separately, in software, and
then calculate TSC_OFFSET.

 Thanks,
 
 Will 
 
 -Original Message-
 From: Avi Kivity [mailto:a...@redhat.com] 
 Sent: Tuesday, October 09, 2012 5:12 AM
 To: Marcelo Tosatti
 Cc: Auld, Will; kvm@vger.kernel.org; Zhang, Xiantao
 Subject: Re: [PATCH] Enabling IA32_TSC_ADJUST for guest VM
 
 On 10/08/2012 07:30 PM, Marcelo Tosatti wrote:
  
  From Intel's manual:
  
  • If an execution of WRMSR to the IA32_TIME_STAMP_COUNTER MSR adds (or
  subtracts) value X from the TSC,
  the logical processor also adds (or subtracts) value X from the 
  IA32_TSC_ADJUST MSR.
  
  This is not handled in the patch. 
  
  To support migration, it will be necessary to differentiate between 
  guest initiated and userspace-model initiated msr write. That is, only 
  guest initiated TSC writes should affect the value of IA32_TSC_ADJUST 
  MSR.
  
  Avi, any better idea?
  
 
 I think we need that anyway, since there are some read-only MSRs that need to 
 be configured by the host (nvmx capabilities).  So if we add that feature it 
 will be useful elsewhere.  I don't think it's possible to do it in any other 
 way:
 
 Local offset value of the IA32_TSC for a logical processor. Reset value is 
 Zero. A write to IA32_TSC will modify the local offset in IA32_TSC_ADJUST and 
 the content of IA32_TSC, but does not affect the internal invariant TSC 
 hardware.
 
 What we want to do is affect the internal invariant TSC hardware, so we can't 
 do that through the normal means.
 
 btw, will tsc writes from userspace (after live migration) cause tsc skew?  
 If so we should think how to model a guest-wide tsc.
 
 --
 error compiling committee.c: too many arguments to function
 N?r??yb?X??ǧv?^?)޺{.n?+h????ܨ}???Ơz?j:+v???zZ+??+zf???h???~i???z??w??)ߢf
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v6 05/16] target-i386: Use Hypervisor level in -machine pc,accel=kvm.

2012-10-10 Thread Don Slutz

On 10/09/12 15:11, Marcelo Tosatti wrote:

On Tue, Oct 09, 2012 at 03:09:17PM -0400, Don Slutz wrote:

On 10/09/12 14:47, Marcelo Tosatti wrote:

On Tue, Oct 09, 2012 at 03:27:26PM -0300, Marcelo Tosatti wrote:

On Tue, Oct 09, 2012 at 02:18:05PM -0300, Marcelo Tosatti wrote:

On Mon, Sep 24, 2012 at 10:32:07AM -0400, Don Slutz wrote:

Also known as Paravirtualization level.

This change is based on:

Microsoft Hypervisor CPUID Leaves:
   
http://msdn.microsoft.com/en-us/library/windows/hardware/ff542428%28v=vs.85%29.aspx

Linux kernel change starts with:
   http://fixunix.com/kernel/538707-use-cpuid-communicate-hypervisor.html
Also:
   http://lkml.indiana.edu/hypermail/linux/kernel/1205.0/00100.html

VMware documention on CPUIDs (Mechanisms to determine if software is
running in a VMware virtual machine):
   
http://kb.vmware.com/selfservice/microsites/search.do?language=en_UScmd=displayKCexternalId=1009458

QEMU knows this is KVM_CPUID_SIGNATURE (0x4000).

Signed-off-by: Don Slutz d...@cloudswitch.com
---
  target-i386/kvm.c |4 ++--
  1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 895d848..8462c75 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -389,12 +389,12 @@ int kvm_arch_init_vcpu(CPUX86State *env)
  c = cpuid_data.entries[cpuid_i++];
  memset(c, 0, sizeof(*c));
  c-function = KVM_CPUID_SIGNATURE
-if (!hyperv_enabled()) {
+if (!env-cpuid_hv_level_set) {
  memcpy(signature, KVMKVMKVM\0\0\0, 12);
  c-eax = 0;
  } else {
  memcpy(signature, Microsoft Hv, 12);
-c-eax = HYPERV_CPUID_MIN;
+c-eax = env-cpuid_hv_level;

This breaks hyperv_enabled() checks.

Don, are you certain it is worthwhile to make this configurable?
Can you explain why, under your scenario, it is worthwhile?

Because these are separate problems:

- Fake VMWare hypervisor  (which seems to be your main goal).
- Make CPUID HV leafs configurable via command line.

Err, meant via properties. Point is, why have VMWare CPUID
configuration as data, if there are reasons to believe code
is a better fit (code as in current Hyper-V implementation).

Nevermind, its the right thing to do. Just separate the patchset
please:

1) Create object properties.
2) Export VMWare CPUID via properties.
3) Convert Hyper-V.

Be careful to make sure Hyper-V's current options are functional
in 3).


Did you mean 3 patch sets (or more)? Or just a different order?
-Don Slutz

Different order. Patches should be logically related (think of what
information the reviewer needs). Please write changelogs for
every patch.


Using this order causes support for Hyper-V to stop working in the 
middle of the patch set.  How about this order:

1) Create object properties.
2) Convert Hyper-V to set the new properties.
3) Change kvm.c to use the new properties.
4) Export VMWare CPUID via properties.

5) Change accel=tcg to use the new properties.

   -Don Slutz

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v6 06/16] target-i386: Use Hypervisor level in -machine pc,accel=tcg.

2012-10-10 Thread Don Slutz

On 10/09/12 15:01, Marcelo Tosatti wrote:

On Mon, Sep 24, 2012 at 10:32:08AM -0400, Don Slutz wrote:

Also known as Paravirtualization level.

This change is based on:

Microsoft Hypervisor CPUID Leaves:
   
http://msdn.microsoft.com/en-us/library/windows/hardware/ff542428%28v=vs.85%29.aspx

Linux kernel change starts with:
   http://fixunix.com/kernel/538707-use-cpuid-communicate-hypervisor.html
Also:
   http://lkml.indiana.edu/hypermail/linux/kernel/1205.0/00100.html

VMware documention on CPUIDs (Mechanisms to determine if software is
running in a VMware virtual machine):
   
http://kb.vmware.com/selfservice/microsites/search.do?language=en_UScmd=displayKCexternalId=1009458

QEMU knows this as KVM_CPUID_SIGNATURE (0x4000) in kvm on linux.

This does not provide vendor support in tcg yet.

Signed-off-by: Don Slutz d...@cloudswitch.com
---
  target-i386/cpu.c |   27 +++
  1 files changed, 27 insertions(+), 0 deletions(-)

diff --git a/target-i386/cpu.c b/target-i386/cpu.c
index 48bdaf9..920278b 100644
--- a/target-i386/cpu.c
+++ b/target-i386/cpu.c
@@ -1651,6 +1651,21 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, 
uint32_t count,
  index =  env-cpuid_xlevel;
  }
  }
+} else if (index  0x4000) {
+if (env-cpuid_hv_level_set) {
+uint32_t real_level = env-cpuid_hv_level;
+
+/* Handle Hypervisor CPUIDs */
+if (real_level  0x4000) {
+real_level = 0x4000;
+}
+if (index  real_level) {
+index = real_level;
+}
+} else {
+if (index  env-cpuid_level)
+index = env-cpuid_level;
+}

Whats the purpose of this checks?
They control what happens for CPUID requests outside the defined 
ranges.  Very similar to code just above this.


Please provide changelogs for each patch.

In progress.

  } else {
  if (index  env-cpuid_level)
  index = env-cpuid_level;
@@ -1789,6 +1804,18 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, 
uint32_t count,
  *edx = 0;
  }
  break;
+case 0x4000:
+*eax = env-cpuid_hv_level;
+*ebx = 0;
+*ecx = 0;
+*edx = 0;
+break;
+case 0x4001:
+*eax = env-cpuid_kvm_features;
+*ebx = 0;
+*ecx = 0;
+*edx = 0;
+break;
  case 0x8000:
  *eax = env-cpuid_xlevel;
  *ebx = env-cpuid_vendor1;
--
1.7.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

   -Don Slutz
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] Using PCI config space to indicate config location

2012-10-10 Thread Anthony Liguori
Rusty Russell ru...@rustcorp.com.au writes:

 Gerd Hoffmann kra...@redhat.com writes:
 So how about this:

 (1) Add a vendor specific pci capability for new-style virtio.
 Specifies the pci bar used for new-style virtio registers.
 Guests can use it to figure whenever new-style virtio is
 supported and to map the correct bar (which will probably
 be bar 1 in most cases).

 This was closer to the original proposal[1], which I really liked (you
 can layout bars however you want).  Anthony thought that vendor
 capabilities were a PCI-e feature, but it seems they're blessed in PCI
 2.3.

2.3 was standardized in 2002.  Are we confident that vendor extensions
play nice with pre-2.3 OSes like Win2k, WinXP, etc?

I still think it's a bad idea to rely on something so new in something
as fundamental as virtio-pci unless we have to.

Regards,

Anthony Liguori


 So let's return to that proposal, giving something like this:

 /* IDs for different capabilities.  Must all exist. */
 /* FIXME: Do we win from separating ISR, NOTIFY and COMMON? */
 /* Common configuration */
 #define VIRTIO_PCI_CAP_COMMON_CFG 1
 /* Notifications */
 #define VIRTIO_PCI_CAP_NOTIFY_CFG 2
 /* ISR access */
 #define VIRTIO_PCI_CAP_ISR_CFG3
 /* Device specific confiuration */
 #define VIRTIO_PCI_CAP_DEVICE_CFG 4

 /* This is the PCI capability header: */
 struct virtio_pci_cap {
   u8 cap_vndr;/* Generic PCI field: PCI_CAP_ID_VNDR */
   u8 cap_next;/* Generic PCI field: next ptr. */
   u8 cap_len; /* Generic PCI field: sizeof(struct virtio_pci_cap). */
   u8 cfg_type;/* One of the VIRTIO_PCI_CAP_*_CFG. */
   u8 bar; /* Where to find it. */
   u8 unused;
   __le16 offset;  /* Offset within bar. */
   __le32 length;  /* Length. */
 };

 This means qemu can point the isr_cfg into the legacy area if it wants.
 In fact, it can put everything in BAR0 if it wants.

 Thoughts?
 Rusty.
 --
 To unsubscribe from this list: send the line unsubscribe kvm in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] Using PCI config space to indicate config location

2012-10-10 Thread Michael S. Tsirkin
On Wed, Oct 10, 2012 at 08:36:12AM -0500, Anthony Liguori wrote:
 Rusty Russell ru...@rustcorp.com.au writes:
 
  Gerd Hoffmann kra...@redhat.com writes:
  So how about this:
 
  (1) Add a vendor specific pci capability for new-style virtio.
  Specifies the pci bar used for new-style virtio registers.
  Guests can use it to figure whenever new-style virtio is
  supported and to map the correct bar (which will probably
  be bar 1 in most cases).
 
  This was closer to the original proposal[1], which I really liked (you
  can layout bars however you want).  Anthony thought that vendor
  capabilities were a PCI-e feature, but it seems they're blessed in PCI
  2.3.
 
 2.3 was standardized in 2002.  Are we confident that vendor extensions
 play nice with pre-2.3 OSes like Win2k, WinXP, etc?
 
 I still think it's a bad idea to rely on something so new in something
 as fundamental as virtio-pci unless we have to.
 
 Regards,
 
 Anthony Liguori

Pre 2.3 it was the case that *all* space outside
the capability linked list, and any capability not
recognized by space, was considered vendor specific.
So there's no problem really.

 
  So let's return to that proposal, giving something like this:
 
  /* IDs for different capabilities.  Must all exist. */
  /* FIXME: Do we win from separating ISR, NOTIFY and COMMON? */
  /* Common configuration */
  #define VIRTIO_PCI_CAP_COMMON_CFG   1
  /* Notifications */
  #define VIRTIO_PCI_CAP_NOTIFY_CFG   2
  /* ISR access */
  #define VIRTIO_PCI_CAP_ISR_CFG  3
  /* Device specific confiuration */
  #define VIRTIO_PCI_CAP_DEVICE_CFG   4
 
  /* This is the PCI capability header: */
  struct virtio_pci_cap {
  u8 cap_vndr;/* Generic PCI field: PCI_CAP_ID_VNDR */
  u8 cap_next;/* Generic PCI field: next ptr. */
  u8 cap_len; /* Generic PCI field: sizeof(struct virtio_pci_cap). */
  u8 cfg_type;/* One of the VIRTIO_PCI_CAP_*_CFG. */
  u8 bar; /* Where to find it. */
  u8 unused;
  __le16 offset;  /* Offset within bar. */
  __le32 length;  /* Length. */
  };
 
  This means qemu can point the isr_cfg into the legacy area if it wants.
  In fact, it can put everything in BAR0 if it wants.
 
  Thoughts?
  Rusty.
  --
  To unsubscribe from this list: send the line unsubscribe kvm in
  the body of a message to majord...@vger.kernel.org
  More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v6 05/16] target-i386: Use Hypervisor level in -machine pc,accel=kvm.

2012-10-10 Thread Marcelo Tosatti
On Wed, Oct 10, 2012 at 09:03:20AM -0400, Don Slutz wrote:
 Did you mean 3 patch sets (or more)? Or just a different order?
 -Don Slutz
 Different order. Patches should be logically related (think of what
 information the reviewer needs). Please write changelogs for
 every patch.
 
 
 Using this order causes support for Hyper-V to stop working in the
 middle of the patch set.  How about this order:
 1) Create object properties.
 2) Convert Hyper-V to set the new properties.
 3) Change kvm.c to use the new properties.
 4) Export VMWare CPUID via properties.
 
 5) Change accel=tcg to use the new properties.
 
-Don Slutz

Fine, as long as change from item A) does not leak to item B).

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC 1/2] kvm: Handle undercommitted guest case in PLE handler

2012-10-10 Thread Andrew Theurer
I ran 'perf sched map' on the dbench workload for medium and large VMs,
and I thought I would share some of the results.  I think it helps to
visualize what's going on regarding the yielding.

These files are png bitmaps, generated from processing output from 'perf
sched map' (and perf data generated from 'perf sched record').  The Y
axis is the host cpus, each row being 10 pixels high.  For these tests,
there are 80 host cpus, so the total height is 800 pixels.  The X axis
is time (in microseconds), with each pixel representing 1 microsecond.
Each bitmap plots 30,000 microseconds.  The bitmaps are quite wide
obviously, and zooming in/out while viewing is recommended.

Each row (each host cpu) is assigned a color based on what thread is
running.  vCPUs of the same VM are assigned a common color (like red,
blue, magenta, etc), and each vCPU has a unique brightness for that
color.  There are a maximum of 12 assignable colors, so in any VMs 12
revert to vCPU color of gray. I would use more colors, but it becomes
harder to distinguish one color from another.  The white color
represents missing data from perf, and black color represents any thread
which is not a vCPU.

For the following tests, VMs were pinned to host NUMA nodes and to
specific cpus to help with consistency and operate within the
constraints of the last test (gang scheduler).

Here is a good example of PLE.  These are 10-way VMs, 16 of them (as
described above only 12 of the VMs have a color, rest are gray).

https://docs.google.com/open?id=0B6tfUNlZ-14wdmFqUmE5QjJHMFU

If you zoom out and look at the whole bitmap, you may notice the 4ms
intervals of the scheduler.  They are pretty well aligned across all
cpus.  Normally, for cpu bound workloads, we would expect to see each
thread to run for 4 ms, then something else getting to run, and so on.
That is mostly true in this test.  We have 2x over-commit and we
generally see the switching of threads at 4ms.  One thing to note is
that not all vCPU threads for the same VM run at exactly the same time,
and that is expected and the whole reason for lock-holder preemption.
Now, if you zoom in on the bitmap, you should notice within the 4ms
intervals there is some task switching going on.  This is most likely
because of the yield_to initiated by the PLE handler.  In this case
there is not that much yielding to do.   It's quite clean, and the
performance is quite good.

Below is an example of PLE, but this time with 20-way VMs, 8 of them.
CPU over-commit is still 2x.

https://docs.google.com/open?id=0B6tfUNlZ-14wdmFqUmE5QjJHMFU

This one looks quite different.  In short, it's a mess.  The switching
between tasks can be lower than 10 microseconds.  It basically never
recovers.  There is constant yielding all the time.  

Below is again 8 x 20-way VMs, but this time I tried out Nikunj's gang
scheduling patches.  While I am not recommending gang scheduling, I
think it's a good data point.  The performance is 3.88x the PLE result.

https://docs.google.com/open?id=0B6tfUNlZ-14wWXdscWcwNTVEY3M

Note that the task switching intervals of 4ms are quite obvious again,
and this time all vCPUs from same VM run at the same time.  It
represents the best possible outcome.


Anyway, I thought the bitmaps might help better visualize what's going
on.

-Andrew



--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] x86, add hypervisor name to dump_stack()

2012-10-10 Thread Prarit Bhargava
Debugging crash, panics, stack trace WARN_ONs, etc., from both virtual and
bare-metal boots can get difficult very quickly.  While there are ways to
decipher the output and determine if the output is from a virtual guest,
the in-kernel hypervisors now have a single registration point and set
x86_hyper.  We can use this to output a single extra line on virtual
machines that indicates the hypervisor type.

Signed-off-by: Prarit Bhargava pra...@redhat.com
Cc: Avi Kivity a...@redhat.com
Cc: Gleb Natapov g...@redhat.com
Cc: Alex Williamson alex.william...@redhat.com
Cc: Marcelo Tostatti mtosa...@redhat.com
Cc: Ingo Molnar mi...@redhat.com
Cc: kvm@vger.kernel.org
Cc: x...@kernel.org
---
 arch/x86/kernel/dumpstack.c |3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index ae42418b..75a635e 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -17,6 +17,7 @@
 #include linux/sysfs.h
 
 #include asm/stacktrace.h
+#include asm/hypervisor.h
 
 
 int panic_on_unrecovered_nmi;
@@ -193,6 +194,8 @@ void dump_stack(void)
init_utsname()-release,
(int)strcspn(init_utsname()-version,  ),
init_utsname()-version);
+   if (x86_hyper  x86_hyper-name)
+   printk(Hypervisor: %s\n,  x86_hyper-name);
show_trace(NULL, NULL, stack, bp);
 }
 EXPORT_SYMBOL(dump_stack);
-- 
1.7.9.3

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v4 1/5] KVM: MMU: fix release noslot pfn

2012-10-10 Thread Marcelo Tosatti
On Sun, Oct 07, 2012 at 08:01:34PM +0800, Xiao Guangrong wrote:
 We can not directly call kvm_release_pfn_clean to release the pfn
 since we can meet noslot pfn which is used to cache mmio info into
 spte
 
 Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
 ---
  arch/x86/kvm/mmu.c  |3 +--
  virt/kvm/kvm_main.c |4 +---
  2 files changed, 2 insertions(+), 5 deletions(-)
 
 diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
 index d289fee..6f85fe0 100644
 --- a/arch/x86/kvm/mmu.c
 +++ b/arch/x86/kvm/mmu.c
 @@ -2497,8 +2497,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 
 *sptep,
   }
   }
 
 - if (!is_error_pfn(pfn))
 - kvm_release_pfn_clean(pfn);
 + kvm_release_pfn_clean(pfn);
  }
 
  static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
 diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
 index cc3f6dc..b65ec97 100644
 --- a/virt/kvm/kvm_main.c
 +++ b/virt/kvm/kvm_main.c
 @@ -1322,9 +1322,7 @@ EXPORT_SYMBOL_GPL(kvm_release_page_clean);
 
  void kvm_release_pfn_clean(pfn_t pfn)
  {
 - WARN_ON(is_error_pfn(pfn));
 -
 - if (!kvm_is_mmio_pfn(pfn))
 + if (!is_error_pfn(pfn)  !kvm_is_mmio_pfn(pfn))
   put_page(pfn_to_page(pfn));
  }
  EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
 -- 
 1.7.7.6

Why does is_error_pfn() return true for mmio spte? Its not an error,
after all. 

Please kill is_invalid_pfn and use

- is_error_pfn for checking for errors (mmio spte is not an error pfn,
its a special pfn)

- add explicit is_noslot_pfn checks where necessary in the code
(say to avoid interpreting a noslot_pfn's pfn address bits).

(should have noticed this earlier, sorry).


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v6 03/16] target-i386: Add cpu object access routines for Hypervisor level.

2012-10-10 Thread Don Slutz

On 10/09/12 15:13, Don Slutz wrote:

On 10/09/12 12:25, Marcelo Tosatti wrote:

On Mon, Sep 24, 2012 at 10:32:05AM -0400, Don Slutz wrote:

These are modeled after x86_cpuid_get_xlevel and x86_cpuid_set_xlevel.

Signed-off-by: Don Slutz d...@cloudswitch.com
---
  target-i386/cpu.c |   29 +
  1 files changed, 29 insertions(+), 0 deletions(-)

diff --git a/target-i386/cpu.c b/target-i386/cpu.c
index 25ca986..451de12 100644
--- a/target-i386/cpu.c
+++ b/target-i386/cpu.c
@@ -1166,6 +1166,32 @@ static void x86_cpuid_set_tsc_freq(Object 
*obj, Visitor *v, void *opaque,

  cpu-env.tsc_khz = value / 1000;
  }
  +static void x86_cpuid_get_hv_level(Object *obj, Visitor *v, void 
*opaque,

+const char *name, Error **errp)
+{
+X86CPU *cpu = X86_CPU(obj);
+
+visit_type_uint32(v, cpu-env.cpuid_hv_level, name, errp);
+}
+
+static void x86_cpuid_set_hv_level(Object *obj, Visitor *v, void 
*opaque,

+const char *name, Error **errp)
+{
+X86CPU *cpu = X86_CPU(obj);
+uint32_t value;
+
+visit_type_uint32(v, value, name, errp);
+if (error_is_set(errp)) {
+return;
+}
+
+if (value != 0  value  0x4000) {
+value += 0x4000;
+}

Whats the purpose of this? Adds ambiguity.

Will add more info in this commit message.
   -Don
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Not clear on how to add info in the commit message.

This is direct copy with adjustment from x86_cpuid_set_xlevel():

if (value  0x8000) {
value += 0x8000;
}

(Pending patch: 
http://comments.gmane.org/gmane.comp.emulators.qemu/172703 adds this)


The adjustment is that 0 is a legal value. See 
http://lkml.indiana.edu/hypermail/linux/kernel/1205.0/00100.html


This does mean that just like xlevel=1 and xlevel=0x8001 are the 
same; hypervisor-level=1 and hypervisor-level=0x401 are the same.  
If this is not wanted, I have no issue with removing it.


   -Don Slutz
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC v2 5/6] ARM KVM: save and load VFP registers from kernel

2012-10-10 Thread Peter Maydell
Add support for saving and restoring VFP register state from the
kernel. This includes a check that the KVM-created CPU has full
VFP support (as the TCG Cortex-A15 model always does), since for
the moment ARM QEMU doesn't have any way to tweak optional features
on created CPUs.

Signed-off-by: Peter Maydell peter.mayd...@linaro.org
---
 target-arm/kvm.c |   78 +++---
 1 file changed, 75 insertions(+), 3 deletions(-)

diff --git a/target-arm/kvm.c b/target-arm/kvm.c
index fee60e1..f17e7fd 100644
--- a/target-arm/kvm.c
+++ b/target-arm/kvm.c
@@ -39,10 +39,28 @@ int kvm_arch_init(KVMState *s)
 int kvm_arch_init_vcpu(CPUARMState *env)
 {
 struct kvm_vcpu_init init;
+int ret;
+uint64_t v;
+struct kvm_one_reg r;
 
 init.target = KVM_ARM_TARGET_CORTEX_A15;
 memset(init.features, 0, sizeof(init.features));
-return kvm_vcpu_ioctl(env, KVM_ARM_VCPU_INIT, init);
+ret = kvm_vcpu_ioctl(env, KVM_ARM_VCPU_INIT, init);
+if (ret) {
+return ret;
+}
+/* Query the kernel to make sure it supports 32 VFP
+ * registers: QEMU's cortex-a15 CPU is always a
+ * VFP-D32 core. The simplest way to do this is just
+ * to attempt to read register d31.
+ */
+r.id = KVM_REG_ARM | KVM_REG_SIZE_U64 | KVM_REG_ARM_VFP | 31;
+r.addr = (uintptr_t)(v);
+ret = kvm_vcpu_ioctl(env, KVM_GET_ONE_REG, r);
+if (ret == ENOENT) {
+return EINVAL;
+}
+return ret;
 }
 
 struct reg {
@@ -68,6 +86,13 @@ struct reg {
 offsetof(CPUARMState, QEMUFIELD) \
 }
 
+#define VFPSYSREG(R)   \
+{  \
+KVM_REG_ARM | KVM_REG_SIZE_U32 | KVM_REG_ARM_VFP | \
+KVM_REG_ARM_VFP_##R,   \
+offsetof(CPUARMState, vfp.xregs[ARM_VFP_##R])  \
+}
+
 const struct reg regs[] = {
 /* R0_usr .. R14_usr */
 COREREG(usr_regs[0], regs[0]),
@@ -115,6 +140,13 @@ const struct reg regs[] = {
 CP15REG(1, 0, 0, 0, cp15.c1_sys), /* SCTLR */
 CP15REG(2, 0, 0, 2, cp15.c2_control), /* TTBCR */
 CP15REG(3, 0, 0, 0, cp15.c3), /* DACR */
+/* VFP system registers */
+VFPSYSREG(FPSID),
+VFPSYSREG(MVFR1),
+VFPSYSREG(MVFR0),
+VFPSYSREG(FPEXC),
+VFPSYSREG(FPINST),
+VFPSYSREG(FPINST2),
 };
 
 int kvm_arch_put_registers(CPUARMState *env, int level)
@@ -122,7 +154,7 @@ int kvm_arch_put_registers(CPUARMState *env, int level)
 struct kvm_one_reg r;
 int mode, bn;
 int ret, i;
-uint32_t cpsr;
+uint32_t cpsr, fpscr;
 uint64_t ttbr;
 
 /* Make sure the banked regs are properly set */
@@ -173,6 +205,26 @@ int kvm_arch_put_registers(CPUARMState *env, int level)
 (2  KVM_REG_ARM_CRM_SHIFT) | (1  KVM_REG_ARM_OPC1_SHIFT);
 r.addr = (uintptr_t)(ttbr);
 ret = kvm_vcpu_ioctl(env, KVM_SET_ONE_REG, r);
+if (ret) {
+return ret;
+}
+
+/* VFP registers */
+r.id = KVM_REG_ARM | KVM_REG_SIZE_U64 | KVM_REG_ARM_VFP;
+for (i = 0; i  32; i++) {
+r.addr = (uintptr_t)(env-vfp.regs[i]);
+ret = kvm_vcpu_ioctl(env, KVM_SET_ONE_REG, r);
+if (ret) {
+return ret;
+}
+r.id++;
+}
+
+r.id = KVM_REG_ARM | KVM_REG_SIZE_U32 | KVM_REG_ARM_VFP |
+KVM_REG_ARM_VFP_FPSCR;
+fpscr = vfp_get_fpscr(env);
+r.addr = (uintptr_t)fpscr;
+ret = kvm_vcpu_ioctl(env, KVM_SET_ONE_REG, r);
 
 return ret;
 }
@@ -182,7 +234,7 @@ int kvm_arch_get_registers(CPUARMState *env)
 struct kvm_one_reg r;
 int mode, bn;
 int ret, i;
-uint32_t cpsr;
+uint32_t cpsr, fpscr;
 uint64_t ttbr;
 
 for (i = 0; i  ARRAY_SIZE(regs); i++) {
@@ -247,6 +299,26 @@ int kvm_arch_get_registers(CPUARMState *env)
 env-cp15.c2_mask = ~(((uint32_t)0xu)  env-cp15.c2_control);
 env-cp15.c2_base_mask = ~((uint32_t)0x3fffu  env-cp15.c2_control);
 
+/* VFP registers */
+r.id = KVM_REG_ARM | KVM_REG_SIZE_U64 | KVM_REG_ARM_VFP;
+for (i = 0; i  32; i++) {
+r.addr = (uintptr_t)(env-vfp.regs[i]);
+ret = kvm_vcpu_ioctl(env, KVM_GET_ONE_REG, r);
+if (ret) {
+return ret;
+}
+r.id++;
+}
+
+r.id = KVM_REG_ARM | KVM_REG_SIZE_U32 | KVM_REG_ARM_VFP |
+KVM_REG_ARM_VFP_FPSCR;
+r.addr = (uintptr_t)fpscr;
+ret = kvm_vcpu_ioctl(env, KVM_GET_ONE_REG, r);
+if (ret) {
+return ret;
+}
+vfp_set_fpscr(env, fpscr);
+
 return 0;
 }
 
-- 
1.7.9.5

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC v2 2/6] ARM: KVM: Add support for KVM on ARM architecture

2012-10-10 Thread Peter Maydell
From: Christoffer Dall cd...@cs.columbia.edu

Add basic support for KVM on ARM architecture.

Signed-off-by: Christoffer Dall cd...@cs.columbia.edu
[PMM: Minor tweaks and code cleanup, switch to ONE_REG]
Signed-off-by: Peter Maydell peter.mayd...@linaro.org
---
 hw/arm_pic.c |   28 
 target-arm/Makefile.objs |1 +
 target-arm/cpu.h |1 +
 target-arm/helper.c  |2 +-
 target-arm/kvm.c |  328 ++
 5 files changed, 359 insertions(+), 1 deletion(-)
 create mode 100644 target-arm/kvm.c

diff --git a/hw/arm_pic.c b/hw/arm_pic.c
index ffb4d41..72272dc 100644
--- a/hw/arm_pic.c
+++ b/hw/arm_pic.c
@@ -9,6 +9,7 @@
 
 #include hw.h
 #include arm-misc.h
+#include kvm.h
 
 /* Input 0 is IRQ and input 1 is FIQ.  */
 static void arm_pic_cpu_handler(void *opaque, int irq, int level)
@@ -34,7 +35,34 @@ static void arm_pic_cpu_handler(void *opaque, int irq, int 
level)
 }
 }
 
+#ifdef CONFIG_KVM
+static void kvm_arm_pic_cpu_handler(void *opaque, int irq, int level)
+{
+ARMCPU *cpu = opaque;
+CPUARMState *env = cpu-env;
+int kvm_irq = KVM_ARM_IRQ_TYPE_CPU  KVM_ARM_IRQ_TYPE_SHIFT;
+
+switch (irq) {
+case ARM_PIC_CPU_IRQ:
+kvm_irq |= KVM_ARM_IRQ_CPU_IRQ;
+break;
+case ARM_PIC_CPU_FIQ:
+kvm_irq |= KVM_ARM_IRQ_CPU_FIQ;
+break;
+default:
+hw_error(kvm_arm_pic_cpu_handler: Bad interrupt line %d\n, irq);
+}
+kvm_irq |= env-cpu_index  KVM_ARM_IRQ_VCPU_SHIFT;
+kvm_set_irq(kvm_state, kvm_irq, level ? 1 : 0);
+}
+#endif
+
 qemu_irq *arm_pic_init_cpu(ARMCPU *cpu)
 {
+#ifdef CONFIG_KVM
+if (kvm_enabled()) {
+return qemu_allocate_irqs(kvm_arm_pic_cpu_handler, cpu, 2);
+}
+#endif
 return qemu_allocate_irqs(arm_pic_cpu_handler, cpu, 2);
 }
diff --git a/target-arm/Makefile.objs b/target-arm/Makefile.objs
index b6f1a9e..d89b57c 100644
--- a/target-arm/Makefile.objs
+++ b/target-arm/Makefile.objs
@@ -1,4 +1,5 @@
 obj-y += arm-semi.o
 obj-$(CONFIG_SOFTMMU) += machine.o
+obj-$(CONFIG_KVM) += kvm.o
 obj-y += translate.o op_helper.o helper.o cpu.o
 obj-y += neon_helper.o iwmmxt_helper.o
diff --git a/target-arm/cpu.h b/target-arm/cpu.h
index ff4de10..8c7e574 100644
--- a/target-arm/cpu.h
+++ b/target-arm/cpu.h
@@ -236,6 +236,7 @@ ARMCPU *cpu_arm_init(const char *cpu_model);
 void arm_translate_init(void);
 int cpu_arm_exec(CPUARMState *s);
 void do_interrupt(CPUARMState *);
+int bank_number(CPUARMState *env, int mode);
 void switch_mode(CPUARMState *, int);
 uint32_t do_arm_semihosting(CPUARMState *env);
 
diff --git a/target-arm/helper.c b/target-arm/helper.c
index 58340bd..f62f3da 100644
--- a/target-arm/helper.c
+++ b/target-arm/helper.c
@@ -1619,7 +1619,7 @@ uint32_t HELPER(get_r13_banked)(CPUARMState *env, 
uint32_t mode)
 #else
 
 /* Map CPU modes onto saved register banks.  */
-static inline int bank_number(CPUARMState *env, int mode)
+int bank_number(CPUARMState *env, int mode)
 {
 switch (mode) {
 case ARM_CPU_MODE_USR:
diff --git a/target-arm/kvm.c b/target-arm/kvm.c
new file mode 100644
index 000..fee60e1
--- /dev/null
+++ b/target-arm/kvm.c
@@ -0,0 +1,328 @@
+/*
+ * ARM implementation of KVM hooks
+ *
+ * Copyright Christoffer Dall 2009-2010
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include stdio.h
+#include sys/types.h
+#include sys/ioctl.h
+#include sys/mman.h
+
+#include linux/kvm.h
+
+#include qemu-common.h
+#include qemu-timer.h
+#include sysemu.h
+#include kvm.h
+#include cpu.h
+#include device_tree.h
+#include hw/arm-misc.h
+
+const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
+KVM_CAP_LAST_INFO
+};
+
+int kvm_arch_init(KVMState *s)
+{
+/* For ARM interrupt delivery is always asynchronous,
+ * whether we are using an in-kernel VGIC or not.
+ */
+kvm_async_interrupts_allowed = true;
+return 0;
+}
+
+int kvm_arch_init_vcpu(CPUARMState *env)
+{
+struct kvm_vcpu_init init;
+
+init.target = KVM_ARM_TARGET_CORTEX_A15;
+memset(init.features, 0, sizeof(init.features));
+return kvm_vcpu_ioctl(env, KVM_ARM_VCPU_INIT, init);
+}
+
+struct reg {
+uint64_t id;
+int offset;
+};
+
+#define COREREG(KERNELNAME, QEMUFIELD)   \
+{\
+KVM_REG_ARM | KVM_REG_SIZE_U32 | \
+KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(KERNELNAME), \
+offsetof(CPUARMState, QEMUFIELD) \
+}
+
+#define CP15REG(CRN, CRM, OPC1, OPC2, QEMUFIELD) \
+{\
+KVM_REG_ARM | KVM_REG_SIZE_U32 | \
+(15  KVM_REG_ARM_COPROC_SHIFT) |   \
+((CRN)  KVM_REG_ARM_32_CRN_SHIFT) |\
+((CRM)  KVM_REG_ARM_CRM_SHIFT) |   \
+((OPC1)  KVM_REG_ARM_OPC1_SHIFT) | \
+((OPC2)