Re: [PATCH] tcm_vhost: Multi-target support
Hello Nicholas, On 01/31/2013 03:33 PM, Asias He wrote: In order to take advantages of Paolo's multi-queue virito-scsi, we need multi-target support in tcm_vhost first. Otherwise all the requests go to one queue and other queues are idle. This patch makes: 1. All the targets under the wwpn is seen and can be used by guest. 2. No need to pass the tpgt number in struct vhost_scsi_target to tcm_vhost.ko. Only wwpn is needed. 3. We can always pass max_target = 255 to guest now, since we abort the request who's target id does not exist. Signed-off-by: Asias He as...@redhat.com --- drivers/vhost/tcm_vhost.c | 115 -- drivers/vhost/tcm_vhost.h | 4 +- 2 files changed, 74 insertions(+), 45 deletions(-) diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/tcm_vhost.c index 218deb6..d50cb95 100644 --- a/drivers/vhost/tcm_vhost.c +++ b/drivers/vhost/tcm_vhost.c @@ -59,13 +59,18 @@ enum { VHOST_SCSI_VQ_IO = 2, }; +#define VHOST_SCSI_MAX_TARGET 256 + struct vhost_scsi { - struct tcm_vhost_tpg *vs_tpg; /* Protected by vhost_scsi-dev.mutex */ + /* Protected by vhost_scsi-dev.mutex */ + struct tcm_vhost_tpg *vs_tpg[VHOST_SCSI_MAX_TARGET]; struct vhost_dev dev; struct vhost_virtqueue vqs[3]; struct vhost_work vs_completion_work; /* cmd completion work item */ struct llist_head vs_completion_list; /* cmd completion queue */ + char vs_vhost_wwpn[TRANSPORT_IQN_LEN]; + int vs_num_target; }; /* Local pointer to allocated TCM configfs fabric module */ @@ -564,13 +569,7 @@ static void vhost_scsi_handle_vq(struct vhost_scsi *vs) u32 exp_data_len, data_first, data_num, data_direction; unsigned out, in, i; int head, ret; - - /* Must use ioctl VHOST_SCSI_SET_ENDPOINT */ - tv_tpg = vs-vs_tpg; - if (unlikely(!tv_tpg)) { - pr_err(%s endpoint not set\n, __func__); - return; - } + u8 target; mutex_lock(vq-mutex); vhost_disable_notify(vs-dev, vq); @@ -637,6 +636,35 @@ static void vhost_scsi_handle_vq(struct vhost_scsi *vs) break; } + /* Extract the tpgt */ + target = v_req.lun[1]; + + /* Target 
does not exit, fail the request */ + if (unlikely(target = vs-vs_num_target)) { + struct virtio_scsi_cmd_resp __user *resp; + struct virtio_scsi_cmd_resp rsp; + + memset(rsp, 0, sizeof(rsp)); + rsp.response = VIRTIO_SCSI_S_BAD_TARGET; + resp = vq-iov[out].iov_base; + ret = copy_to_user(resp, rsp, sizeof(rsp)); + if (!ret) + vhost_add_used_and_signal(vs-dev, + vs-vqs[2], head, 0); + else + pr_err(Faulted on virtio_scsi_cmd_resp\n); + + continue; + } + + tv_tpg = vs-vs_tpg[target]; + if (unlikely(!tv_tpg)) { + /* Must use ioctl VHOST_SCSI_SET_ENDPOINT */ + pr_err(endpoint not set, target = %d\n, target); + vhost_discard_vq_desc(vq, 1); + break; + } + exp_data_len = 0; for (i = 0; i data_num; i++) exp_data_len += vq-iov[data_first + i].iov_len; @@ -771,14 +799,11 @@ static int vhost_scsi_set_endpoint( } tv_tport = tv_tpg-tport; - if (!strcmp(tv_tport-tport_name, t-vhost_wwpn) - (tv_tpg-tport_tpgt == t-vhost_tpgt)) { + if (!strcmp(tv_tport-tport_name, t-vhost_wwpn)) { tv_tpg-tv_tpg_vhost_count++; - mutex_unlock(tv_tpg-tv_tpg_mutex); - mutex_unlock(tcm_vhost_mutex); mutex_lock(vs-dev.mutex); - if (vs-vs_tpg) { + if (vs-vs_tpg[tv_tpg-tport_tpgt - 1]) { mutex_unlock(vs-dev.mutex); mutex_lock(tv_tpg-tv_tpg_mutex); tv_tpg-tv_tpg_vhost_count--; @@ -786,15 +811,17 @@ static int vhost_scsi_set_endpoint( return -EEXIST; } - vs-vs_tpg = tv_tpg; + vs-vs_tpg[tv_tpg-tport_tpgt - 1] = tv_tpg; tv_tpg-tport_tpgt starts from 0, right? I thought it starts from 1, because I always got it starts from 1 in targetcli. o- vhost o- naa.6001405bd4e8476d o- tpg1 o- luns o- lun0 o- tpg2 o- luns o- lun0 o- tpg3 o- luns o- lun0 o- tpg4 o- luns o- lun0
Re: [PATCH 2/2] x86, apicv: Add Posted Interrupt supporting
On Thu, Dec 13, 2012 at 03:29:40PM +0800, Yang Zhang wrote: From: Yang Zhang yang.z.zh...@intel.com Posted Interrupt allows APIC interrupts to inject into guest directly without any vmexit. - When delivering a interrupt to guest, if target vcpu is running, update Posted-interrupt requests bitmap and send a notification event to the vcpu. Then the vcpu will handle this interrupt automatically, without any software involvemnt. - If target vcpu is not running or there already a notification event pending in the vcpu, do nothing. The interrupt will be handled by next vm entry. Signed-off-by: Yang Zhang yang.z.zh...@intel.com --- arch/x86/include/asm/entry_arch.h |1 + arch/x86/include/asm/hw_irq.h |1 + arch/x86/include/asm/irq.h |1 + arch/x86/include/asm/irq_vectors.h |4 + arch/x86/include/asm/kvm_host.h|3 + arch/x86/include/asm/vmx.h |4 + arch/x86/kernel/entry_64.S |2 + arch/x86/kernel/irq.c | 25 +++ arch/x86/kernel/irqinit.c |2 + arch/x86/kvm/lapic.c | 16 +++- arch/x86/kvm/lapic.h |1 + arch/x86/kvm/vmx.c | 133 +--- 12 files changed, 180 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index 40afa00..7b0a29e 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -18,6 +18,7 @@ BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR) #endif BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) +BUILD_INTERRUPT(posted_intr_ipi, POSTED_INTR_VECTOR) /* * every pentium local APIC has two 'local interrupts', with a diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index eb92a6e..ee61af3 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -28,6 +28,7 @@ /* Interrupt handlers registered during init_IRQ */ extern void apic_timer_interrupt(void); extern void x86_platform_ipi(void); +extern void posted_intr_ipi(void); extern void error_interrupt(void); extern void irq_work_interrupt(void); diff --git a/arch/x86/include/asm/irq.h 
b/arch/x86/include/asm/irq.h index ba870bb..cff9933 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -30,6 +30,7 @@ extern void irq_force_complete_move(int); #endif extern void (*x86_platform_ipi_callback)(void); +extern void (*posted_intr_callback)(void); extern void native_init_IRQ(void); extern bool handle_irq(unsigned irq, struct pt_regs *regs); diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 1508e51..8f2e383 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -102,6 +102,10 @@ */ #define X86_PLATFORM_IPI_VECTOR 0xf7 +#ifdef CONFIG_HAVE_KVM Users of POSTED_INTR_VECTOR are not under ifdef, which means compilation will fails with kvm disabled. Test it please. +#define POSTED_INTR_VECTOR 0xf2 +#endif + /* * IRQ work vector: */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7e26d1a..82423a8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -700,6 +700,9 @@ struct kvm_x86_ops { int (*has_virtual_interrupt_delivery)(struct kvm_vcpu *vcpu); void (*update_irq)(struct kvm_vcpu *vcpu); void (*update_eoi_exitmap)(struct kvm_vcpu *vcpu, int vector, bool set); + int (*has_posted_interrupt)(struct kvm_vcpu *vcpu); + int (*send_nv)(struct kvm_vcpu *vcpu, int vector); + void (*update_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*get_tdp_level)(void); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 1003341..7b9e1d0 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -152,6 +152,7 @@ #define PIN_BASED_EXT_INTR_MASK 0x0001 #define PIN_BASED_NMI_EXITING 0x0008 #define PIN_BASED_VIRTUAL_NMIS 0x0020 +#define PIN_BASED_POSTED_INTR 0x0080 #define VM_EXIT_SAVE_DEBUG_CONTROLS 0x0002 #define VM_EXIT_HOST_ADDR_SPACE_SIZE0x0200 @@ -174,6 +175,7 @@ 
/* VMCS Encodings */ enum vmcs_field { VIRTUAL_PROCESSOR_ID= 0x, + POSTED_INTR_NV = 0x0002, GUEST_ES_SELECTOR = 0x0800, GUEST_CS_SELECTOR = 0x0802, GUEST_SS_SELECTOR = 0x0804, @@ -208,6 +210,8 @@ enum vmcs_field { VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x2013, APIC_ACCESS_ADDR= 0x2014,
Re: [PATCH 2/2] x86, apicv: Add Posted Interrupt supporting
On Wed, Jan 30, 2013 at 09:03:11PM -0200, Marcelo Tosatti wrote: Posted interrupt patch: 2) Must move IN_GUEST_MODE assignment after local_irq_disable, in vcpu_enter_guest function. Otherwise: cpu0 vcpu1-cpu1 vcpu->mode = IN_GUEST_MODE if IN_GUEST_MODE == true send IPI local_irq_disable PIR not transferred to VIRR, misses interrupt. cpu0 will set KVM_REQ_EVENT, so vmentry will be aborted after local_irq_disable() by ->requests check. 3) Must check outstanding PIR notification bit unconditionally on every VM-entry, because: 1. local_irq_disable 2. vcpu->mode = IN_GUEST_MODE 3. vmenter 4. vmexit 5. vcpu->mode = OUTSIDE_GUEST_MODE If PIR-IPI-interrupt is sent between an event which triggers VM-exit (for example, an external interrupt due to a device), and step 5 (assignment of vcpu->mode), the PIR->VIRR transfer before vmentry must be made. Not sure I understand, but I think KVM_REQ_EVENT will cover that too. 4) Today, an interrupt notification is cached on IRR until it's delivered - further interrupt injection is not generating further interrupt notification bits. With PIR, behaviour changes: It's possible to have one bit in PIR and another on IRR APIC page (if timing is right). Is this harmless? Why? -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/2] KVM: set_memory_region: Cleanup and new restriction
On Wed, Jan 30, 2013 at 07:38:37PM +0900, Takuya Yoshikawa wrote: Patch 1: just rebased for this series. Patch 2: an API change, so please let me know if you notice any problems. Takuya Yoshikawa (2): KVM: set_memory_region: Identify the requested change explicitly KVM: set_memory_region: Disallow changing read-only attribute later Documentation/virtual/kvm/api.txt | 12 ++-- virt/kvm/kvm_main.c | 95 + 2 files changed, 60 insertions(+), 47 deletions(-) Reviewed-by: Gleb Natapov g...@redhat.com -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] KVM call minutes 2013-01-29 - Port I/O
On Wed, Jan 30, 2013 at 04:28:30PM -0700, Alex Williamson wrote: On Thu, 2013-01-31 at 10:02 +1100, Benjamin Herrenschmidt wrote: On Thu, 2013-01-31 at 00:49 +0200, Michael S. Tsirkin wrote: In practice they do (VGA at least) From a SW modelling standpoint, I don't think it's worth differentiating PCI and PCIE. Cheers, Ben. Interesting. Do you have such hardware? Could you please dump the output of lspci -vv? Any ATI or nVidia card still supports hard decoding of VGA regions for the sake of legacy operating systems and BIOSes :-) I don't know about Intel but I suppose it's the same. For example: -[:00]-+-00.0 Advanced Micro Devices [AMD] nee ATI RD890 PCI to PCI bridge (external gfx0 p +-04.0-[02]--+-00.0 Advanced Micro Devices [AMD] nee ATI Cedar PRO [Radeon HD 5450/6350] 00:04.0 PCI bridge: Advanced Micro Devices [AMD] nee ATI RD890 PCI to PCI bridge (PCI express gpp port D) (prog-if 00 [Normal decode]) Control: I/O+ Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR- FastB2B- DisINTx- Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast TAbort- TAbort- MAbort- SERR- PERR- INTx- Latency: 0, Cache Line Size: 64 bytes Bus: primary=00, secondary=02, subordinate=02, sec-latency=0 I/O behind bridge: c000-cfff Memory behind bridge: fd10-fd1f Prefetchable memory behind bridge: d000-dfff Secondary status: 66MHz- FastB2B- ParErr- DEVSEL=fast TAbort- TAbort- MAbort+ SERR- PERR- BridgeCtl: Parity- SERR- NoISA- VGA+ MAbort- Reset- FastB2B- VGA+ (VGA Enable) indicates positive decode of 0x3b0 - 0x3bb, 0x3c0 - 0x3df, and 0xa - 0xbfff. Device 2:00.0 of course doesn't report these ISA ranges as they're implicit in the VGA class code. OK but this appears behind a bridge. So the bridge configuration tells the root complex where to send accesses to the VGA. But qemu currently puts devices directly on root bus. And as far as I can tell when we present devices directly on bus 0, we pretend these are integrated in the root complex. 
The spec seems to say explicitly that root complex integrated devices should not use legacy addresses or support hotplug. So I would be surprised if such one appears in real world. Luckily guests do not seem to be worried as long as we use ACPI. BTW, I've been working on vfio-pci support of VGA assignment which makes use of the VGA arbiter in the host to manipulate the VGA Enable control register, allowing us to select which device to access. The qemu side is simply registering memory regions for the VGA areas and expecting to be used with -vga none, but I'll adopt whatever strategy we choose for hard coded address range support. Current base patches at the links below. Thanks, Alex https://github.com/awilliam/qemu-vfio/commit/ea2befa59010a429dcf13c10dbccdf8b64e82fbd https://github.com/awilliam/linux-vfio/commit/bae182d929229cbf1eaeb01e5fad4f77f81a4c61 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [kvmarm] [RFC v5 7/8] hw/kvm/arm_gic: Implement support for KVM in-kernel ARM GIC
Am 31.01.2013 11:52, schrieb KONRAD Frédéric: On 24/01/2013 16:43, Peter Maydell wrote: Implement support for using the KVM in-kernel GIC for ARM. Signed-off-by: Peter Maydell peter.mayd...@linaro.org --- hw/a15mpcore.c |8 ++- hw/arm/Makefile.objs |1 + hw/kvm/arm_gic.c | 169 ++ 3 files changed, 177 insertions(+), 1 deletion(-) create mode 100644 hw/kvm/arm_gic.c diff --git a/hw/a15mpcore.c b/hw/a15mpcore.c index fe6c34c..1ca6f28 100644 --- a/hw/a15mpcore.c +++ b/hw/a15mpcore.c @@ -19,6 +19,7 @@ */ #include sysbus.h +#include sysemu/kvm.h /* A15MP private memory region. */ @@ -40,8 +41,13 @@ static int a15mp_priv_init(SysBusDevice *dev) { A15MPPrivState *s = FROM_SYSBUS(A15MPPrivState, dev); SysBusDevice *busdev; +const char *gictype = arm-gic; s/arm-gic/arm_gic/ ^^ ? Christoffer and I had trouble with that: qemu-system-arm: Unknown device 'arm-gic' for default sysbus Since you already ran into issues here, even better would be to use a TYPE_ARM_GIC constant or so. Andreas Fred -s-gic = qdev_create(NULL, arm_gic); +if (kvm_irqchip_in_kernel()) { +gictype = kvm-arm-gic; +} + +s-gic = qdev_create(NULL, gictype); qdev_prop_set_uint32(s-gic, num-cpu, s-num_cpu); qdev_prop_set_uint32(s-gic, num-irq, s-num_irq); -- SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer; HRB 16746 AG Nürnberg -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [kvmarm] [RFC v5 7/8] hw/kvm/arm_gic: Implement support for KVM in-kernel ARM GIC
On 24/01/2013 16:43, Peter Maydell wrote: Implement support for using the KVM in-kernel GIC for ARM. Signed-off-by: Peter Maydell peter.mayd...@linaro.org --- hw/a15mpcore.c |8 ++- hw/arm/Makefile.objs |1 + hw/kvm/arm_gic.c | 169 ++ 3 files changed, 177 insertions(+), 1 deletion(-) create mode 100644 hw/kvm/arm_gic.c diff --git a/hw/a15mpcore.c b/hw/a15mpcore.c index fe6c34c..1ca6f28 100644 --- a/hw/a15mpcore.c +++ b/hw/a15mpcore.c @@ -19,6 +19,7 @@ */ #include sysbus.h +#include sysemu/kvm.h /* A15MP private memory region. */ @@ -40,8 +41,13 @@ static int a15mp_priv_init(SysBusDevice *dev) { A15MPPrivState *s = FROM_SYSBUS(A15MPPrivState, dev); SysBusDevice *busdev; +const char *gictype = arm-gic; s/arm-gic/arm_gic/ ^^ ? Christoffer and I had trouble with that: qemu-system-arm: Unknown device 'arm-gic' for default sysbus Fred -s-gic = qdev_create(NULL, arm_gic); +if (kvm_irqchip_in_kernel()) { +gictype = kvm-arm-gic; +} + +s-gic = qdev_create(NULL, gictype); qdev_prop_set_uint32(s-gic, num-cpu, s-num_cpu); qdev_prop_set_uint32(s-gic, num-irq, s-num_irq); -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] tcm_vhost: Multi-target support
On Thu, Jan 31, 2013 at 05:28:21PM +0800, Asias He wrote: Hello Nicholas, On 01/31/2013 03:33 PM, Asias He wrote: In order to take advantages of Paolo's multi-queue virito-scsi, we need multi-target support in tcm_vhost first. Otherwise all the requests go to one queue and other queues are idle. This patch makes: 1. All the targets under the wwpn is seen and can be used by guest. 2. No need to pass the tpgt number in struct vhost_scsi_target to tcm_vhost.ko. Only wwpn is needed. 3. We can always pass max_target = 255 to guest now, since we abort the request who's target id does not exist. Signed-off-by: Asias He as...@redhat.com --- drivers/vhost/tcm_vhost.c | 115 -- drivers/vhost/tcm_vhost.h | 4 +- 2 files changed, 74 insertions(+), 45 deletions(-) diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/tcm_vhost.c index 218deb6..d50cb95 100644 --- a/drivers/vhost/tcm_vhost.c +++ b/drivers/vhost/tcm_vhost.c @@ -59,13 +59,18 @@ enum { VHOST_SCSI_VQ_IO = 2, }; +#define VHOST_SCSI_MAX_TARGET 256 + struct vhost_scsi { - struct tcm_vhost_tpg *vs_tpg; /* Protected by vhost_scsi-dev.mutex */ + /* Protected by vhost_scsi-dev.mutex */ + struct tcm_vhost_tpg *vs_tpg[VHOST_SCSI_MAX_TARGET]; struct vhost_dev dev; struct vhost_virtqueue vqs[3]; struct vhost_work vs_completion_work; /* cmd completion work item */ struct llist_head vs_completion_list; /* cmd completion queue */ + char vs_vhost_wwpn[TRANSPORT_IQN_LEN]; + int vs_num_target; }; /* Local pointer to allocated TCM configfs fabric module */ @@ -564,13 +569,7 @@ static void vhost_scsi_handle_vq(struct vhost_scsi *vs) u32 exp_data_len, data_first, data_num, data_direction; unsigned out, in, i; int head, ret; - - /* Must use ioctl VHOST_SCSI_SET_ENDPOINT */ - tv_tpg = vs-vs_tpg; - if (unlikely(!tv_tpg)) { - pr_err(%s endpoint not set\n, __func__); - return; - } + u8 target; mutex_lock(vq-mutex); vhost_disable_notify(vs-dev, vq); @@ -637,6 +636,35 @@ static void vhost_scsi_handle_vq(struct vhost_scsi *vs) break; } + /* 
Extract the tpgt */ + target = v_req.lun[1]; + + /* Target does not exit, fail the request */ + if (unlikely(target = vs-vs_num_target)) { + struct virtio_scsi_cmd_resp __user *resp; + struct virtio_scsi_cmd_resp rsp; + + memset(rsp, 0, sizeof(rsp)); + rsp.response = VIRTIO_SCSI_S_BAD_TARGET; + resp = vq-iov[out].iov_base; + ret = copy_to_user(resp, rsp, sizeof(rsp)); + if (!ret) + vhost_add_used_and_signal(vs-dev, + vs-vqs[2], head, 0); + else + pr_err(Faulted on virtio_scsi_cmd_resp\n); + + continue; + } + + tv_tpg = vs-vs_tpg[target]; + if (unlikely(!tv_tpg)) { + /* Must use ioctl VHOST_SCSI_SET_ENDPOINT */ + pr_err(endpoint not set, target = %d\n, target); + vhost_discard_vq_desc(vq, 1); + break; + } + exp_data_len = 0; for (i = 0; i data_num; i++) exp_data_len += vq-iov[data_first + i].iov_len; @@ -771,14 +799,11 @@ static int vhost_scsi_set_endpoint( } tv_tport = tv_tpg-tport; - if (!strcmp(tv_tport-tport_name, t-vhost_wwpn) - (tv_tpg-tport_tpgt == t-vhost_tpgt)) { + if (!strcmp(tv_tport-tport_name, t-vhost_wwpn)) { tv_tpg-tv_tpg_vhost_count++; - mutex_unlock(tv_tpg-tv_tpg_mutex); - mutex_unlock(tcm_vhost_mutex); mutex_lock(vs-dev.mutex); - if (vs-vs_tpg) { + if (vs-vs_tpg[tv_tpg-tport_tpgt - 1]) { mutex_unlock(vs-dev.mutex); mutex_lock(tv_tpg-tv_tpg_mutex); tv_tpg-tv_tpg_vhost_count--; @@ -786,15 +811,17 @@ static int vhost_scsi_set_endpoint( return -EEXIST; } - vs-vs_tpg = tv_tpg; + vs-vs_tpg[tv_tpg-tport_tpgt - 1] = tv_tpg; tv_tpg-tport_tpgt starts from 0, right? I thought it starts from 1, because I always got it starts from 1 in targetcli. o- vhost o- naa.6001405bd4e8476d o- tpg1 o- luns o- lun0 o- tpg2 o- luns o- lun0 o- tpg3 o- luns o-
Re: [kvmarm] [RFC v5 7/8] hw/kvm/arm_gic: Implement support for KVM in-kernel ARM GIC
On 31 January 2013 10:54, Andreas Färber afaer...@suse.de wrote: Am 31.01.2013 11:52, schrieb KONRAD Frédéric: +const char *gictype = "arm-gic"; s/arm-gic/arm_gic/ ^^ ? Christoffer and I had trouble with that: qemu-system-arm: Unknown device 'arm-gic' for default sysbus Oops, nice catch. Since you already ran into issues here, even better would be to use a TYPE_ARM_GIC constant or so. Hmm, I kind of agree, but QOM idiom doesn't seem to encourage having that define be publicly visible. Should we have a hw/my_device.h [with the public bits like the TYPE_ and FOO_CLASS/FOO_GET_CLASS macros] for every type? (and a hw/my_device_priv.h if needed) -- PMM -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 8/8] KVM:PPC:booke: Allow debug interrupt injection to guest
On 30.01.2013, at 12:12, Bhushan Bharat-R65777 wrote: -Original Message- From: kvm-ppc-ow...@vger.kernel.org [mailto:kvm-ppc-ow...@vger.kernel.org] On Behalf Of Alexander Graf Sent: Friday, January 25, 2013 5:44 PM To: Bhushan Bharat-R65777 Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Bhushan Bharat-R65777 Subject: Re: [PATCH 8/8] KVM:PPC:booke: Allow debug interrupt injection to guest On 16.01.2013, at 09:24, Bharat Bhushan wrote: Allow userspace to inject debug interrupt to guest. QEMU can s/QEMU/user space. inject the debug interrupt to guest if it is not able to handle the debug interrupt. Signed-off-by: Bharat Bhushan bharat.bhus...@freescale.com --- arch/powerpc/kvm/booke.c | 32 +++- arch/powerpc/kvm/e500mc.c | 10 +- 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index faa0a0b..547797f 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -133,6 +133,13 @@ static void kvmppc_vcpu_sync_fpu(struct kvm_vcpu *vcpu) #endif } +#ifdef CONFIG_KVM_BOOKE_HV +static int kvmppc_core_pending_debug(struct kvm_vcpu *vcpu) { + return test_bit(BOOKE_IRQPRIO_DEBUG, +vcpu-arch.pending_exceptions); } #endif + /* * Helper function for full MSR writes. No need to call this if only * EE/CE/ME/DE/RI are changing. @@ -144,7 +151,11 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr) #ifdef CONFIG_KVM_BOOKE_HV new_msr |= MSR_GS; - if (vcpu-guest_debug) + /* +* Set MSR_DE if the hardware debug resources are owned by user-space +* and there is no debug interrupt pending for guest to handle. Why? QEMU is using the IAC/DAC registers to set hardware breakpoint/watchpoints via debug ioctls. As debug events are enabled/gated by MSR_DE so somehow we need to set MSR_DE on hardware MSR when guest is running in this case. Reading this 5 times I still have no idea what you're really checking for here. Maybe the naming for kvmppc_core_pending_debug is just unnatural? What does that function do really? 
On bookehv this is how I am controlling the MSR_DE in hardware MSR. And why is this whole thing only executed on HV? On e500v2 we always enable MSR_DE using vcpu-arch.shadow_msr in e500.c #ifndef CONFIG_KVM_BOOKE_HV - vcpu-arch.shadow_msr = MSR_USER | MSR_IS | MSR_DS; + vcpu-arch.shadow_msr = MSR_USER | MSR_DE | MSR_IS | MSR_DS; Why? How is e500v2 any different wrt debug? And why wouldn't that work for e500mc? Alex -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 3/8] KVM: PPC: booke: Added debug handler
On 30.01.2013, at 12:30, Bhushan Bharat-R65777 wrote: -Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Friday, January 25, 2013 5:13 PM To: Bhushan Bharat-R65777 Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Bhushan Bharat-R65777 Subject: Re: [PATCH 3/8] KVM: PPC: booke: Added debug handler On 16.01.2013, at 09:24, Bharat Bhushan wrote: From: Bharat Bhushan bharat.bhus...@freescale.com Installed debug handler will be used for guest debug support and debug facility emulation features (patches for these features will follow this patch). Signed-off-by: Liu Yu yu@freescale.com [bharat.bhus...@freescale.com: Substantial changes] Signed-off-by: Bharat Bhushan bharat.bhus...@freescale.com --- arch/powerpc/include/asm/kvm_host.h |1 + arch/powerpc/kernel/asm-offsets.c |1 + arch/powerpc/kvm/booke_interrupts.S | 49 ++- 3 files changed, 44 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 8a72d59..f4ba881 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -503,6 +503,7 @@ struct kvm_vcpu_arch { u32 tlbcfg[4]; u32 mmucfg; u32 epr; + u32 crit_save; struct kvmppc_booke_debug_reg dbg_reg; #endif gpa_t paddr_accessed; diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 46f6afd..02048f3 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -562,6 +562,7 @@ int main(void) DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst)); DEFINE(VCPU_FAULT_DEAR, offsetof(struct kvm_vcpu, arch.fault_dear)); DEFINE(VCPU_FAULT_ESR, offsetof(struct kvm_vcpu, arch.fault_esr)); + DEFINE(VCPU_CRIT_SAVE, offsetof(struct kvm_vcpu, arch.crit_save)); #endif /* CONFIG_PPC_BOOK3S */ #endif /* CONFIG_KVM */ diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S index eae8483..dd9c5d4 100644 --- a/arch/powerpc/kvm/booke_interrupts.S +++ 
b/arch/powerpc/kvm/booke_interrupts.S @@ -52,12 +52,7 @@ (1BOOKE_INTERRUPT_PROGRAM) | \ (1BOOKE_INTERRUPT_DTLB_MISS)) -.macro KVM_HANDLER ivor_nr scratch srr0 -_GLOBAL(kvmppc_handler_\ivor_nr) - /* Get pointer to vcpu and record exit number. */ - mtspr \scratch , r4 - mfspr r4, SPRN_SPRG_THREAD - lwz r4, THREAD_KVM_VCPU(r4) +.macro __KVM_HANDLER ivor_nr scratch srr0 stw r3, VCPU_GPR(R3)(r4) stw r5, VCPU_GPR(R5)(r4) stw r6, VCPU_GPR(R6)(r4) @@ -74,6 +69,46 @@ _GLOBAL(kvmppc_handler_\ivor_nr) bctr .endm +.macro KVM_HANDLER ivor_nr scratch srr0 +_GLOBAL(kvmppc_handler_\ivor_nr) + /* Get pointer to vcpu and record exit number. */ + mtspr \scratch , r4 + mfspr r4, SPRN_SPRG_THREAD + lwz r4, THREAD_KVM_VCPU(r4) + __KVM_HANDLER \ivor_nr \scratch \srr0 .endm + +.macro KVM_DBG_HANDLER ivor_nr scratch srr0 +_GLOBAL(kvmppc_handler_\ivor_nr) + mtspr \scratch, r4 + mfspr r4, SPRN_SPRG_THREAD + lwz r4, THREAD_KVM_VCPU(r4) + stw r3, VCPU_CRIT_SAVE(r4) + mfcrr3 + mfspr r4, SPRN_CSRR1 + andi. r4, r4, MSR_PR + bne 1f + /* debug interrupt happened in enter/exit path */ + mfspr r4, SPRN_CSRR1 + rlwinm r4, r4, 0, ~MSR_DE + mtspr SPRN_CSRR1, r4 + lis r4, 0x + ori r4, r4, 0x + mtspr SPRN_DBSR, r4 + mfspr r4, SPRN_SPRG_THREAD + lwz r4, THREAD_KVM_VCPU(r4) + mtcrr3 + lwz r3, VCPU_CRIT_SAVE(r4) + mfspr r4, \scratch + rfci What is this part doing? Try to ignore the debug exit? As BOOKE doesn't have hardware support for virtualization, hardware never know current pc is in guest or in host. So when enable hardware single step for guest, it cannot be disabled at the time guest exit. Thus, we'll see that an single step interrupt happens at the beginning of guest exit path. With the above code we recognize this kind of single step interrupt disable single step and rfci. Why would we have MSR_DE enabled in the first place when we can't handle it? When QEMU is using hardware debug resource then we always set MSR_DE during guest is running. Right, but why is MSR_DE enabled during the exit path? 
If MSR_DE wasn't set, you wouldn't get a single step exit. During the exit code path, you could then swap DBSR back to what the host expects (which means no single step). Only after that enable MSR_DE again. +1: /* debug interrupt happened in guest */ + mtcrr3 + mfspr r4, SPRN_SPRG_THREAD + lwz r4, THREAD_KVM_VCPU(r4) + lwz r3, VCPU_CRIT_SAVE(r4) + __KVM_HANDLER \ivor_nr \scratch \srr0 I don't think you need the __KVM_HANDLER split. This should be quite easily refactorable into a simple DBG prolog. Can you please
Re: [Qemu-devel] QEMU buildbot maintenance state
On Wed, Jan 30, 2013 at 10:31:22AM +0100, Gerd Hoffmann wrote: Hi, Gerd: Are you willing to co-maintain the QEMU buildmaster with Daniel and Christian? It would be awesome if you could do this given your experience running and customizing buildbot. I'll try to set aside some time for that. Christian's idea to host the config at github is good; that certainly makes it easier to balance things across more people. Another thing which would be helpful: Any chance we can set up a maintainer tree mirror @ git.qemu.org? A single repository where each maintainer tree shows up as a branch? This would make the buildbot setup *a lot* easier. We can go for an AnyBranchScheduler then with BuildFactory and BuildConfig shared, instead of needing one BuildFactory and BuildConfig per branch. Also makes the buildbot web interface less cluttered as we don't have an insane amount of BuildConfigs any more. And saves some resources (bandwidth + diskspace) for the buildslaves. I think for people who want to see what is coming or who want to test what is cooking, it would be a nice service too if they had a one-stop shop where they can get everything. I sent a pull request that makes the BuildFactory definitions simpler using a single create_build_factory() function: https://github.com/b1-systems/buildbot/pull/1 Keep in mind that BuildFactories differ not just by repo/branch but also: * in-tree or out-of-tree * extra ./configure arguments * gmake instead of make I think this means it is not as simple as defining a single BuildFactory. Stefan -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] QEMU buildbot maintenance state
On Wed, Jan 30, 2013 at 10:31:22AM +0100, Gerd Hoffmann wrote: Hi, Gerd: Are you willing to co-maintain the QEMU buildmaster with Daniel and Christian? It would be awesome if you could do this given your experience running and customizing buildbot. I'll try to set aside some time for that. Excellent, thank you! Stefan -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] QEMU buildbot maintenance state
On 01/31/2013 01:54 PM, Stefan Hajnoczi wrote: I sent a pull request that makes the BuildFactory definitions simpler using a single create_build_factory() function: https://github.com/b1-systems/buildbot/pull/1 Stefan, I'll have a look later today. Christian. -- Christian Berendt Tel.: +49-171-5542175 Mail: bere...@b1-systems.de B1 Systems GmbH Osterfeldstraße 7 / 85088 Vohburg / http://www.b1-systems.de GF: Ralph Dehner / Unternehmenssitz: Vohburg / AG: Ingolstadt,HRB 3537 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 5/8] KVM: PPC: debug stub interface parameter defined
On 30.01.2013, at 15:15, Bhushan Bharat-R65777 wrote: -Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Friday, January 25, 2013 5:24 PM To: Bhushan Bharat-R65777 Cc: Paul Mackerras; kvm-...@vger.kernel.org; kvm@vger.kernel.org Subject: Re: [PATCH 5/8] KVM: PPC: debug stub interface parameter defined On 17.01.2013, at 12:11, Bhushan Bharat-R65777 wrote: -Original Message- From: Paul Mackerras [mailto:pau...@samba.org] Sent: Thursday, January 17, 2013 12:53 PM To: Bhushan Bharat-R65777 Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; ag...@suse.de; Bhushan Bharat- R65777 Subject: Re: [PATCH 5/8] KVM: PPC: debug stub interface parameter defined On Wed, Jan 16, 2013 at 01:54:42PM +0530, Bharat Bhushan wrote: This patch defines the interface parameter for KVM_SET_GUEST_DEBUG ioctl support. Follow up patches will use this for setting up hardware breakpoints, watchpoints and software breakpoints. [snip] diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 453a10f..7d5a51c 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -1483,6 +1483,12 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) return r; } +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg) +{ + return -EINVAL; +} + int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { return -ENOTSUPP; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 934413c..4c94ca9 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -532,12 +532,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) #endif } -int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, -struct kvm_guest_debug *dbg) -{ - return -EINVAL; -} - This will break the build for non-book E machines, since kvm_arch_vcpu_ioctl_set_guest_debug() is referenced from generic code. You need to add it to arch/powerpc/kvm/book3s.c as well. right, I will correct this. 
Would the implementation actually be different on booke vs book3s? My feeling is that powerpc.c is actually the right place for this. I am not sure there will be anything common between book3s and booke. Should we define the cpu specific function something like kvm_ppc_vcpu_ioctl_set_guest_debug() for booke and book3s and call this new defined function from kvm_arch_vcpu_ioctl_set_guest_debug() in powerpc.c ? No, just put it into the subarch directories then :). No need to overengineer anything for now. Alex -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/5] KVM: PPC: e500: Move VCPU's MMUCFG register initialization earlier
On 30.01.2013, at 14:29, Mihai Caraman wrote: VCPU's MMUCFG register initialization should not depend on KVM_CAP_SW_TLB ioctl call. Move it earlier into tlb initialization phase. Quite the contrary. The fact that there is an mfspr() in e500_mmu.c already tells us that the code is broken. The TLB guest code should only depend on input from the SW_TLB configuration. It's completely orthogonal to the host capabilities. Alex Signed-off-by: Mihai Caraman mihai.cara...@freescale.com --- arch/powerpc/kvm/e500_mmu.c |4 ++-- 1 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c index 5c44759..bb1b2b0 100644 --- a/arch/powerpc/kvm/e500_mmu.c +++ b/arch/powerpc/kvm/e500_mmu.c @@ -692,8 +692,6 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, vcpu_e500-gtlb_offset[0] = 0; vcpu_e500-gtlb_offset[1] = params.tlb_sizes[0]; - vcpu-arch.mmucfg = mfspr(SPRN_MMUCFG) ~MMUCFG_LPIDSIZE; - vcpu-arch.tlbcfg[0] = ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); if (params.tlb_sizes[0] = 2048) vcpu-arch.tlbcfg[0] |= params.tlb_sizes[0]; @@ -781,6 +779,8 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) if (!vcpu_e500-g2h_tlb1_map) goto err; + vcpu-arch.mmucfg = mfspr(SPRN_MMUCFG) ~MMUCFG_LPIDSIZE; + /* Init TLB configuration register */ vcpu-arch.tlbcfg[0] = mfspr(SPRN_TLB0CFG) ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); -- 1.7.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/5] KVM: PPC: e500: Emulate TLBnPS registers
On 30.01.2013, at 14:29, Mihai Caraman wrote: Emulate TLBnPS registers which are available in MMU Architecture Version (MAV) 2.0. Signed-off-by: Mihai Caraman mihai.cara...@freescale.com --- arch/powerpc/include/asm/kvm_host.h |1 + arch/powerpc/kvm/e500.h |5 + arch/powerpc/kvm/e500_emulate.c | 10 ++ arch/powerpc/kvm/e500_mmu.c |5 + 4 files changed, 21 insertions(+), 0 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 8a72d59..88fcfe6 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -501,6 +501,7 @@ struct kvm_vcpu_arch { spinlock_t wdt_lock; struct timer_list wdt_timer; u32 tlbcfg[4]; + u32 tlbps[4]; u32 mmucfg; u32 epr; struct kvmppc_booke_debug_reg dbg_reg; diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h index 41cefd4..b9f76d8 100644 --- a/arch/powerpc/kvm/e500.h +++ b/arch/powerpc/kvm/e500.h @@ -303,4 +303,9 @@ static inline unsigned int get_tlbmiss_tid(struct kvm_vcpu *vcpu) #define get_tlb_sts(gtlbe) (MAS1_TS) #endif /* !BOOKE_HV */ +static inline unsigned int has_mmu_v2(const struct kvm_vcpu *vcpu) bool. Also rename it to is_... then. 
+{ + return ((vcpu-arch.mmucfg MMUCFG_MAVN) == MMUCFG_MAVN_V2); +} + #endif /* KVM_E500_H */ diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c index e78f353..5515dc5 100644 --- a/arch/powerpc/kvm/e500_emulate.c +++ b/arch/powerpc/kvm/e500_emulate.c @@ -329,6 +329,16 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) *spr_val = vcpu-arch.ivor[BOOKE_IRQPRIO_DBELL_CRIT]; break; #endif + case SPRN_TLB0PS: + if (!has_mmu_v2(vcpu)) + return EMULATE_FAIL; + *spr_val = vcpu-arch.tlbps[0]; + break; + case SPRN_TLB1PS: + if (!has_mmu_v2(vcpu)) + return EMULATE_FAIL; + *spr_val = vcpu-arch.tlbps[1]; + break; default: emulated = kvmppc_booke_emulate_mfspr(vcpu, sprn, spr_val); } diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c index bb1b2b0..129299a 100644 --- a/arch/powerpc/kvm/e500_mmu.c +++ b/arch/powerpc/kvm/e500_mmu.c @@ -794,6 +794,11 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) vcpu-arch.tlbcfg[1] |= vcpu_e500-gtlb_params[1].ways TLBnCFG_ASSOC_SHIFT; + if (has_mmu_v2(vcpu)) { + vcpu-arch.tlbps[0] = mfspr(SPRN_TLB0PS); + vcpu-arch.tlbps[1] = mfspr(SPRN_TLB1PS); So I suppose that means that user space doesn't tell us the possible TLB entry sizes through the SW_TLB config? Then we should add them there. To not break untested code paths, we can still compare if the values user space asks for are identical to what physical hardware does. But eventually we shouldn't care. Alex -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 3/5] KVM: PPC: e500: Remove E.PT category from VCPUs
On 30.01.2013, at 14:29, Mihai Caraman wrote: Embedded.Page Table (E.PT) category in VMs requires indirect tlb entries emulation which is not supported yet. Configure TLBnCFG to remove E.PT category from VCPUs. Signed-off-by: Mihai Caraman mihai.cara...@freescale.com Please do this in a separate function that you call from these locations. That way the code is self-documenting on what it actually does. Also add a comment to this one function that removes E.PT related bits from TLBCFG that our _guest_ mmu emulation currently doesn't handle E.PT. Alex --- arch/powerpc/kvm/e500_mmu.c | 10 ++ 1 files changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c index 129299a..9a1f7b7 100644 --- a/arch/powerpc/kvm/e500_mmu.c +++ b/arch/powerpc/kvm/e500_mmu.c @@ -692,12 +692,14 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, vcpu_e500-gtlb_offset[0] = 0; vcpu_e500-gtlb_offset[1] = params.tlb_sizes[0]; - vcpu-arch.tlbcfg[0] = ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); + vcpu-arch.tlbcfg[0] = + ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC | TLBnCFG_IND); if (params.tlb_sizes[0] = 2048) vcpu-arch.tlbcfg[0] |= params.tlb_sizes[0]; vcpu-arch.tlbcfg[0] |= params.tlb_ways[0] TLBnCFG_ASSOC_SHIFT; - vcpu-arch.tlbcfg[1] = ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); + vcpu-arch.tlbcfg[1] = + ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC | TLBnCFG_IND); vcpu-arch.tlbcfg[1] |= params.tlb_sizes[1]; vcpu-arch.tlbcfg[1] |= params.tlb_ways[1] TLBnCFG_ASSOC_SHIFT; @@ -783,13 +785,13 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) /* Init TLB configuration register */ vcpu-arch.tlbcfg[0] = mfspr(SPRN_TLB0CFG) - ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); + ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC | TLBnCFG_IND); vcpu-arch.tlbcfg[0] |= vcpu_e500-gtlb_params[0].entries; vcpu-arch.tlbcfg[0] |= vcpu_e500-gtlb_params[0].ways TLBnCFG_ASSOC_SHIFT; vcpu-arch.tlbcfg[1] = mfspr(SPRN_TLB1CFG) - ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); + ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC | 
TLBnCFG_IND); vcpu-arch.tlbcfg[1] |= vcpu_e500-gtlb_params[1].entries; vcpu-arch.tlbcfg[1] |= vcpu_e500-gtlb_params[1].ways TLBnCFG_ASSOC_SHIFT; -- 1.7.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 4/5] KVM: PPC: e500: Emulate EPTCFG register
On 30.01.2013, at 14:29, Mihai Caraman wrote: EPTCFG register defined by E.PT is accessed unconditionally by Linux guests in the presence of MAV 2.0. Emulate EPTCFG register now. Signed-off-by: Mihai Caraman mihai.cara...@freescale.com --- arch/powerpc/include/asm/kvm_host.h |1 + arch/powerpc/kvm/e500.h |6 ++ arch/powerpc/kvm/e500_emulate.c |9 + arch/powerpc/kvm/e500_mmu.c |5 + 4 files changed, 21 insertions(+), 0 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 88fcfe6..f480b20 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -503,6 +503,7 @@ struct kvm_vcpu_arch { u32 tlbcfg[4]; u32 tlbps[4]; u32 mmucfg; + u32 eptcfg; This too needs to be settable through SW_TLB. u32 epr; struct kvmppc_booke_debug_reg dbg_reg; #endif diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h index b9f76d8..983eb95 100644 --- a/arch/powerpc/kvm/e500.h +++ b/arch/powerpc/kvm/e500.h @@ -308,4 +308,10 @@ static inline unsigned int has_mmu_v2(const struct kvm_vcpu *vcpu) return ((vcpu-arch.mmucfg MMUCFG_MAVN) == MMUCFG_MAVN_V2); } +static inline unsigned int supports_page_tables(const struct kvm_vcpu *vcpu) bool again. Can we generalize this a bit more? How about a small framework that allows us to differentiate across e.XX features? if (has_feature(vcpu, FEATURE_E_PT)) ... 
+{ + return ((vcpu-arch.tlbcfg[0] TLBnCFG_IND) + || (vcpu-arch.tlbcfg[1] TLBnCFG_IND)); +} + #endif /* KVM_E500_H */ diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c index 5515dc5..493e231 100644 --- a/arch/powerpc/kvm/e500_emulate.c +++ b/arch/powerpc/kvm/e500_emulate.c @@ -339,6 +339,15 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) return EMULATE_FAIL; *spr_val = vcpu-arch.tlbps[1]; break; + case SPRN_EPTCFG: + if (!has_mmu_v2(vcpu)) + return EMULATE_FAIL; + /* + * Legacy Linux guests access EPTCFG register even if the E.PT + * category is disabled in the VM. Give them a chance to live. + */ + *spr_val = vcpu-arch.eptcfg; + break; default: emulated = kvmppc_booke_emulate_mfspr(vcpu, sprn, spr_val); } diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c index 9a1f7b7..199c11e 100644 --- a/arch/powerpc/kvm/e500_mmu.c +++ b/arch/powerpc/kvm/e500_mmu.c @@ -799,6 +799,11 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) if (has_mmu_v2(vcpu)) { vcpu-arch.tlbps[0] = mfspr(SPRN_TLB0PS); vcpu-arch.tlbps[1] = mfspr(SPRN_TLB1PS); + + if (supports_page_tables(vcpu)) + vcpu-arch.eptcfg = mfspr(SPRN_EPTCFG); Please don't introduce new mfspr()s here :). Just have user space set it. Alex + else + vcpu-arch.eptcfg = 0; } kvmppc_recalc_tlb1map_range(vcpu_e500); -- 1.7.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 5/5] KVM: PPC: e500mc: Enable e6500 cores
On 30.01.2013, at 14:29, Mihai Caraman wrote: Extend processor compatibility names to e6500 cores. Signed-off-by: Mihai Caraman mihai.cara...@freescale.com Looks good to me. Reviewed-by: Alexander Graf ag...@suse.de Alex --- arch/powerpc/kvm/e500mc.c |2 ++ 1 files changed, 2 insertions(+), 0 deletions(-) diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c index 1f89d26..6c87299 100644 --- a/arch/powerpc/kvm/e500mc.c +++ b/arch/powerpc/kvm/e500mc.c @@ -172,6 +172,8 @@ int kvmppc_core_check_processor_compat(void) r = 0; else if (strcmp(cur_cpu_spec-cpu_name, e5500) == 0) r = 0; + else if (strcmp(cur_cpu_spec-cpu_name, e6500) == 0) + r = 0; else r = -ENOTSUPP; -- 1.7.4.1 -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/5] KVM: PPC: e500: Emulate TLBnPS registers
On 31.01.2013, at 14:24, Alexander Graf wrote: On 30.01.2013, at 14:29, Mihai Caraman wrote: Emulate TLBnPS registers which are available in MMU Architecture Version (MAV) 2.0. Signed-off-by: Mihai Caraman mihai.cara...@freescale.com --- arch/powerpc/include/asm/kvm_host.h |1 + arch/powerpc/kvm/e500.h |5 + arch/powerpc/kvm/e500_emulate.c | 10 ++ arch/powerpc/kvm/e500_mmu.c |5 + 4 files changed, 21 insertions(+), 0 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 8a72d59..88fcfe6 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -501,6 +501,7 @@ struct kvm_vcpu_arch { spinlock_t wdt_lock; struct timer_list wdt_timer; u32 tlbcfg[4]; +u32 tlbps[4]; u32 mmucfg; u32 epr; struct kvmppc_booke_debug_reg dbg_reg; diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h index 41cefd4..b9f76d8 100644 --- a/arch/powerpc/kvm/e500.h +++ b/arch/powerpc/kvm/e500.h @@ -303,4 +303,9 @@ static inline unsigned int get_tlbmiss_tid(struct kvm_vcpu *vcpu) #define get_tlb_sts(gtlbe) (MAS1_TS) #endif /* !BOOKE_HV */ +static inline unsigned int has_mmu_v2(const struct kvm_vcpu *vcpu) bool. Also rename it to is_... then. In light of the comment I did in a later patch, this too could be convert to feature flags. Alex -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] x86, apicv: Add Posted Interrupt supporting
On Thu, Jan 31, 2013 at 11:43:48AM +0200, Gleb Natapov wrote: On Wed, Jan 30, 2013 at 09:03:11PM -0200, Marcelo Tosatti wrote: Posted interrupt patch: 2) Must move IN_GUEST_MODE assignment after local_irq_disable, in vcpu_enter_guest function. Otherwise: cpu0vcpu1-cpu1 vcpu-mode = IN_GUEST_MODE if IN_GUEST_MODE == true send IPI local_irq_disable PIR not transferred to VIRR, misses interrupt. cpu0 will set KVM_REQ_EVENT, so vmentry will be aborted after local_irq_disable() by -requests check. Yes, but you don't want KVM_REQ_EVENT+kick. It defeats the purpose of posted interrupts. You want if vcpu in guest mode send posted interrupt IPI else KVM_REQ_EVENT+kick 3) Must check outstanding PIR notification bit unconditionally on every VM-entry, because: 1. local_irq_disable 2. vcpu-mode = IN_GUEST_MODE 3. vmenter 4. vmexit 5. vcpu-mode = OUTSIDE_GUEST_MODE If PIR-IPI-interrupt is sent between an event which triggers VM-exit (for example, an external interrupt due to a device), and step 5 (assignment of vcpu-mode), the PIR-VIRR transfer before vmentry must be made. Not sure I understand, but I think KVM_REQ_EVENT will cover that too. See above. 4) Today, an interrupt notification is cached on IRR until its delivered - further interrupt injection is not generating further interrupt notification bits. With PIR, behaviour changes: Its possible to have one bit in PIR and another on IRR APIC page (if timing is right). Is this harmless? Why? -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] x86, apicv: Add Posted Interrupt supporting
On Thu, Jan 31, 2013 at 11:32:45AM -0200, Marcelo Tosatti wrote: On Thu, Jan 31, 2013 at 11:43:48AM +0200, Gleb Natapov wrote: On Wed, Jan 30, 2013 at 09:03:11PM -0200, Marcelo Tosatti wrote: Posted interrupt patch: 2) Must move IN_GUEST_MODE assignment after local_irq_disable, in vcpu_enter_guest function. Otherwise: cpu0 vcpu1-cpu1 vcpu-mode = IN_GUEST_MODE if IN_GUEST_MODE == true send IPI local_irq_disable PIR not transferred to VIRR, misses interrupt. cpu0 will set KVM_REQ_EVENT, so vmentry will be aborted after local_irq_disable() by -requests check. Yes, but you don't want KVM_REQ_EVENT+kick. It defeats the purpose of posted interrupts. You want if vcpu in guest mode send posted interrupt IPI else KVM_REQ_EVENT+kick I am thinking: set KVM_REQ_EVENT if pi is enabled send posted interrupt IPI else kick 3) Must check outstanding PIR notification bit unconditionally on every VM-entry, because: 1. local_irq_disable 2. vcpu-mode = IN_GUEST_MODE 3. vmenter 4. vmexit 5. vcpu-mode = OUTSIDE_GUEST_MODE If PIR-IPI-interrupt is sent between an event which triggers VM-exit (for example, an external interrupt due to a device), and step 5 (assignment of vcpu-mode), the PIR-VIRR transfer before vmentry must be made. Not sure I understand, but I think KVM_REQ_EVENT will cover that too. See above. 4) Today, an interrupt notification is cached on IRR until its delivered - further interrupt injection is not generating further interrupt notification bits. With PIR, behaviour changes: Its possible to have one bit in PIR and another on IRR APIC page (if timing is right). Is this harmless? Why? -- Gleb. -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] x86, apicv: Add Posted Interrupt supporting
On Thu, Jan 31, 2013 at 03:38:37PM +0200, Gleb Natapov wrote: On Thu, Jan 31, 2013 at 11:32:45AM -0200, Marcelo Tosatti wrote: On Thu, Jan 31, 2013 at 11:43:48AM +0200, Gleb Natapov wrote: On Wed, Jan 30, 2013 at 09:03:11PM -0200, Marcelo Tosatti wrote: Posted interrupt patch: 2) Must move IN_GUEST_MODE assignment after local_irq_disable, in vcpu_enter_guest function. Otherwise: cpu0vcpu1-cpu1 vcpu-mode = IN_GUEST_MODE if IN_GUEST_MODE == true send IPI local_irq_disable PIR not transferred to VIRR, misses interrupt. cpu0 will set KVM_REQ_EVENT, so vmentry will be aborted after local_irq_disable() by -requests check. Yes, but you don't want KVM_REQ_EVENT+kick. It defeats the purpose of posted interrupts. You want if vcpu in guest mode send posted interrupt IPI else KVM_REQ_EVENT+kick I am thinking: set KVM_REQ_EVENT if pi is enabled send posted interrupt IPI else kick KVM_REQ_EVENT must be after sending posted interrupt IPI. Otherwise on the vcpu entry side test_and_clear(KVM_REQ_EVENT) { No bits set in PIR } What about item 4 below? 3) Must check outstanding PIR notification bit unconditionally on every VM-entry, because: 1. local_irq_disable 2. vcpu-mode = IN_GUEST_MODE 3. vmenter 4. vmexit 5. vcpu-mode = OUTSIDE_GUEST_MODE If PIR-IPI-interrupt is sent between an event which triggers VM-exit (for example, an external interrupt due to a device), and step 5 (assignment of vcpu-mode), the PIR-VIRR transfer before vmentry must be made. Not sure I understand, but I think KVM_REQ_EVENT will cover that too. See above. 4) Today, an interrupt notification is cached on IRR until its delivered - further interrupt injection is not generating further interrupt notification bits. With PIR, behaviour changes: Its possible to have one bit in PIR and another on IRR APIC page (if timing is right). Is this harmless? Why? -- Gleb. -- Gleb. 
-- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] [PATCH V4 00/22] Multiqueue virtio-net
On 01/31/2013 12:00 AM, Jason Wang wrote: On 01/31/2013 02:29 AM, Eric Blake wrote: On 01/30/2013 04:12 AM, Jason Wang wrote: With this changes, user could start a multiqueue virtio-net device through ./qemu -netdev tap,id=hn0,queues=2,vhost=on -device virtio-net-pci,netdev=hn0 Management tools such as libvirt can pass multiple pre-created fds/vhostfds through ./qemu -netdev tap,id=hn0,fds=X:Y,vhostfds=M:N -device virtio-net-pci,netdev=hn0 Do we really need specific fds= parsing, or can we reuse the existing -add-fd command line option to our advantage? I guess what I'm asking is how hotplug will work; and if hotplug takes a file name, shouldn't the command line also take a name; and if the command line takes a name, what's wrong with: ./qemu -add-fd fdset=1,fd=X -add-fd fdset=2,fd=Y -add-fd fdset=3,fd=M -add-fd fdset=4,fd=N -netdev tap,id=hn0,fds=/dev/fdset/1:/dev/fdset/2,vhostfds=/dev/fdset/3:/dev/fdset/4 -device virtio-net-pci,netdev=hn0 AFAIK, tap does not support fdset now, so this requirement is beyond the scope of multiqueue itself. We can do this in the future. Btw does libvirt support add-fd now? Anything that uses qemu_open() supports fdset now. The question I'm asking is whether the command line has a way to pass /path/to/name (which can be presented as /dev/fdset/nnn for add-fd usage) now, or whether it only supports fds=integers. For hotplug, it just work if you pass multiple file descriptors one by one through getfd and then use fds=X:Y,vhostfds=M:N. For hotplug, you can't pass integers; you have to name the fds either way. Either you name it with getfd, or you name it with add-fd. But getfd is not as nice as add-fd when it comes to ensuring that fds are not leaked in qemu, even when the management app such as libvirt restarts. 
Furthermore, if it is possible to specify taps by pathname instead of by fd inheritance, then using getfd means you have to support two different approaches in QMP to distinguish which string is being supplied, while supporting add-fd means you only have to support qemu_open() which handles both direct names and fd passing in a single string interface. As for libvirt support of add-fd, I'm currently working with Stefan Berger to get patches applied; the goal is that libvirt 1.0.3 (end of February) will support add-fd. -- Eric Blake eblake redhat com+1-919-301-3266 Libvirt virtualization library http://libvirt.org signature.asc Description: OpenPGP digital signature
Re: [Qemu-devel] [PATCH V4 00/22] Multiqueue virtio-net
On Thu, Jan 31, 2013 at 06:44:49AM -0700, Eric Blake wrote: On 01/31/2013 12:00 AM, Jason Wang wrote: On 01/31/2013 02:29 AM, Eric Blake wrote: On 01/30/2013 04:12 AM, Jason Wang wrote: With this changes, user could start a multiqueue virtio-net device through ./qemu -netdev tap,id=hn0,queues=2,vhost=on -device virtio-net-pci,netdev=hn0 Management tools such as libvirt can pass multiple pre-created fds/vhostfds through ./qemu -netdev tap,id=hn0,fds=X:Y,vhostfds=M:N -device virtio-net-pci,netdev=hn0 Do we really need specific fds= parsing, or can we reuse the existing -add-fd command line option to our advantage? I guess what I'm asking is how hotplug will work; and if hotplug takes a file name, shouldn't the command line also take a name; and if the command line takes a name, what's wrong with: ./qemu -add-fd fdset=1,fd=X -add-fd fdset=2,fd=Y -add-fd fdset=3,fd=M -add-fd fdset=4,fd=N -netdev tap,id=hn0,fds=/dev/fdset/1:/dev/fdset/2,vhostfds=/dev/fdset/3:/dev/fdset/4 -device virtio-net-pci,netdev=hn0 AFAIK, tap does not support fdset now, so this requirement is beyond the scope of multiqueue itself. We can do this in the future. Btw does libvirt support add-fd now? Anything that uses qemu_open() supports fdset now. The question I'm asking is whether the command line has a way to pass /path/to/name (which can be presented as /dev/fdset/nnn for add-fd usage) now, or whether it only supports fds=integers. For hotplug, it just work if you pass multiple file descriptors one by one through getfd and then use fds=X:Y,vhostfds=M:N. For hotplug, you can't pass integers; you have to name the fds either way. Either you name it with getfd, or you name it with add-fd. But getfd is not as nice as add-fd when it comes to ensuring that fds are not leaked in qemu, even when the management app such as libvirt restarts. Furthermore, if it is possible to specify taps by pathname instead of by fd inheritance, I don't think there's a way to specify taps by pathname. 
then using getfd means you have to support two different approaches in QMP to distinguish which string is being supplied, while supporting add-fd means you only have to support qemu_open() which handles both direct names and fd passing in a single string interface. As for libvirt support of add-fd, I'm currently working with Stefan Berger to get patches applied; the goal is that libvirt 1.0.3 (end of February) will support add-fd. -- Eric Blake eblake redhat com+1-919-301-3266 Libvirt virtualization library http://libvirt.org -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/2] x86, apicv: Add Posted Interrupt supporting
On Thu, Jan 31, 2013 at 11:44:43AM -0200, Marcelo Tosatti wrote: On Thu, Jan 31, 2013 at 03:38:37PM +0200, Gleb Natapov wrote: On Thu, Jan 31, 2013 at 11:32:45AM -0200, Marcelo Tosatti wrote: On Thu, Jan 31, 2013 at 11:43:48AM +0200, Gleb Natapov wrote: On Wed, Jan 30, 2013 at 09:03:11PM -0200, Marcelo Tosatti wrote: Posted interrupt patch: 2) Must move IN_GUEST_MODE assignment after local_irq_disable, in vcpu_enter_guest function. Otherwise: cpu0 vcpu1-cpu1 vcpu-mode = IN_GUEST_MODE if IN_GUEST_MODE == true send IPI local_irq_disable PIR not transferred to VIRR, misses interrupt. cpu0 will set KVM_REQ_EVENT, so vmentry will be aborted after local_irq_disable() by -requests check. Yes, but you don't want KVM_REQ_EVENT+kick. It defeats the purpose of posted interrupts. You want if vcpu in guest mode send posted interrupt IPI else KVM_REQ_EVENT+kick I am thinking: set KVM_REQ_EVENT if pi is enabled send posted interrupt IPI else kick KVM_REQ_EVENT must be after sending posted interrupt IPI. Otherwise on the vcpu entry side test_and_clear(KVM_REQ_EVENT) { No bits set in PIR } It should be after updating PIR, but before sending posted interrupt IPI. Otherwise: cpu0 cpu1/vcpu KVM_REQ_EVENT is not set set pir send IPI irq_disable() -request is empty. set KVM_REQ_EVENT That's the same sequence as with IRR update, KVM_REQ_EVENT and kick today. What about item 4 below? That's for Yang to answer :) 3) Must check outstanding PIR notification bit unconditionally on every VM-entry, because: 1. local_irq_disable 2. vcpu-mode = IN_GUEST_MODE 3. vmenter 4. vmexit 5. vcpu-mode = OUTSIDE_GUEST_MODE If PIR-IPI-interrupt is sent between an event which triggers VM-exit (for example, an external interrupt due to a device), and step 5 (assignment of vcpu-mode), the PIR-VIRR transfer before vmentry must be made. Not sure I understand, but I think KVM_REQ_EVENT will cover that too. See above. 
4) Today, an interrupt notification is cached on IRR until it's delivered - further interrupt injection is not generating further interrupt notification bits. With PIR, behaviour changes: It's possible to have one bit in PIR and another on IRR APIC page (if timing is right). Is this harmless? Why? -- Gleb. -- Gleb. -- Gleb. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 5/8] KVM: PPC: debug stub interface parameter defined
-Original Message- From: kvm-ppc-ow...@vger.kernel.org [mailto:kvm-ppc-ow...@vger.kernel.org] On Behalf Of Alexander Graf Sent: Thursday, January 31, 2013 6:31 PM To: Bhushan Bharat-R65777 Cc: Paul Mackerras; kvm-...@vger.kernel.org; kvm@vger.kernel.org Subject: Re: [PATCH 5/8] KVM: PPC: debug stub interface parameter defined On 30.01.2013, at 15:15, Bhushan Bharat-R65777 wrote: -Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Friday, January 25, 2013 5:24 PM To: Bhushan Bharat-R65777 Cc: Paul Mackerras; kvm-...@vger.kernel.org; kvm@vger.kernel.org Subject: Re: [PATCH 5/8] KVM: PPC: debug stub interface parameter defined On 17.01.2013, at 12:11, Bhushan Bharat-R65777 wrote: -Original Message- From: Paul Mackerras [mailto:pau...@samba.org] Sent: Thursday, January 17, 2013 12:53 PM To: Bhushan Bharat-R65777 Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; ag...@suse.de; Bhushan Bharat- R65777 Subject: Re: [PATCH 5/8] KVM: PPC: debug stub interface parameter defined On Wed, Jan 16, 2013 at 01:54:42PM +0530, Bharat Bhushan wrote: This patch defines the interface parameter for KVM_SET_GUEST_DEBUG ioctl support. Follow up patches will use this for setting up hardware breakpoints, watchpoints and software breakpoints. 
[snip] diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 453a10f..7d5a51c 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -1483,6 +1483,12 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) return r; } +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, +struct kvm_guest_debug *dbg) { + return -EINVAL; +} + int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { return -ENOTSUPP; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 934413c..4c94ca9 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -532,12 +532,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) #endif } -int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, -struct kvm_guest_debug *dbg) -{ - return -EINVAL; -} - This will break the build for non-book E machines, since kvm_arch_vcpu_ioctl_set_guest_debug() is referenced from generic code. You need to add it to arch/powerpc/kvm/book3s.c as well. right, I will correct this. Would the implementation actually be different on booke vs book3s? My feeling is that powerpc.c is actually the right place for this. I am not sure there will be anything common between book3s and booke. Should we define the cpu specific function something like kvm_ppc_vcpu_ioctl_set_guest_debug() for booke and book3s and call this new defined function from kvm_arch_vcpu_ioctl_set_guest_debug() in powerpc.c ? No, just put it into the subarch directories then :). No need to overengineer anything for now. What you mean by subarch? Above you mentioned that powerpc.c is right place? Is not this patch is doing partially :) Thanks -Bharat -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH V4 00/22] Multiqueue virtio-net
On Wed, Jan 30, 2013 at 07:12:19PM +0800, Jason Wang wrote: Hello all: This series is an update of the last version of multiqueue virtio-net support. This series tries to bring multiqueue support to virtio-net through a multiqueue support tap backend and multiple vhost threads. Patch 1 converts bitfield in TAPState to bool. Patch 2 replaces assert(0) with abort() in tap. To support this, multiqueue nic support were added to qemu. This is done by introducing an array of NetClientStates in NICState, and make each pair of peers to be a queue of the nic. This is done in patch 3-9. Tap were also converted to be able to create a multiple queue backend. Currently, only linux support this by issuing TUNSETIFF N times with the same device name to create N queues. Each fd returned by TUNSETIFF were a queue supported by kernel. Three new command lines were introduced, queues were used to tell how many queues will be created by qemu; fds were used to pass multiple pre-created tap file descriptors to qemu; vhostfds were used to pass multiple pre-created vhost descriptors to qemu. This is done in patch 10-15. A method of deleting a queue and queue_index were also introduced for virtio, this is done in patch 16-17. Vhost were also changed to support multiqueue by introducing a start vq index which tracks the first virtqueue that will be used by vhost instead of the assumption that the vhost always use virtqueue from index 0. This is done in patch 18. The last part is the multiqueue userspace changes, this is done in patch 19-22. 
With these changes, a user could start a multiqueue virtio-net device through ./qemu -netdev tap,id=hn0,queues=2,vhost=on -device virtio-net-pci,netdev=hn0 Management tools such as libvirt can pass multiple pre-created fds/vhostfds through ./qemu -netdev tap,id=hn0,fds=X:Y,vhostfds=M:N -device virtio-net-pci,netdev=hn0 For the one who wants to try, a git tree is available at: git://github.com/jasowang/qemu.git Changes from V3: - convert bitfield to bool in TAPState (Blue) - use abort() instead of assert(0) in tap code (Blue) - rebase to the latest - fix a bug that breaks the non-tap network This conflicts with the pull request I sent, in particular this adds a layout assumption. In the hope this will accelerate things, I did a rebase and a trivial test with single queue only and it seems ok: git://github.com/mstsirkin/qemu.git pci There were some warnings about whitespace at EOF but otherwise seems ok. -- MST -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 5/8] KVM: PPC: debug stub interface parameter defined
On 31.01.2013, at 15:05, Bhushan Bharat-R65777 wrote: -Original Message- From: kvm-ppc-ow...@vger.kernel.org [mailto:kvm-ppc-ow...@vger.kernel.org] On Behalf Of Alexander Graf Sent: Thursday, January 31, 2013 6:31 PM To: Bhushan Bharat-R65777 Cc: Paul Mackerras; kvm-...@vger.kernel.org; kvm@vger.kernel.org Subject: Re: [PATCH 5/8] KVM: PPC: debug stub interface parameter defined On 30.01.2013, at 15:15, Bhushan Bharat-R65777 wrote: -Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Friday, January 25, 2013 5:24 PM To: Bhushan Bharat-R65777 Cc: Paul Mackerras; kvm-...@vger.kernel.org; kvm@vger.kernel.org Subject: Re: [PATCH 5/8] KVM: PPC: debug stub interface parameter defined On 17.01.2013, at 12:11, Bhushan Bharat-R65777 wrote: -Original Message- From: Paul Mackerras [mailto:pau...@samba.org] Sent: Thursday, January 17, 2013 12:53 PM To: Bhushan Bharat-R65777 Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; ag...@suse.de; Bhushan Bharat- R65777 Subject: Re: [PATCH 5/8] KVM: PPC: debug stub interface parameter defined On Wed, Jan 16, 2013 at 01:54:42PM +0530, Bharat Bhushan wrote: This patch defines the interface parameter for KVM_SET_GUEST_DEBUG ioctl support. Follow up patches will use this for setting up hardware breakpoints, watchpoints and software breakpoints. 
[snip] diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 453a10f..7d5a51c 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -1483,6 +1483,12 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) return r; } +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, +struct kvm_guest_debug *dbg) { + return -EINVAL; +} + int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { return -ENOTSUPP; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 934413c..4c94ca9 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -532,12 +532,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) #endif } -int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, -struct kvm_guest_debug *dbg) -{ - return -EINVAL; -} - This will break the build for non-book E machines, since kvm_arch_vcpu_ioctl_set_guest_debug() is referenced from generic code. You need to add it to arch/powerpc/kvm/book3s.c as well. right, I will correct this. Would the implementation actually be different on booke vs book3s? My feeling is that powerpc.c is actually the right place for this. I am not sure there will be anything common between book3s and booke. Should we define the cpu specific function something like kvm_ppc_vcpu_ioctl_set_guest_debug() for booke and book3s and call this new defined function from kvm_arch_vcpu_ioctl_set_guest_debug() in powerpc.c ? No, just put it into the subarch directories then :). No need to overengineer anything for now. What you mean by subarch? Above you mentioned that powerpc.c is right place? Is not this patch is doing partially :) If the code in powerpc.c only says void kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { kvmppc_core_set_guest_debug(vcpu, dbg); } then doing it in powerpc.c is obviously moot. 
Since there is no other debug implementation, it's ok if we try and find (and create) commonalities later. So yes, it's ok if you put it into booke.c or even e500.c. Just make sure to not break any other archs (440, book3s_pr, book3s_hv). Alex -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH V4 00/22] Multiqueue virtio-net
On Thu, Jan 31, 2013 at 04:21:49PM +0200, Michael S. Tsirkin wrote: On Wed, Jan 30, 2013 at 07:12:19PM +0800, Jason Wang wrote: Hello all: This seires is an update of last version of multiqueue virtio-net support. This series tries to brings multiqueue support to virtio-net through a multiqueue support tap backend and multiple vhost threads. Patch 1 converts bitfield in TAPState to bool. Patch 2 replace assert(0) with abort() in tap. To support this, multiqueue nic support were added to qemu. This is done by introducing an array of NetClientStates in NICState, and make each pair of peers to be an queue of the nic. This is done in patch 3-9. Tap were also converted to be able to create a multiple queue backend. Currently, only linux support this by issuing TUNSETIFF N times with the same device name to create N queues. Each fd returned by TUNSETIFF were a queue supported by kernel. Three new command lines were introduced, queues were used to tell how many queues will be created by qemu; fds were used to pass multiple pre-created tap file descriptors to qemu; vhostfds were used to pass multiple pre-created vhost descriptors to qemu. This is done in patch 10-15. A method of deleting a queue and queue_index were also introduce for virtio, this is done in patch 16-17. Vhost were also changed to support multiqueue by introducing a start vq index which tracks the first virtqueue that will be used by vhost instead of the assumption that the vhost always use virtqueue from index 0. This is done in patch 18. The last part is the multiqueue userspace changes, this is done in patch 19-22. 
With these changes, a user could start a multiqueue virtio-net device through ./qemu -netdev tap,id=hn0,queues=2,vhost=on -device virtio-net-pci,netdev=hn0 Management tools such as libvirt can pass multiple pre-created fds/vhostfds through ./qemu -netdev tap,id=hn0,fds=X:Y,vhostfds=M:N -device virtio-net-pci,netdev=hn0 For the one who wants to try, a git tree is available at: git://github.com/jasowang/qemu.git Changes from V3: - convert bitfield to bool in TAPState (Blue) - use abort() instead of assert(0) in tap code (Blue) - rebase to the latest - fix a bug that breaks the non-tap network This conflicts with the pull request I sent, in particular this adds a layout assumption. In the hope this will accelerate things, I did a rebase and a trivial test with single queue only and it seems ok: git://github.com/mstsirkin/qemu.git pci There were some warnings about whitespace at EOF but otherwise seems ok. Pushed to my pci branch on kernel.org too. -- MST -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 5/8] KVM: PPC: debug stub interface parameter defined
-Original Message- From: kvm-ow...@vger.kernel.org [mailto:kvm-ow...@vger.kernel.org] On Behalf Of Alexander Graf Sent: Thursday, January 31, 2013 7:58 PM To: Bhushan Bharat-R65777 Cc: Paul Mackerras; kvm-...@vger.kernel.org; kvm@vger.kernel.org Subject: Re: [PATCH 5/8] KVM: PPC: debug stub interface parameter defined On 31.01.2013, at 15:05, Bhushan Bharat-R65777 wrote: -Original Message- From: kvm-ppc-ow...@vger.kernel.org [mailto:kvm-ppc-ow...@vger.kernel.org] On Behalf Of Alexander Graf Sent: Thursday, January 31, 2013 6:31 PM To: Bhushan Bharat-R65777 Cc: Paul Mackerras; kvm-...@vger.kernel.org; kvm@vger.kernel.org Subject: Re: [PATCH 5/8] KVM: PPC: debug stub interface parameter defined On 30.01.2013, at 15:15, Bhushan Bharat-R65777 wrote: -Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Friday, January 25, 2013 5:24 PM To: Bhushan Bharat-R65777 Cc: Paul Mackerras; kvm-...@vger.kernel.org; kvm@vger.kernel.org Subject: Re: [PATCH 5/8] KVM: PPC: debug stub interface parameter defined On 17.01.2013, at 12:11, Bhushan Bharat-R65777 wrote: -Original Message- From: Paul Mackerras [mailto:pau...@samba.org] Sent: Thursday, January 17, 2013 12:53 PM To: Bhushan Bharat-R65777 Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; ag...@suse.de; Bhushan Bharat- R65777 Subject: Re: [PATCH 5/8] KVM: PPC: debug stub interface parameter defined On Wed, Jan 16, 2013 at 01:54:42PM +0530, Bharat Bhushan wrote: This patch defines the interface parameter for KVM_SET_GUEST_DEBUG ioctl support. Follow up patches will use this for setting up hardware breakpoints, watchpoints and software breakpoints. 
[snip] diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 453a10f..7d5a51c 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -1483,6 +1483,12 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) return r; } +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg) { + return -EINVAL; +} + int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { return -ENOTSUPP; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 934413c..4c94ca9 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -532,12 +532,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) #endif } -int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, -struct kvm_guest_debug *dbg) -{ - return -EINVAL; -} - This will break the build for non-book E machines, since kvm_arch_vcpu_ioctl_set_guest_debug() is referenced from generic code. You need to add it to arch/powerpc/kvm/book3s.c as well. right, I will correct this. Would the implementation actually be different on booke vs book3s? My feeling is that powerpc.c is actually the right place for this. I am not sure there will be anything common between book3s and booke. Should we define the cpu specific function something like kvm_ppc_vcpu_ioctl_set_guest_debug() for booke and book3s and call this new defined function from kvm_arch_vcpu_ioctl_set_guest_debug() in powerpc.c ? No, just put it into the subarch directories then :). No need to overengineer anything for now. What you mean by subarch? Above you mentioned that powerpc.c is right place? Is not this patch is doing partially :) If the code in powerpc.c only says void kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { kvmppc_core_set_guest_debug(vcpu, dbg); } then doing it in powerpc.c is obviously moot. 
Since there is no other debug implementation, it's ok if we try and find (and create) commonalities later. So yes, it's ok if you put it into booke.c or even e500.c. Just make sure to not break any other archs (440, book3s_pr, book3s_hv). Right, yes, I will make sure that it compiles for all archs. Thanks. -Bharat Alex -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 1/5] KVM: PPC: e500: Move VCPU's MMUCFG register initialization earlier
-Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Thursday, January 31, 2013 3:21 PM To: Caraman Mihai Claudiu-B02008 Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; linuxppc- d...@lists.ozlabs.org Subject: Re: [PATCH 1/5] KVM: PPC: e500: Move VCPU's MMUCFG register initialization earlier On 30.01.2013, at 14:29, Mihai Caraman wrote: VCPU's MMUCFG register initialization should not depend on KVM_CAP_SW_TLB ioctl call. Move it earlier into TLB initialization phase. Quite the contrary. The fact that there is an mfspr() in e500_mmu.c already tells us that the code is broken. The TLB guest code should only depend on input from the SW_TLB configuration. It's completely orthogonal to the host capabilities. Then we have the same issue for TLBnCFG registers which need to be configured via SW_TLB ioctl. What is the purpose of guest TLB initialization in e500_mmu.c if we rely on SW_TLB? -Mike -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/5] KVM: PPC: e500: Move VCPU's MMUCFG register initialization earlier
On 31.01.2013, at 15:56, Caraman Mihai Claudiu-B02008 wrote: -Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Thursday, January 31, 2013 3:21 PM To: Caraman Mihai Claudiu-B02008 Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; linuxppc- d...@lists.ozlabs.org Subject: Re: [PATCH 1/5] KVM: PPC: e500: Move VCPU's MMUCFG register initialization earlier On 30.01.2013, at 14:29, Mihai Caraman wrote: VCPU's MMUCFG register initialization should not depend on KVM_CAP_SW_TLB ioctl call. Move it earlier into TLB initialization phase. Quite the contrary. The fact that there is an mfspr() in e500_mmu.c already tells us that the code is broken. The TLB guest code should only depend on input from the SW_TLB configuration. It's completely orthogonal to the host capabilities. Then we have the same issue for TLBnCFG registers which need to be configured via SW_TLB ioctl. What is the purpose of guest TLB initialization in e500_mmu.c if we rely on SW_TLB? It's to provide a fallback to user space that doesn't implement SW_TLB configuration yet. Alex -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 4/5] KVM: PPC: e500: Emulate EPTCFG register
-Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Thursday, January 31, 2013 3:31 PM To: Caraman Mihai Claudiu-B02008 Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; linuxppc- d...@lists.ozlabs.org Subject: Re: [PATCH 4/5] KVM: PPC: e500: Emulate EPTCFG register On 30.01.2013, at 14:29, Mihai Caraman wrote: EPTCFG register defined by E.PT is accessed unconditionally by Linux guests in the presence of MAV 2.0. Emulate EPTCFG register now. Signed-off-by: Mihai Caraman mihai.cara...@freescale.com --- arch/powerpc/include/asm/kvm_host.h |1 + arch/powerpc/kvm/e500.h |6 ++ arch/powerpc/kvm/e500_emulate.c |9 + arch/powerpc/kvm/e500_mmu.c |5 + 4 files changed, 21 insertions(+), 0 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 88fcfe6..f480b20 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -503,6 +503,7 @@ struct kvm_vcpu_arch { u32 tlbcfg[4]; u32 tlbps[4]; u32 mmucfg; + u32 eptcfg; This too needs to be settable through SW_TLB. u32 epr; struct kvmppc_booke_debug_reg dbg_reg; #endif diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h index b9f76d8..983eb95 100644 --- a/arch/powerpc/kvm/e500.h +++ b/arch/powerpc/kvm/e500.h @@ -308,4 +308,10 @@ static inline unsigned int has_mmu_v2(const struct kvm_vcpu *vcpu) return ((vcpu-arch.mmucfg MMUCFG_MAVN) == MMUCFG_MAVN_V2); } +static inline unsigned int supports_page_tables(const struct kvm_vcpu *vcpu) bool again. Can we generalize this a bit more? How about a small framework that allows us to differentiate across e.XX features? I thought you will ask for it :) -Mike -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] [PATCH V4 00/22] Multiqueue virtio-net
On 01/31/2013 09:44 PM, Eric Blake wrote: On 01/31/2013 12:00 AM, Jason Wang wrote: On 01/31/2013 02:29 AM, Eric Blake wrote: On 01/30/2013 04:12 AM, Jason Wang wrote: With this changes, user could start a multiqueue virtio-net device through ./qemu -netdev tap,id=hn0,queues=2,vhost=on -device virtio-net-pci,netdev=hn0 Management tools such as libvirt can pass multiple pre-created fds/vhostfds through ./qemu -netdev tap,id=hn0,fds=X:Y,vhostfds=M:N -device virtio-net-pci,netdev=hn0 Do we really need specific fds= parsing, or can we reuse the existing -add-fd command line option to our advantage? I guess what I'm asking is how hotplug will work; and if hotplug takes a file name, shouldn't the command line also take a name; and if the command line takes a name, what's wrong with: ./qemu -add-fd fdset=1,fd=X -add-fd fdset=2,fd=Y -add-fd fdset=3,fd=M -add-fd fdset=4,fd=N -netdev tap,id=hn0,fds=/dev/fdset/1:/dev/fdset/2,vhostfds=/dev/fdset/3:/dev/fdset/4 -device virtio-net-pci,netdev=hn0 AFAIK, tap does not support fdset now, so this requirement is beyond the scope of multiqueue itself. We can do this in the future. Btw does libvirt support add-fd now? Anything that uses qemu_open() supports fdset now. The question I'm asking is whether the command line has a way to pass /path/to/name (which can be presented as /dev/fdset/nnn for add-fd usage) now, or whether it only supports fds=integers. Nothing special with 'fds' and 'vhostfds', it just split the params by ':' and pass them one by one through monitor_handle_fd_param() just like fd and vhostfd. So if 'fd' and 'vhostfd' supports /path/to/name, so do 'fds' and 'vhostfds'. So for command line, you do can pass /path/to/name to fd/vhostfd but it won't work since monitor_handle_fd_param() can not handle it because 1) it's not an integer 2) it was not named before. But for hotplug, non-integers works since it has already named by getfd, so does fds and vhostfds. 
For management such as libvirt, what's needed is just to connect the fdname with ':'. For hotplug, it just works if you pass multiple file descriptors one by one through getfd and then use fds=X:Y,vhostfds=M:N. For hotplug, you can't pass integers; you have to name the fds either way. Either you name it with getfd, or you name it with add-fd. But getfd is not as nice as add-fd when it comes to ensuring that fds are not leaked in qemu, even when the management app such as libvirt restarts. Furthermore, if it is possible to specify taps by pathname instead of by fd inheritance, then using getfd means you have to support two different approaches in QMP to distinguish which string is being supplied, while supporting add-fd means you only have to support qemu_open() which handles both direct names and fd passing in a single string interface. As for libvirt support of add-fd, I'm currently working with Stefan Berger to get patches applied; the goal is that libvirt 1.0.3 (end of February) will support add-fd. Thanks, I know there are advantages of add-fd, but current tap does not use qemu_open() which means it can't support fdset. We can add this in the future. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH V4 00/22] Multiqueue virtio-net
On 01/31/2013 10:36 PM, Michael S. Tsirkin wrote: On Thu, Jan 31, 2013 at 04:21:49PM +0200, Michael S. Tsirkin wrote: On Wed, Jan 30, 2013 at 07:12:19PM +0800, Jason Wang wrote: Hello all: This seires is an update of last version of multiqueue virtio-net support. This series tries to brings multiqueue support to virtio-net through a multiqueue support tap backend and multiple vhost threads. Patch 1 converts bitfield in TAPState to bool. Patch 2 replace assert(0) with abort() in tap. To support this, multiqueue nic support were added to qemu. This is done by introducing an array of NetClientStates in NICState, and make each pair of peers to be an queue of the nic. This is done in patch 3-9. Tap were also converted to be able to create a multiple queue backend. Currently, only linux support this by issuing TUNSETIFF N times with the same device name to create N queues. Each fd returned by TUNSETIFF were a queue supported by kernel. Three new command lines were introduced, queues were used to tell how many queues will be created by qemu; fds were used to pass multiple pre-created tap file descriptors to qemu; vhostfds were used to pass multiple pre-created vhost descriptors to qemu. This is done in patch 10-15. A method of deleting a queue and queue_index were also introduce for virtio, this is done in patch 16-17. Vhost were also changed to support multiqueue by introducing a start vq index which tracks the first virtqueue that will be used by vhost instead of the assumption that the vhost always use virtqueue from index 0. This is done in patch 18. The last part is the multiqueue userspace changes, this is done in patch 19-22. 
With these changes, a user could start a multiqueue virtio-net device through ./qemu -netdev tap,id=hn0,queues=2,vhost=on -device virtio-net-pci,netdev=hn0 Management tools such as libvirt can pass multiple pre-created fds/vhostfds through ./qemu -netdev tap,id=hn0,fds=X:Y,vhostfds=M:N -device virtio-net-pci,netdev=hn0 For the one who wants to try, a git tree is available at: git://github.com/jasowang/qemu.git Changes from V3: - convert bitfield to bool in TAPState (Blue) - use abort() instead of assert(0) in tap code (Blue) - rebase to the latest - fix a bug that breaks the non-tap network This conflicts with the pull request I sent, in particular this adds a layout assumption. In the hope this will accelerate things, I did a rebase and a trivial test with single queue only and it seems ok: git://github.com/mstsirkin/qemu.git pci There were some warnings about whitespace at EOF but otherwise seems ok. Pushed to my pci branch on kernel.org too. Tested with mq, it works well. Thanks. -- MST -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] [PATCH V4 00/22] Multiqueue virtio-net
On 01/31/2013 06:58 AM, Michael S. Tsirkin wrote: For hotplug, it just work if you pass multiple file descriptors one by one through getfd and then use fds=X:Y,vhostfds=M:N. For hotplug, you can't pass integers; you have to name the fds either way. Either you name it with getfd, or you name it with add-fd. But getfd is not as nice as add-fd when it comes to ensuring that fds are not leaked in qemu, even when the management app such as libvirt restarts. Furthermore, if it is possible to specify taps by pathname instead of by fd inheritance, I don't think there's a way to specify taps by pathname. Then using fds=integer:integer on the command line makes the most sense, and QMP uses fds=name:name where name was specified by 'getfd', and there is no way to wire up qemu_open() nor any need to use 'add-fd'. Okay, my question has been answered, your approach looks right now that I know more about how -netdev works to begin with. -- Eric Blake eblake redhat com+1-919-301-3266 Libvirt virtualization library http://libvirt.org signature.asc Description: OpenPGP digital signature
RE: [PATCH 1/5] KVM: PPC: e500: Move VCPU's MMUCFG register initialization earlier
-Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Thursday, January 31, 2013 4:58 PM To: Caraman Mihai Claudiu-B02008 Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; linuxppc- d...@lists.ozlabs.org Subject: Re: [PATCH 1/5] KVM: PPC: e500: Move VCPU's MMUCFG register initialization earlier On 31.01.2013, at 15:56, Caraman Mihai Claudiu-B02008 wrote: -Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Thursday, January 31, 2013 3:21 PM To: Caraman Mihai Claudiu-B02008 Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; linuxppc- d...@lists.ozlabs.org Subject: Re: [PATCH 1/5] KVM: PPC: e500: Move VCPU's MMUCFG register initialization earlier On 30.01.2013, at 14:29, Mihai Caraman wrote: VCPU's MMUCFG register initialization should not depend on KVM_CAP_SW_TLB ioctl call. Move it earlier into tlb initalization phase. Quite the contrary. The fact that there is an mfspr() in e500_mmu.c already tells us that the code is broken. The TLB guest code should only depend on input from the SW_TLB configuration. It's completely orthogonal to the host capabilities. Then we have the same issue for TLBnCFG registers which need to be configured via SW_TLB ioctl. What is the purpose of guest tlb initalization in e500_mmu.c if we rely on SW_TLB? It's to provide a fallback to user space that doesn't implement SW_TLB configuration yet. Do we have such a case now or is it just hypothetical? For the fallback we need to initialize the MMUCFG register which I intended to say in the commit message. Alex -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Listing on your website
Hi, Could you please let me know whether it is possible to list us on your website? I recently sent you an e-mail asking whether this would be possible, but have not heard anything back yet. We are Sirius Corporation - an Open Source services provider. More details on us can be found on our website: http://www.siriusopensource.com/about Please let me know and I can send you our logo and a brief description. Kind regards, Maz -- Maz Khan, Marketing and Sales Coordinator Sirius - stress free technology www.siriusopensource.com Tel: +44 870 608 0063 Twitter: @SiriusCorp -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] KVM call minutes 2013-01-29 - Port I/O
On Thu, 2013-01-31 at 12:49 +0200, Michael S. Tsirkin wrote: On Wed, Jan 30, 2013 at 04:28:30PM -0700, Alex Williamson wrote: On Thu, 2013-01-31 at 10:02 +1100, Benjamin Herrenschmidt wrote: On Thu, 2013-01-31 at 00:49 +0200, Michael S. Tsirkin wrote: In practice they do (VGA at least) From a SW modelling standpoint, I don't think it's worth differentiating PCI and PCIE. Cheers, Ben. Interesting. Do you have such hardware? Could you please dump the output of lspci -vv? Any ATI or nVidia card still supports hard decoding of VGA regions for the sake of legacy operating systems and BIOSes :-) I don't know about Intel but I suppose it's the same. For example: -[:00]-+-00.0 Advanced Micro Devices [AMD] nee ATI RD890 PCI to PCI bridge (external gfx0 p +-04.0-[02]--+-00.0 Advanced Micro Devices [AMD] nee ATI Cedar PRO [Radeon HD 5450/6350] 00:04.0 PCI bridge: Advanced Micro Devices [AMD] nee ATI RD890 PCI to PCI bridge (PCI express gpp port D) (prog-if 00 [Normal decode]) Control: I/O+ Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR- FastB2B- DisINTx- Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast TAbort- TAbort- MAbort- SERR- PERR- INTx- Latency: 0, Cache Line Size: 64 bytes Bus: primary=00, secondary=02, subordinate=02, sec-latency=0 I/O behind bridge: c000-cfff Memory behind bridge: fd10-fd1f Prefetchable memory behind bridge: d000-dfff Secondary status: 66MHz- FastB2B- ParErr- DEVSEL=fast TAbort- TAbort- MAbort+ SERR- PERR- BridgeCtl: Parity- SERR- NoISA- VGA+ MAbort- Reset- FastB2B- VGA+ (VGA Enable) indicates positive decode of 0x3b0 - 0x3bb, 0x3c0 - 0x3df, and 0xa - 0xbfff. Device 2:00.0 of course doesn't report these ISA ranges as they're implicit in the VGA class code. OK but this appears behind a bridge. So the bridge configuration tells the root complex where to send accesses to the VGA. But qemu currently puts devices directly on root bus. 
And as far as I can tell when we present devices directly on bus 0, we pretend these are integrated in the root complex. The spec seems to say explicitly that root complex integrated devices should not use legacy addresses or support hotplug. So I would be surprised if such one appears in real world. Luckily guests do not seem to be worried as long as we use ACPI. Yes, in fact I just figured out last night that Windows is unhappy with assigned PCI devices on bus 0 that claim to be an endpoint in their PCIe capability rather than an integrated endpoint. We'll need to do extra mangling of the PCIe capability to massage it into the guest visible topology. Section 1.3.2.3 of the 3.0 spec says integrated endpoints must not require I/O resources claimed through BAR(s). VGA skirts around this by not having the legacy resources claimed by BARs, but instead being implicit. Are there other sections restricting legacy I/O? It's common that a plugin VGA card sits behind a root port where the bridge registers tell us about VGA routing, but integrated VGA devices are often on bus 0 though, here's an example: -[:00]-+-00.0 Intel Corporation 2nd Generation Core Processor Family DRAM Controller +-02.0 Intel Corporation 2nd Generation Core Processor Family Integrated Graphics Controller Often these systems will disable the integrated graphics when a plugin graphics is installed below a root port. I'm not sure how the system knows to route VGA to the integrated device vs the root port otherwise. Here's a more interesting example: -+-[:01]-+-00.0 NVIDIA Corporation GT218 [GeForce G210M] | \-00.1 NVIDIA Corporation High Definition Audio Controller \-[:00]-+-00.0 Intel Corporation Mobile 4 Series Chipset Memory Controller Hub +-01.0 Intel Corporation Mobile 4 Series Chipset PCI Express Graphics Port This system seems to have two host bridges with VGA behind each of them. There's no bridge to control VGA routing, so I don't know how the selection is done. 
It's possible the g210m never sees legacy VGA accesses in this mode. This bios has another mode which makes the g210m the primary graphics and hides the integrated graphics, essentially the same as I mention above with hiding integrated endpoint graphics when plugin graphics are used. Thanks, Alex -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/5] KVM: PPC: e500: Move VCPU's MMUCFG register initialization earlier
On 01/31/2013 09:26:20 AM, Caraman Mihai Claudiu-B02008 wrote: -Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Thursday, January 31, 2013 4:58 PM To: Caraman Mihai Claudiu-B02008 Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; linuxppc- d...@lists.ozlabs.org Subject: Re: [PATCH 1/5] KVM: PPC: e500: Move VCPU's MMUCFG register initialization earlier On 31.01.2013, at 15:56, Caraman Mihai Claudiu-B02008 wrote: -Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Thursday, January 31, 2013 3:21 PM To: Caraman Mihai Claudiu-B02008 Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; linuxppc- d...@lists.ozlabs.org Subject: Re: [PATCH 1/5] KVM: PPC: e500: Move VCPU's MMUCFG register initialization earlier On 30.01.2013, at 14:29, Mihai Caraman wrote: VCPU's MMUCFG register initialization should not depend on KVM_CAP_SW_TLB ioctl call. Move it earlier into tlb initalization phase. Quite the contrary. The fact that there is an mfspr() in e500_mmu.c already tells us that the code is broken. The TLB guest code should only depend on input from the SW_TLB configuration. It's completely orthogonal to the host capabilities. Then we have the same issue for TLBnCFG registers which need to be configured via SW_TLB ioctl. What is the purpose of guest tlb initalization in e500_mmu.c if we rely on SW_TLB? It's to provide a fallback to user space that doesn't implement SW_TLB configuration yet. Do we have such a case now or is it just hypothetical? For the fallback we need to initialize the MMUCFG register which I intended to say in the commit message. I don't think we need to support a fallback for e6500, since there's nothing to be backwards compatible with. As for use case, I don't see us ever supporting the guest being a different CPU than the host. Page sizes probably aren't a problem, but there are other barriers. The main reasons that TLBnCFG are settable through SW_TLB are: 1. 
The guest TLB can be enlarged as a performance hack (like in Topaz, though QEMU doesn't currently do this), 2. The legacy default in KVM is based on the e500v1 TLB0 size, which is half of what e500v2/e500mc have, and 3. QEMU needs to know the exact geometry of the TLB so that it can interpret the shared data properly. #3 seems like a compelling reason here, to avoid silent weirdness if there's a slight mismatch between what QEMU thinks it's modelling and what we're actually running on. -Scott -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 3/8] KVM: PPC: booke: Added debug handler
-Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Thursday, January 31, 2013 5:47 PM To: Bhushan Bharat-R65777 Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org Subject: Re: [PATCH 3/8] KVM: PPC: booke: Added debug handler On 30.01.2013, at 12:30, Bhushan Bharat-R65777 wrote: -Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Friday, January 25, 2013 5:13 PM To: Bhushan Bharat-R65777 Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Bhushan Bharat-R65777 Subject: Re: [PATCH 3/8] KVM: PPC: booke: Added debug handler On 16.01.2013, at 09:24, Bharat Bhushan wrote: From: Bharat Bhushan bharat.bhus...@freescale.com Installed debug handler will be used for guest debug support and debug facility emulation features (patches for these features will follow this patch). Signed-off-by: Liu Yu yu@freescale.com [bharat.bhus...@freescale.com: Substantial changes] Signed-off-by: Bharat Bhushan bharat.bhus...@freescale.com --- arch/powerpc/include/asm/kvm_host.h |1 + arch/powerpc/kernel/asm-offsets.c |1 + arch/powerpc/kvm/booke_interrupts.S | 49 ++--- -- 3 files changed, 44 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 8a72d59..f4ba881 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -503,6 +503,7 @@ struct kvm_vcpu_arch { u32 tlbcfg[4]; u32 mmucfg; u32 epr; + u32 crit_save; struct kvmppc_booke_debug_reg dbg_reg; #endif gpa_t paddr_accessed; diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 46f6afd..02048f3 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -562,6 +562,7 @@ int main(void) DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst)); DEFINE(VCPU_FAULT_DEAR, offsetof(struct kvm_vcpu, arch.fault_dear)); DEFINE(VCPU_FAULT_ESR, offsetof(struct kvm_vcpu, arch.fault_esr)); + DEFINE(VCPU_CRIT_SAVE, offsetof(struct kvm_vcpu, 
arch.crit_save)); #endif /* CONFIG_PPC_BOOK3S */ #endif /* CONFIG_KVM */ diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S index eae8483..dd9c5d4 100644 --- a/arch/powerpc/kvm/booke_interrupts.S +++ b/arch/powerpc/kvm/booke_interrupts.S @@ -52,12 +52,7 @@ (1BOOKE_INTERRUPT_PROGRAM) | \ (1BOOKE_INTERRUPT_DTLB_MISS)) -.macro KVM_HANDLER ivor_nr scratch srr0 -_GLOBAL(kvmppc_handler_\ivor_nr) - /* Get pointer to vcpu and record exit number. */ - mtspr \scratch , r4 - mfspr r4, SPRN_SPRG_THREAD - lwz r4, THREAD_KVM_VCPU(r4) +.macro __KVM_HANDLER ivor_nr scratch srr0 stw r3, VCPU_GPR(R3)(r4) stw r5, VCPU_GPR(R5)(r4) stw r6, VCPU_GPR(R6)(r4) @@ -74,6 +69,46 @@ _GLOBAL(kvmppc_handler_\ivor_nr) bctr .endm +.macro KVM_HANDLER ivor_nr scratch srr0 +_GLOBAL(kvmppc_handler_\ivor_nr) + /* Get pointer to vcpu and record exit number. */ + mtspr \scratch , r4 + mfspr r4, SPRN_SPRG_THREAD + lwz r4, THREAD_KVM_VCPU(r4) + __KVM_HANDLER \ivor_nr \scratch \srr0 .endm + +.macro KVM_DBG_HANDLER ivor_nr scratch srr0 +_GLOBAL(kvmppc_handler_\ivor_nr) + mtspr \scratch, r4 + mfspr r4, SPRN_SPRG_THREAD + lwz r4, THREAD_KVM_VCPU(r4) + stw r3, VCPU_CRIT_SAVE(r4) + mfcrr3 + mfspr r4, SPRN_CSRR1 + andi. r4, r4, MSR_PR + bne 1f + /* debug interrupt happened in enter/exit path */ + mfspr r4, SPRN_CSRR1 + rlwinm r4, r4, 0, ~MSR_DE + mtspr SPRN_CSRR1, r4 + lis r4, 0x + ori r4, r4, 0x + mtspr SPRN_DBSR, r4 + mfspr r4, SPRN_SPRG_THREAD + lwz r4, THREAD_KVM_VCPU(r4) + mtcrr3 + lwz r3, VCPU_CRIT_SAVE(r4) + mfspr r4, \scratch + rfci What is this part doing? Try to ignore the debug exit? As BOOKE doesn't have hardware support for virtualization, hardware never know current pc is in guest or in host. So when enable hardware single step for guest, it cannot be disabled at the time guest exit. Thus, we'll see that an single step interrupt happens at the beginning of guest exit path. 
With the above code we recognize this kind of single step interrupt disable single step and rfci. Why would we have MSR_DE enabled in the first place when we can't handle it? When QEMU is using hardware debug resource then we always set MSR_DE during guest is running. Right, but why is MSR_DE enabled during the exit path? If MSR_DE wasn't set, you wouldn't get a single step exit. We always set MSR_DE in hw MSR when qemu using the debug resource. During the exit code path, you could then swap DBSR back to what the host expects (which means no single step). Only after that enable MSR_DE again. We
Re: [PATCH 3/8] KVM: PPC: booke: Added debug handler
On 31.01.2013, at 17:58, Bhushan Bharat-R65777 wrote: -Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Thursday, January 31, 2013 5:47 PM To: Bhushan Bharat-R65777 Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org Subject: Re: [PATCH 3/8] KVM: PPC: booke: Added debug handler On 30.01.2013, at 12:30, Bhushan Bharat-R65777 wrote: -Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Friday, January 25, 2013 5:13 PM To: Bhushan Bharat-R65777 Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Bhushan Bharat-R65777 Subject: Re: [PATCH 3/8] KVM: PPC: booke: Added debug handler On 16.01.2013, at 09:24, Bharat Bhushan wrote: From: Bharat Bhushan bharat.bhus...@freescale.com Installed debug handler will be used for guest debug support and debug facility emulation features (patches for these features will follow this patch). Signed-off-by: Liu Yu yu@freescale.com [bharat.bhus...@freescale.com: Substantial changes] Signed-off-by: Bharat Bhushan bharat.bhus...@freescale.com --- arch/powerpc/include/asm/kvm_host.h |1 + arch/powerpc/kernel/asm-offsets.c |1 + arch/powerpc/kvm/booke_interrupts.S | 49 ++--- -- 3 files changed, 44 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 8a72d59..f4ba881 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -503,6 +503,7 @@ struct kvm_vcpu_arch { u32 tlbcfg[4]; u32 mmucfg; u32 epr; + u32 crit_save; struct kvmppc_booke_debug_reg dbg_reg; #endif gpa_t paddr_accessed; diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 46f6afd..02048f3 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -562,6 +562,7 @@ int main(void) DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst)); DEFINE(VCPU_FAULT_DEAR, offsetof(struct kvm_vcpu, arch.fault_dear)); DEFINE(VCPU_FAULT_ESR, offsetof(struct kvm_vcpu, arch.fault_esr)); + 
DEFINE(VCPU_CRIT_SAVE, offsetof(struct kvm_vcpu, arch.crit_save)); #endif /* CONFIG_PPC_BOOK3S */ #endif /* CONFIG_KVM */ diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S index eae8483..dd9c5d4 100644 --- a/arch/powerpc/kvm/booke_interrupts.S +++ b/arch/powerpc/kvm/booke_interrupts.S @@ -52,12 +52,7 @@ (1BOOKE_INTERRUPT_PROGRAM) | \ (1BOOKE_INTERRUPT_DTLB_MISS)) -.macro KVM_HANDLER ivor_nr scratch srr0 -_GLOBAL(kvmppc_handler_\ivor_nr) - /* Get pointer to vcpu and record exit number. */ - mtspr \scratch , r4 - mfspr r4, SPRN_SPRG_THREAD - lwz r4, THREAD_KVM_VCPU(r4) +.macro __KVM_HANDLER ivor_nr scratch srr0 stw r3, VCPU_GPR(R3)(r4) stw r5, VCPU_GPR(R5)(r4) stw r6, VCPU_GPR(R6)(r4) @@ -74,6 +69,46 @@ _GLOBAL(kvmppc_handler_\ivor_nr) bctr .endm +.macro KVM_HANDLER ivor_nr scratch srr0 +_GLOBAL(kvmppc_handler_\ivor_nr) + /* Get pointer to vcpu and record exit number. */ + mtspr \scratch , r4 + mfspr r4, SPRN_SPRG_THREAD + lwz r4, THREAD_KVM_VCPU(r4) + __KVM_HANDLER \ivor_nr \scratch \srr0 .endm + +.macro KVM_DBG_HANDLER ivor_nr scratch srr0 +_GLOBAL(kvmppc_handler_\ivor_nr) + mtspr \scratch, r4 + mfspr r4, SPRN_SPRG_THREAD + lwz r4, THREAD_KVM_VCPU(r4) + stw r3, VCPU_CRIT_SAVE(r4) + mfcrr3 + mfspr r4, SPRN_CSRR1 + andi. r4, r4, MSR_PR + bne 1f + /* debug interrupt happened in enter/exit path */ + mfspr r4, SPRN_CSRR1 + rlwinm r4, r4, 0, ~MSR_DE + mtspr SPRN_CSRR1, r4 + lis r4, 0x + ori r4, r4, 0x + mtspr SPRN_DBSR, r4 + mfspr r4, SPRN_SPRG_THREAD + lwz r4, THREAD_KVM_VCPU(r4) + mtcrr3 + lwz r3, VCPU_CRIT_SAVE(r4) + mfspr r4, \scratch + rfci What is this part doing? Try to ignore the debug exit? As BOOKE doesn't have hardware support for virtualization, hardware never know current pc is in guest or in host. So when enable hardware single step for guest, it cannot be disabled at the time guest exit. Thus, we'll see that an single step interrupt happens at the beginning of guest exit path. 
With the above code we recognize this kind of single step interrupt disable single step and rfci. Why would we have MSR_DE enabled in the first place when we can't handle it? When QEMU is using hardware debug resource then we always set MSR_DE during guest is running. Right, but why is MSR_DE enabled during the exit path? If MSR_DE wasn't set, you wouldn't get a single step exit. We always set MSR_DE in hw MSR when qemu using the debug resource. In the _guest_ MSR, yes. But once we exit the guest, it shouldn't be set anymore, because we're in an interrupt handler, no? Or is MSR_DE kept alive on interrupts? During the exit code path, you
Re: [PATCH 3/8] KVM: PPC: booke: Added debug handler
On 31.01.2013, at 18:08, Alexander Graf wrote: On 31.01.2013, at 17:58, Bhushan Bharat-R65777 wrote: -Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Thursday, January 31, 2013 5:47 PM To: Bhushan Bharat-R65777 Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org Subject: Re: [PATCH 3/8] KVM: PPC: booke: Added debug handler On 30.01.2013, at 12:30, Bhushan Bharat-R65777 wrote: -Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Friday, January 25, 2013 5:13 PM To: Bhushan Bharat-R65777 Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Bhushan Bharat-R65777 Subject: Re: [PATCH 3/8] KVM: PPC: booke: Added debug handler On 16.01.2013, at 09:24, Bharat Bhushan wrote: From: Bharat Bhushan bharat.bhus...@freescale.com Installed debug handler will be used for guest debug support and debug facility emulation features (patches for these features will follow this patch). Signed-off-by: Liu Yu yu@freescale.com [bharat.bhus...@freescale.com: Substantial changes] Signed-off-by: Bharat Bhushan bharat.bhus...@freescale.com --- arch/powerpc/include/asm/kvm_host.h |1 + arch/powerpc/kernel/asm-offsets.c |1 + arch/powerpc/kvm/booke_interrupts.S | 49 ++--- -- 3 files changed, 44 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 8a72d59..f4ba881 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -503,6 +503,7 @@ struct kvm_vcpu_arch { u32 tlbcfg[4]; u32 mmucfg; u32 epr; +u32 crit_save; struct kvmppc_booke_debug_reg dbg_reg; #endif gpa_t paddr_accessed; diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 46f6afd..02048f3 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -562,6 +562,7 @@ int main(void) DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst)); DEFINE(VCPU_FAULT_DEAR, offsetof(struct kvm_vcpu, arch.fault_dear)); DEFINE(VCPU_FAULT_ESR, 
offsetof(struct kvm_vcpu, arch.fault_esr)); +DEFINE(VCPU_CRIT_SAVE, offsetof(struct kvm_vcpu, arch.crit_save)); #endif /* CONFIG_PPC_BOOK3S */ #endif /* CONFIG_KVM */ diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S index eae8483..dd9c5d4 100644 --- a/arch/powerpc/kvm/booke_interrupts.S +++ b/arch/powerpc/kvm/booke_interrupts.S @@ -52,12 +52,7 @@ (1BOOKE_INTERRUPT_PROGRAM) | \ (1BOOKE_INTERRUPT_DTLB_MISS)) -.macro KVM_HANDLER ivor_nr scratch srr0 -_GLOBAL(kvmppc_handler_\ivor_nr) -/* Get pointer to vcpu and record exit number. */ -mtspr \scratch , r4 -mfspr r4, SPRN_SPRG_THREAD -lwz r4, THREAD_KVM_VCPU(r4) +.macro __KVM_HANDLER ivor_nr scratch srr0 stw r3, VCPU_GPR(R3)(r4) stw r5, VCPU_GPR(R5)(r4) stw r6, VCPU_GPR(R6)(r4) @@ -74,6 +69,46 @@ _GLOBAL(kvmppc_handler_\ivor_nr) bctr .endm +.macro KVM_HANDLER ivor_nr scratch srr0 +_GLOBAL(kvmppc_handler_\ivor_nr) +/* Get pointer to vcpu and record exit number. */ +mtspr \scratch , r4 +mfspr r4, SPRN_SPRG_THREAD +lwz r4, THREAD_KVM_VCPU(r4) +__KVM_HANDLER \ivor_nr \scratch \srr0 .endm + +.macro KVM_DBG_HANDLER ivor_nr scratch srr0 +_GLOBAL(kvmppc_handler_\ivor_nr) +mtspr \scratch, r4 +mfspr r4, SPRN_SPRG_THREAD +lwz r4, THREAD_KVM_VCPU(r4) +stw r3, VCPU_CRIT_SAVE(r4) +mfcrr3 +mfspr r4, SPRN_CSRR1 +andi. r4, r4, MSR_PR +bne 1f +/* debug interrupt happened in enter/exit path */ +mfspr r4, SPRN_CSRR1 +rlwinm r4, r4, 0, ~MSR_DE +mtspr SPRN_CSRR1, r4 +lis r4, 0x +ori r4, r4, 0x +mtspr SPRN_DBSR, r4 +mfspr r4, SPRN_SPRG_THREAD +lwz r4, THREAD_KVM_VCPU(r4) +mtcrr3 +lwz r3, VCPU_CRIT_SAVE(r4) +mfspr r4, \scratch +rfci What is this part doing? Try to ignore the debug exit? As BOOKE doesn't have hardware support for virtualization, hardware never know current pc is in guest or in host. So when enable hardware single step for guest, it cannot be disabled at the time guest exit. Thus, we'll see that an single step interrupt happens at the beginning of guest exit path. 
With the above code we recognize this kind of single step interrupt disable single step and rfci. Why would we have MSR_DE enabled in the first place when we can't handle it? When QEMU is using hardware debug resource then we always set MSR_DE during guest is running. Right, but why is MSR_DE enabled during the exit path? If MSR_DE wasn't set, you wouldn't get a single step exit. We
Re: windows 2008 guest causing rcu_sched to emit NMI
On Thu, Jan 31, 2013 at 12:11 AM, Marcelo Tosatti mtosa...@redhat.com wrote: On Wed, Jan 30, 2013 at 11:21:08AM +0300, Andrey Korolyov wrote: On Wed, Jan 30, 2013 at 3:15 AM, Marcelo Tosatti mtosa...@redhat.com wrote: On Tue, Jan 29, 2013 at 02:35:02AM +0300, Andrey Korolyov wrote: On Mon, Jan 28, 2013 at 5:56 PM, Andrey Korolyov and...@xdel.ru wrote: On Mon, Jan 28, 2013 at 3:14 AM, Marcelo Tosatti mtosa...@redhat.com wrote: On Mon, Jan 28, 2013 at 12:04:50AM +0300, Andrey Korolyov wrote: On Sat, Jan 26, 2013 at 12:49 AM, Marcelo Tosatti mtosa...@redhat.com wrote: On Fri, Jan 25, 2013 at 10:45:02AM +0300, Andrey Korolyov wrote: On Thu, Jan 24, 2013 at 4:20 PM, Marcelo Tosatti mtosa...@redhat.com wrote: On Thu, Jan 24, 2013 at 01:54:03PM +0300, Andrey Korolyov wrote: Thank you Marcelo, Host node locking up sometimes later than yesterday, bur problem still here, please see attached dmesg. Stuck process looks like root 19251 0.0 0.0 228476 12488 ?D14:42 0:00 /usr/bin/kvm -no-user-config -device ? -device pci-assign,? -device virtio-blk-pci,? -device on fourth vm by count. Should I try upstream kernel instead of applying patch to the latest 3.4 or it is useless? If you can upgrade to an upstream kernel, please do that. With vanilla 3.7.4 there is almost no changes, and NMI started firing again. External symptoms looks like following: starting from some count, may be third or sixth vm, qemu-kvm process allocating its memory very slowly and by jumps, 20M-200M-700M-1.6G in minutes. Patch helps, of course - on both patched 3.4 and vanilla 3.7 I`m able to kill stuck kvm processes and node returned back to the normal, when on 3.2 sending SIGKILL to the process causing zombies and hanged ``ps'' output (problem and workaround when no scheduler involved described here http://www.spinics.net/lists/kvm/msg84799.html). Try disabling pause loop exiting with ple_gap=0 kvm-intel.ko module parameter. 
Hi Marcelo, thanks, this parameter helped to increase number of working VMs in a half of order of magnitude, from 3-4 to 10-15. Very high SY load, 10 to 15 percents, persists on such numbers for a long time, where linux guests in same configuration do not jump over one percent even under stress bench. After I disabled HT, crash happens only in long runs and now it is kernel panic :) Stair-like memory allocation behaviour disappeared, but other symptom leading to the crash which I have not counted previously, persists: if VM count is ``enough'' for crash, some qemu processes starting to eat one core, and they`ll panic system after run in tens of minutes in such state or if I try to attach debugger to one of them. If needed, I can log entire crash output via netconsole, now I have some tail, almost the same every time: http://xdel.ru/downloads/btwin.png Yes, please log entire crash output, thanks. Here please, 3.7.4-vanilla, 16 vms, ple_gap=0: http://xdel.ru/downloads/oops-default-kvmintel.txt Just an update: I was able to reproduce that on pure linux VMs using qemu-1.3.0 and ``stress'' benchmark running on them - panic occurs at start of vm(with count ten working machines at the moment). Qemu-1.1.2 generally is not able to reproduce that, but host node with older version crashing on less amount of Windows VMs(three to six instead ten to fifteen) than with 1.3, please see trace below: http://xdel.ru/downloads/oops-old-qemu.txt Single bit memory error, apparently. Try: 1. memtest86. 2. Boot with slub_debug=ZFPU kernel parameter. 3. 
Reproduce on a different machine. Hi Marcelo, I always follow the rule - if some weird bug exists, check it on an ECC-enabled machine and check the IPMI logs too before starting to complain :) I have finally managed to ``fix'' the problem, but my solution seems a bit strange: - I have noticed that if virtual machines are started without any cgroup setting they will not cause this bug under any conditions, - I had thought, quite wrongly, that CONFIG_SCHED_AUTOGROUP should regroup the tasks without any cgroup and should not touch tasks already inside any existing cpu cgroup. A first look at the 200-line patch shows that the autogrouping always applies to all tasks, so I tried to disable it, - wild magic appears - VMs didn't crash the host any more, even at a count of 30+ they work fine. I still don't know what exactly triggered that and whether I will face it again under different conditions, so my solution is more likely to be a patch of mud in the wall of the dam, instead of a proper fix. There seem to be two possible origins of such an error - a very very hideous race condition involving cgroups and processes like qemu-kvm causing frequent context switches and simple
RE: [PATCH 4/8] Added ONE_REG interface for debug instruction
-Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Friday, January 25, 2013 5:18 PM To: Bhushan Bharat-R65777 Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Bhushan Bharat-R65777 Subject: Re: [PATCH 4/8] Added ONE_REG interface for debug instruction On 16.01.2013, at 09:24, Bharat Bhushan wrote: This patch adds the one_reg interface to get the special instruction to be used for setting software breakpoint from userspace. Signed-off-by: Bharat Bhushan bharat.bhus...@freescale.com --- Documentation/virtual/kvm/api.txt |1 + arch/powerpc/include/asm/kvm_ppc.h |1 + arch/powerpc/include/uapi/asm/kvm.h |3 +++ arch/powerpc/kvm/44x.c |5 + arch/powerpc/kvm/booke.c| 10 ++ arch/powerpc/kvm/e500.c |5 + arch/powerpc/kvm/e500.h |9 + arch/powerpc/kvm/e500mc.c |5 + 8 files changed, 39 insertions(+), 0 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 09905cb..7e8be9e 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1775,6 +1775,7 @@ registers, find a list below: PPC | KVM_REG_PPC_VPA_DTL | 128 PPC | KVM_REG_PPC_EPCR | 32 PPC | KVM_REG_PPC_EPR | 32 + PPC | KVM_REG_PPC_DEBUG_INST| 32 4.69 KVM_GET_ONE_REG diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 44a657a..b3c481e 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -235,6 +235,7 @@ union kvmppc_one_reg { void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); +u32 kvmppc_core_debug_inst_op(void); void kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 16064d0..e81ae5b 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h 
@@ -417,4 +417,7 @@ struct kvm_get_htab_header { #define KVM_REG_PPC_EPCR(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x85) #define KVM_REG_PPC_EPR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x86) +/* Debugging: Special instruction for software breakpoint */ #define +KVM_REG_PPC_DEBUG_INST (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x87) + #endif /* __LINUX_KVM_POWERPC_H */ diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c index 3d7fd21..41501be 100644 --- a/arch/powerpc/kvm/44x.c +++ b/arch/powerpc/kvm/44x.c @@ -114,6 +114,11 @@ int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu, return 0; } +u32 kvmppc_core_debug_inst_op(void) +{ + return -1; +} + void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { kvmppc_get_sregs_ivor(vcpu, sregs); diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index d2f502d..453a10f 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c Please provide the DEBUG_INST on a more global level - across all ppc subarchs. Do you mean defining in powerpc.c ? We are using one_reg for DEBUG_INST and one_reg_ioctl and defined in respective subarchs (booke and books have their separate handler). So how you want this to be defined in more common way for all subarchs? Thanks -Bharat @@ -1424,6 +1424,12 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) r = put_user(vcpu-arch.epcr, (u32 __user *)(long)reg-addr); break; #endif + case KVM_REG_PPC_DEBUG_INST: { + u32 opcode = kvmppc_core_debug_inst_op(); + r = copy_to_user((u32 __user *)(long)reg-addr, +opcode, sizeof(u32)); + break; + } default: break; } @@ -1467,6 +1473,10 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) break; } #endif + case KVM_REG_PPC_DEBUG_INST: + /* This is read only, so write to this is nop*/ + r = 0; + break; Just don't support set_one_reg on this reg. 
default: break; } diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index 6dd4de7..d8a5e8e 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -367,6 +367,11 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu) return 0; } +u32 kvmppc_core_debug_inst_op(void) +{ + return KVMPPC_INST_GUEST_GDB; +} + void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); diff --git a/arch/powerpc/kvm/e500.h
Re: [PATCH 4/8] Added ONE_REG interface for debug instruction
On 31.01.2013, at 18:44, Bhushan Bharat-R65777 wrote: -Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Friday, January 25, 2013 5:18 PM To: Bhushan Bharat-R65777 Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Bhushan Bharat-R65777 Subject: Re: [PATCH 4/8] Added ONE_REG interface for debug instruction On 16.01.2013, at 09:24, Bharat Bhushan wrote: This patch adds the one_reg interface to get the special instruction to be used for setting software breakpoint from userspace. Signed-off-by: Bharat Bhushan bharat.bhus...@freescale.com --- Documentation/virtual/kvm/api.txt |1 + arch/powerpc/include/asm/kvm_ppc.h |1 + arch/powerpc/include/uapi/asm/kvm.h |3 +++ arch/powerpc/kvm/44x.c |5 + arch/powerpc/kvm/booke.c| 10 ++ arch/powerpc/kvm/e500.c |5 + arch/powerpc/kvm/e500.h |9 + arch/powerpc/kvm/e500mc.c |5 + 8 files changed, 39 insertions(+), 0 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 09905cb..7e8be9e 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1775,6 +1775,7 @@ registers, find a list below: PPC | KVM_REG_PPC_VPA_DTL | 128 PPC | KVM_REG_PPC_EPCR | 32 PPC | KVM_REG_PPC_EPR| 32 + PPC | KVM_REG_PPC_DEBUG_INST| 32 4.69 KVM_GET_ONE_REG diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 44a657a..b3c481e 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -235,6 +235,7 @@ union kvmppc_one_reg { void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); +u32 kvmppc_core_debug_inst_op(void); void kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 16064d0..e81ae5b 100644 --- 
a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -417,4 +417,7 @@ struct kvm_get_htab_header { #define KVM_REG_PPC_EPCR(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x85) #define KVM_REG_PPC_EPR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x86) +/* Debugging: Special instruction for software breakpoint */ #define +KVM_REG_PPC_DEBUG_INST (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x87) + #endif /* __LINUX_KVM_POWERPC_H */ diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c index 3d7fd21..41501be 100644 --- a/arch/powerpc/kvm/44x.c +++ b/arch/powerpc/kvm/44x.c @@ -114,6 +114,11 @@ int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu, return 0; } +u32 kvmppc_core_debug_inst_op(void) +{ + return -1; The way you handle it here this needs to be an int kvmppc_core_debug_inst_op(u32 *inst) so you can return an error for 440. I don't think it's worth to worry about a case where we don't know about the inst though. Just return the same as what we use on e500v2 here. +} + void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { kvmppc_get_sregs_ivor(vcpu, sregs); diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index d2f502d..453a10f 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c Please provide the DEBUG_INST on a more global level - across all ppc subarchs. Do you mean defining in powerpc.c ? We are using one_reg for DEBUG_INST and one_reg_ioctl and defined in respective subarchs (booke and books have their separate handler). So how you want this to be defined in more common way for all subarchs? Just add it to all subarch's one_reg handlers. Alex -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 4/8] Added ONE_REG interface for debug instruction
-Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Thursday, January 31, 2013 11:23 PM To: Bhushan Bharat-R65777 Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org Subject: Re: [PATCH 4/8] Added ONE_REG interface for debug instruction On 31.01.2013, at 18:44, Bhushan Bharat-R65777 wrote: -Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Friday, January 25, 2013 5:18 PM To: Bhushan Bharat-R65777 Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Bhushan Bharat-R65777 Subject: Re: [PATCH 4/8] Added ONE_REG interface for debug instruction On 16.01.2013, at 09:24, Bharat Bhushan wrote: This patch adds the one_reg interface to get the special instruction to be used for setting software breakpoint from userspace. Signed-off-by: Bharat Bhushan bharat.bhus...@freescale.com --- Documentation/virtual/kvm/api.txt |1 + arch/powerpc/include/asm/kvm_ppc.h |1 + arch/powerpc/include/uapi/asm/kvm.h |3 +++ arch/powerpc/kvm/44x.c |5 + arch/powerpc/kvm/booke.c| 10 ++ arch/powerpc/kvm/e500.c |5 + arch/powerpc/kvm/e500.h |9 + arch/powerpc/kvm/e500mc.c |5 + 8 files changed, 39 insertions(+), 0 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 09905cb..7e8be9e 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1775,6 +1775,7 @@ registers, find a list below: PPC | KVM_REG_PPC_VPA_DTL | 128 PPC | KVM_REG_PPC_EPCR | 32 PPC | KVM_REG_PPC_EPR | 32 + PPC | KVM_REG_PPC_DEBUG_INST| 32 4.69 KVM_GET_ONE_REG diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 44a657a..b3c481e 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -235,6 +235,7 @@ union kvmppc_one_reg { void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); +u32 kvmppc_core_debug_inst_op(void); void kvmppc_get_sregs_ivor(struct kvm_vcpu 
*vcpu, struct kvm_sregs *sregs); int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 16064d0..e81ae5b 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -417,4 +417,7 @@ struct kvm_get_htab_header { #define KVM_REG_PPC_EPCR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x85) #define KVM_REG_PPC_EPR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x86) +/* Debugging: Special instruction for software breakpoint */ +#define KVM_REG_PPC_DEBUG_INST (KVM_REG_PPC | KVM_REG_SIZE_U32 | +0x87) + #endif /* __LINUX_KVM_POWERPC_H */ diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c index 3d7fd21..41501be 100644 --- a/arch/powerpc/kvm/44x.c +++ b/arch/powerpc/kvm/44x.c @@ -114,6 +114,11 @@ int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu, return 0; } +u32 kvmppc_core_debug_inst_op(void) { + return -1; The way you handle it here this needs to be an int kvmppc_core_debug_inst_op(u32 *inst) so you can return an error for 440. I don't think it's worth to worry about a case where we don't know about the inst though. Just return the same as what we use on e500v2 here. +} + void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { kvmppc_get_sregs_ivor(vcpu, sregs); diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index d2f502d..453a10f 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c Please provide the DEBUG_INST on a more global level - across all ppc subarchs. Do you mean defining in powerpc.c ? We are using one_reg for DEBUG_INST and one_reg_ioctl and defined in respective subarchs (booke and books have their separate handler). So how you want this to be defined in more common way for all subarchs? Just add it to all subarch's one_reg handlers. And what book3s etc should return? -1 ? 
Thanks -Bharat -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 8/8] KVM:PPC:booke: Allow debug interrupt injection to guest
-Original Message- From: kvm-ow...@vger.kernel.org [mailto:kvm-ow...@vger.kernel.org] On Behalf Of Alexander Graf Sent: Thursday, January 31, 2013 5:34 PM To: Bhushan Bharat-R65777 Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org Subject: Re: [PATCH 8/8] KVM:PPC:booke: Allow debug interrupt injection to guest On 30.01.2013, at 12:12, Bhushan Bharat-R65777 wrote: -Original Message- From: kvm-ppc-ow...@vger.kernel.org [mailto:kvm-ppc-ow...@vger.kernel.org] On Behalf Of Alexander Graf Sent: Friday, January 25, 2013 5:44 PM To: Bhushan Bharat-R65777 Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Bhushan Bharat-R65777 Subject: Re: [PATCH 8/8] KVM:PPC:booke: Allow debug interrupt injection to guest On 16.01.2013, at 09:24, Bharat Bhushan wrote: Allow userspace to inject debug interrupt to guest. QEMU can s/QEMU/user space. inject the debug interrupt to guest if it is not able to handle the debug interrupt. Signed-off-by: Bharat Bhushan bharat.bhus...@freescale.com --- arch/powerpc/kvm/booke.c | 32 +++- arch/powerpc/kvm/e500mc.c | 10 +- 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index faa0a0b..547797f 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -133,6 +133,13 @@ static void kvmppc_vcpu_sync_fpu(struct kvm_vcpu *vcpu) #endif } +#ifdef CONFIG_KVM_BOOKE_HV +static int kvmppc_core_pending_debug(struct kvm_vcpu *vcpu) { + return test_bit(BOOKE_IRQPRIO_DEBUG, +vcpu-arch.pending_exceptions); } #endif + /* * Helper function for full MSR writes. No need to call this if only * EE/CE/ME/DE/RI are changing. @@ -144,7 +151,11 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr) #ifdef CONFIG_KVM_BOOKE_HV new_msr |= MSR_GS; - if (vcpu-guest_debug) + /* + * Set MSR_DE if the hardware debug resources are owned by user-space + * and there is no debug interrupt pending for guest to handle. Why? 
QEMU is using the IAC/DAC registers to set hardware breakpoint/watchpoints via debug ioctls. As debug events are enabled/gated by MSR_DE so somehow we need to set MSR_DE on hardware MSR when guest is running in this case. Reading this 5 times I still have no idea what you're really checking for here. Maybe the naming for kvmppc_core_pending_debug is just unnatural? What does that function do really? On bookehv this is how I am controlling the MSR_DE in hardware MSR. And why is this whole thing only executed on HV? On e500v2 we always enable MSR_DE using vcpu-arch.shadow_msr in e500.c #ifndef CONFIG_KVM_BOOKE_HV - vcpu-arch.shadow_msr = MSR_USER | MSR_IS | MSR_DS; + vcpu-arch.shadow_msr = MSR_USER | MSR_DE | MSR_IS | MSR_DS; diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index b340a62..1e2d663 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -151,10 +151,14 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr) /* * Set MSR_DE if the hardware debug resources are owned by user-space -* and there is no debug interrupt pending for guest to handle. */ - if (vcpu-guest_debug !kvmppc_core_pending_debug(vcpu)) + if (vcpu-guest_debug) new_msr |= MSR_DE; +#else + if (vcpu-guest_debug) + vcpu-arch.shadow_msr |= MSR_DE; #endif But do not when I should clear? Why? How is e500v2 any different wrt debug? And why wouldn't that work for e500mc? Alex -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 8/8] KVM:PPC:booke: Allow debug interrupt injection to guest
On 01/31/2013 06:04:29 AM, Alexander Graf wrote: On 30.01.2013, at 12:12, Bhushan Bharat-R65777 wrote: On bookehv this is how I am controlling the MSR_DE in hardware MSR. And why is this whole thing only executed on HV? On e500v2 we always enable MSR_DE using vcpu-arch.shadow_msr in e500.c #ifndef CONFIG_KVM_BOOKE_HV - vcpu-arch.shadow_msr = MSR_USER | MSR_IS | MSR_DS; + vcpu-arch.shadow_msr = MSR_USER | MSR_DE | MSR_IS | MSR_DS; Why? How is e500v2 any different wrt debug? And why wouldn't that work for e500mc? shadow_msr isn't used at all on bookehv. -Scott -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 8/8] KVM:PPC:booke: Allow debug interrupt injection to guest
On 31.01.2013, at 18:59, Bhushan Bharat-R65777 wrote: -Original Message- From: kvm-ow...@vger.kernel.org [mailto:kvm-ow...@vger.kernel.org] On Behalf Of Alexander Graf Sent: Thursday, January 31, 2013 5:34 PM To: Bhushan Bharat-R65777 Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org Subject: Re: [PATCH 8/8] KVM:PPC:booke: Allow debug interrupt injection to guest On 30.01.2013, at 12:12, Bhushan Bharat-R65777 wrote: -Original Message- From: kvm-ppc-ow...@vger.kernel.org [mailto:kvm-ppc-ow...@vger.kernel.org] On Behalf Of Alexander Graf Sent: Friday, January 25, 2013 5:44 PM To: Bhushan Bharat-R65777 Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Bhushan Bharat-R65777 Subject: Re: [PATCH 8/8] KVM:PPC:booke: Allow debug interrupt injection to guest On 16.01.2013, at 09:24, Bharat Bhushan wrote: Allow userspace to inject debug interrupt to guest. QEMU can s/QEMU/user space. inject the debug interrupt to guest if it is not able to handle the debug interrupt. Signed-off-by: Bharat Bhushan bharat.bhus...@freescale.com --- arch/powerpc/kvm/booke.c | 32 +++- arch/powerpc/kvm/e500mc.c | 10 +- 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index faa0a0b..547797f 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -133,6 +133,13 @@ static void kvmppc_vcpu_sync_fpu(struct kvm_vcpu *vcpu) #endif } +#ifdef CONFIG_KVM_BOOKE_HV +static int kvmppc_core_pending_debug(struct kvm_vcpu *vcpu) { + return test_bit(BOOKE_IRQPRIO_DEBUG, +vcpu-arch.pending_exceptions); } #endif + /* * Helper function for full MSR writes. No need to call this if only * EE/CE/ME/DE/RI are changing. @@ -144,7 +151,11 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr) #ifdef CONFIG_KVM_BOOKE_HV new_msr |= MSR_GS; - if (vcpu-guest_debug) + /* + * Set MSR_DE if the hardware debug resources are owned by user-space + * and there is no debug interrupt pending for guest to handle. Why? 
QEMU is using the IAC/DAC registers to set hardware breakpoint/watchpoints via debug ioctls. As debug events are enabled/gated by MSR_DE so somehow we need to set MSR_DE on hardware MSR when guest is running in this case. Reading this 5 times I still have no idea what you're really checking for here. Maybe the naming for kvmppc_core_pending_debug is just unnatural? What does that function do really? On bookehv this is how I am controlling the MSR_DE in hardware MSR. And why is this whole thing only executed on HV? On e500v2 we always enable MSR_DE using vcpu-arch.shadow_msr in e500.c #ifndef CONFIG_KVM_BOOKE_HV - vcpu-arch.shadow_msr = MSR_USER | MSR_IS | MSR_DS; + vcpu-arch.shadow_msr = MSR_USER | MSR_DE | MSR_IS | MSR_DS; diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index b340a62..1e2d663 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -151,10 +151,14 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr) /* * Set MSR_DE if the hardware debug resources are owned by user-space -* and there is no debug interrupt pending for guest to handle. */ - if (vcpu-guest_debug !kvmppc_core_pending_debug(vcpu)) + if (vcpu-guest_debug) new_msr |= MSR_DE; +#else + if (vcpu-guest_debug) + vcpu-arch.shadow_msr |= MSR_DE; #endif But do not when I should clear? How about something like this? Then both targets at least suck as much :). Thanks to e500mc's awful hardware design, we don't know who sets the MSR_DE bit. Once we forced it onto the guest, we have no change to know whether the guest also set it or not. We could only guess. So I would assume it's for the best to just treat both the same: always expose MSR_DE into guest visibility. This will break when the guest disables MSR_DE. But I have no good idea on how to solve this properly - except for hypercalls to tell us that MSR_DE is set or not. Scott, do you have an idea? 
Alex diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 38a62ef..3f8cbbd 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -133,6 +133,19 @@ static void kvmppc_vcpu_sync_fpu(struct kvm_vcpu *vcpu) #endif } +static void kvmppc_vcpu_sync_debug(struct kvm_vcpu *vcpu) +{ + /* Force debug to on in guest space when user space wants to debug */ + if (vcpu-guest_debug) + vcpu-arch.shared-msr |= MSR_DE; + +#if !defined(CONFIG_KVM_BOOKE_HV) + /* Synchronize MSR_DE into shadow MSR */ + vcpu-arch.shadow_msr = ~MSR_DE; + vcpu-arch.shadow_msr |= vcpu-arch.shared-msr MSR_DE; +#endif +} + /* * Helper function for full MSR writes. No need to call this if only * EE/CE/ME/DE/RI are changing. @@ -150,6 +163,7 @@ void kvmppc_set_msr(struct
Re: [PATCH 4/8] Added ONE_REG interface for debug instruction
On 31.01.2013, at 18:58, Bhushan Bharat-R65777 wrote: -Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Thursday, January 31, 2013 11:23 PM To: Bhushan Bharat-R65777 Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org Subject: Re: [PATCH 4/8] Added ONE_REG interface for debug instruction On 31.01.2013, at 18:44, Bhushan Bharat-R65777 wrote: -Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Friday, January 25, 2013 5:18 PM To: Bhushan Bharat-R65777 Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Bhushan Bharat-R65777 Subject: Re: [PATCH 4/8] Added ONE_REG interface for debug instruction On 16.01.2013, at 09:24, Bharat Bhushan wrote: This patch adds the one_reg interface to get the special instruction to be used for setting software breakpoint from userspace. Signed-off-by: Bharat Bhushan bharat.bhus...@freescale.com --- Documentation/virtual/kvm/api.txt |1 + arch/powerpc/include/asm/kvm_ppc.h |1 + arch/powerpc/include/uapi/asm/kvm.h |3 +++ arch/powerpc/kvm/44x.c |5 + arch/powerpc/kvm/booke.c| 10 ++ arch/powerpc/kvm/e500.c |5 + arch/powerpc/kvm/e500.h |9 + arch/powerpc/kvm/e500mc.c |5 + 8 files changed, 39 insertions(+), 0 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 09905cb..7e8be9e 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1775,6 +1775,7 @@ registers, find a list below: PPC | KVM_REG_PPC_VPA_DTL | 128 PPC | KVM_REG_PPC_EPCR | 32 PPC | KVM_REG_PPC_EPR | 32 + PPC | KVM_REG_PPC_DEBUG_INST| 32 4.69 KVM_GET_ONE_REG diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 44a657a..b3c481e 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -235,6 +235,7 @@ union kvmppc_one_reg { void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); +u32 
kvmppc_core_debug_inst_op(void); void kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 16064d0..e81ae5b 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -417,4 +417,7 @@ struct kvm_get_htab_header { #define KVM_REG_PPC_EPCR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x85) #define KVM_REG_PPC_EPR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x86) +/* Debugging: Special instruction for software breakpoint */ +#define KVM_REG_PPC_DEBUG_INST (KVM_REG_PPC | KVM_REG_SIZE_U32 | +0x87) + #endif /* __LINUX_KVM_POWERPC_H */ diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c index 3d7fd21..41501be 100644 --- a/arch/powerpc/kvm/44x.c +++ b/arch/powerpc/kvm/44x.c @@ -114,6 +114,11 @@ int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu, return 0; } +u32 kvmppc_core_debug_inst_op(void) { + return -1; The way you handle it here this needs to be an int kvmppc_core_debug_inst_op(u32 *inst) so you can return an error for 440. I don't think it's worth to worry about a case where we don't know about the inst though. Just return the same as what we use on e500v2 here. +} + void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { kvmppc_get_sregs_ivor(vcpu, sregs); diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index d2f502d..453a10f 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c Please provide the DEBUG_INST on a more global level - across all ppc subarchs. Do you mean defining in powerpc.c ? We are using one_reg for DEBUG_INST and one_reg_ioctl and defined in respective subarchs (booke and books have their separate handler). So how you want this to be defined in more common way for all subarchs? Just add it to all subarch's one_reg handlers. And what book3s etc should return? -1 ? trap maybe? 
Alex -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 8/8] KVM:PPC:booke: Allow debug interrupt injection to guest
On 01/31/2013 12:21:07 PM, Alexander Graf wrote: How about something like this? Then both targets at least suck as much :). I'm not sure that should be the goal... Thanks to e500mc's awful hardware design, we don't know who sets the MSR_DE bit. Once we forced it onto the guest, we have no chance to know whether the guest also set it or not. We could only guess. MSRP[DEP] can prevent the guest from modifying MSR[DE] -- but we still need to set it in the first place. According to ISA V2.06B, the hypervisor should set DBCR0[EDM] to let the guest know that the debug resources are not available, and that the value of MSR[DE] is not specified and not modifiable. -Scott -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] What to do about non-qdevified devices?
Andreas Färber afaer...@suse.de writes: Am 30.01.2013 13:35, schrieb Markus Armbruster: Peter Maydell peter.mayd...@linaro.org writes: On 30 January 2013 07:02, Markus Armbruster arm...@redhat.com wrote: Anthony Liguori aligu...@us.ibm.com writes: [...] The problems I ran into were (1) this is a lot of work (2) it basically requires that all bus children have been qdev/QOM-ified. Even with something like the ISA bus which is where I started, quite a few devices were not qdevified still. So what's the plan to complete the qdevification job? Lay really low and quietly hope the problem goes away? We've tried that for about three years, doesn't seem to work. Do we have a list of not-yet-qdevified devices? Maybe we need to start saying fix X Y and Z or platform P is dropped from the next release. (This would of course be easier if we had a way to let users know that platform P was in danger...) I think that's a good idea. Only problem is identifying pre-qdev devices in the code requires code inspection (grep won't do, I'm afraid). +1 That would address my request as well. Having a list of low-hanging fruit on the Wiki might also give new contributors some ideas of where and how to start poking at the code. If we agree on a qdevify or else plan, I'd be prepared to help with the digging up of devices. I disagree on the or else part. I have been qdev'ifying and QOM'ifying devices in my maintenance area, and progress is slow. It gets even Good work, much appreciated. slower if one leaves clearly maintained areas. I see no good reason to force a pistol on someone's breast, like you have done for IDE, unless there is a good reason to do so. Currently I don't see any. There's the reason that made me hijack this thread. Paraphrasing Anthony: doing IRQs right involves Pin objects, and ultimately requires all bus children have been qdevified. Even for ISA, there are still stragglers holding us back. Is that sufficient reason to rip out devices *now*? 
No, and I didn't call for it. Could it become sufficient reason in the not too distant future? Possibly. Should we plan ahead for such a contingency? Probably. But I didn't call for that either. What I actually wrote was 1. I think mapping the remaining qdevification work is a good idea, and 2. if we commit to attempt doing that work in a reasonable time frame, I'd be willing to help with the mapping. Implying that without such a commitment, sorry, got more immediately useful things to do. And by the way, the kind of pistol I get to brandish in this group is about as scary as a water pistol in the middle of the Gobi desert. [...] -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 8/8] KVM:PPC:booke: Allow debug interrupt injection to guest
On 31.01.2013, at 19:43, Scott Wood wrote: On 01/31/2013 12:21:07 PM, Alexander Graf wrote: How about something like this? Then both targets at least suck as much :). I'm not sure that should be the goal... Thanks to e500mc's awful hardware design, we don't know who sets the MSR_DE bit. Once we forced it onto the guest, we have no chance to know whether the guest also set it or not. We could only guess. MSRP[DEP] can prevent the guest from modifying MSR[DE] -- but we still need to set it in the first place. According to ISA V2.06B, the hypervisor should set DBCR0[EDM] to let the guest know that the debug resources are not available, and that the value of MSR[DE] is not specified and not modifiable. So what would the guest do then to tell the hypervisor that it actually wants to know about debug events? Alex -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 8/8] KVM:PPC:booke: Allow debug interrupt injection to guest
On 01/31/2013 12:52:41 PM, Alexander Graf wrote: On 31.01.2013, at 19:43, Scott Wood wrote: On 01/31/2013 12:21:07 PM, Alexander Graf wrote: How about something like this? Then both targets at least suck as much :). I'm not sure that should be the goal... Thanks to e500mc's awful hardware design, we don't know who sets the MSR_DE bit. Once we forced it onto the guest, we have no chance to know whether the guest also set it or not. We could only guess. MSRP[DEP] can prevent the guest from modifying MSR[DE] -- but we still need to set it in the first place. According to ISA V2.06B, the hypervisor should set DBCR0[EDM] to let the guest know that the debug resources are not available, and that the value of MSR[DE] is not specified and not modifiable. So what would the guest do then to tell the hypervisor that it actually wants to know about debug events? The guest is out of luck, just as if a JTAG were in use. -Scott -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 8/8] KVM:PPC:booke: Allow debug interrupt injection to guest
On 31.01.2013, at 19:54, Scott Wood wrote: On 01/31/2013 12:52:41 PM, Alexander Graf wrote: On 31.01.2013, at 19:43, Scott Wood wrote: On 01/31/2013 12:21:07 PM, Alexander Graf wrote: How about something like this? Then both targets at least suck as much :). I'm not sure that should be the goal... Thanks to e500mc's awful hardware design, we don't know who sets the MSR_DE bit. Once we forced it onto the guest, we have no chance to know whether the guest also set it or not. We could only guess. MSRP[DEP] can prevent the guest from modifying MSR[DE] -- but we still need to set it in the first place. According to ISA V2.06B, the hypervisor should set DBCR0[EDM] to let the guest know that the debug resources are not available, and that the value of MSR[DE] is not specified and not modifiable. So what would the guest do then to tell the hypervisor that it actually wants to know about debug events? The guest is out of luck, just as if a JTAG were in use. Hrm. Can we somehow generalize this out of luck behavior? Every time we would set or clear an MSR bit in shadow_msr on e500v2, we would instead set or clear it in the real MSR. That way only e500mc is out of luck, but the code would still be shared. Alex -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 8/8] KVM:PPC:booke: Allow debug interrupt injection to guest
On 31.01.2013, at 20:05, Alexander Graf wrote: On 31.01.2013, at 19:54, Scott Wood wrote: On 01/31/2013 12:52:41 PM, Alexander Graf wrote: On 31.01.2013, at 19:43, Scott Wood wrote: On 01/31/2013 12:21:07 PM, Alexander Graf wrote: How about something like this? Then both targets at least suck as much :). I'm not sure that should be the goal... Thanks to e500mc's awful hardware design, we don't know who sets the MSR_DE bit. Once we forced it onto the guest, we have no chance to know whether the guest also set it or not. We could only guess. MSRP[DEP] can prevent the guest from modifying MSR[DE] -- but we still need to set it in the first place. According to ISA V2.06B, the hypervisor should set DBCR0[EDM] to let the guest know that the debug resources are not available, and that the value of MSR[DE] is not specified and not modifiable. So what would the guest do then to tell the hypervisor that it actually wants to know about debug events? The guest is out of luck, just as if a JTAG were in use. Hrm. Can we somehow generalize this out of luck behavior? Every time we would set or clear an MSR bit in shadow_msr on e500v2, we would instead set or clear it in the real MSR. That way only e500mc is out of luck, but the code would still be shared. Something like this. We could also define a SHADOW_MSR(vcpu) macro to hide the glorious details, but I think this way it's easier to understand what's going on. 
Alex diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 38a62ef..9bdb845 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -133,6 +133,29 @@ static void kvmppc_vcpu_sync_fpu(struct kvm_vcpu *vcpu) #endif } +static void kvmppc_vcpu_sync_debug(struct kvm_vcpu *vcpu) +{ + u32 is_debug = vcpu-arch.shared-msr MSR_DE; + + /* Force debug to on in guest space when user space wants to debug */ + if (vcpu-guest_debug) + is_debug = MSR_DE; + +#ifdef CONFIG_KVM_BOOKE_HV + /* +* Since there is no shadow MSR, sync MSR_DE into the guest +* visible MSR. +*/ + vcpu-arch.shared-msr = ~MSR_DE; + vcpu-arch.shared-msr |= is_debug; +#endif + +#ifndef CONFIG_KVM_BOOKE_HV + vcpu-arch.shadow_msr = ~MSR_DE; + vcpu-arch.shadow_msr |= is_debug; +#endif +} + /* * Helper function for full MSR writes. No need to call this if only * EE/CE/ME/DE/RI are changing. @@ -150,6 +173,7 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr) kvmppc_mmu_msr_notify(vcpu, old_msr); kvmppc_vcpu_sync_spe(vcpu); kvmppc_vcpu_sync_fpu(vcpu); + kvmppc_vcpu_sync_debug(vcpu); } static void kvmppc_booke_queue_irqprio(struct kvm_vcpu *vcpu,-- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] KVM: Remove duplicate text in api.txt
Signed-off-by: Geoff Levand ge...@infradead.org --- Saw this in v3.8-rc5, please apply. Documentation/virtual/kvm/api.txt | 13 - 1 file changed, 13 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index a4df553..a65a6b3 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -212,33 +212,20 @@ hardware requires all the hardware threads in a CPU core to be in the same partition.) The KVM_CAP_PPC_SMT capability indicates the number of vcpus per virtual core (vcore). The vcore id is obtained by dividing the vcpu id by the number of vcpus per vcore. The vcpus in a given vcore will always be in the same physical core as each other (though that might be a different physical core from time to time). Userspace can control the threading (SMT) mode of the guest by its allocation of vcpu ids. For example, if userspace wants single-threaded guest vcpus, it should make all vcpu ids be a multiple of the number of vcpus per vcore. -On powerpc using book3s_hv mode, the vcpus are mapped onto virtual -threads in one or more virtual CPU cores. (This is because the -hardware requires all the hardware threads in a CPU core to be in the -same partition.) The KVM_CAP_PPC_SMT capability indicates the number -of vcpus per virtual core (vcore). The vcore id is obtained by -dividing the vcpu id by the number of vcpus per vcore. The vcpus in a -given vcore will always be in the same physical core as each other -(though that might be a different physical core from time to time). -Userspace can control the threading (SMT) mode of the guest by its -allocation of vcpu ids. For example, if userspace wants -single-threaded guest vcpus, it should make all vcpu ids be a multiple -of the number of vcpus per vcore. 
- For virtual cpus that have been created with S390 user controlled virtual machines, the resulting vcpu fd can be memory mapped at page offset KVM_S390_SIE_PAGE_OFFSET in order to obtain a memory map of the virtual cpu's hardware control block. 4.8 KVM_GET_DIRTY_LOG (vm ioctl) Capability: basic Architectures: x86 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 02/18] KVM/MIPS32: Arch specific KVM data structures.
On 11/21/2012 06:34 PM, Sanjay Lal wrote: Signed-off-by: Sanjay Lal sanj...@kymasys.com --- arch/mips/include/asm/kvm.h | 55 asm/kvm.h defines the user space ABI, and thus should be placed in arch/mips/include/uapi/asm instead. arch/mips/include/asm/kvm_host.h | 669 +++ 2 files changed, 724 insertions(+) create mode 100644 arch/mips/include/asm/kvm.h create mode 100644 arch/mips/include/asm/kvm_host.h diff --git a/arch/mips/include/asm/kvm.h b/arch/mips/include/asm/kvm.h new file mode 100644 index 000..85789ea --- /dev/null +++ b/arch/mips/include/asm/kvm.h @@ -0,0 +1,55 @@ +/* +* This file is subject to the terms and conditions of the GNU General Public +* License. See the file COPYING in the main directory of this archive +* for more details. +* +* Copyright (C) 2012 MIPS Technologies, Inc. All rights reserved. +* Authors: Sanjay Lal sanj...@kymasys.com +*/ + +#ifndef __LINUX_KVM_MIPS_H +#define __LINUX_KVM_MIPS_H + +#include linux/types.h + +#define __KVM_MIPS + +#define N_MIPS_COPROC_REGS 32 +#define N_MIPS_COPROC_SEL 8 + +/* for KVM_GET_REGS and KVM_SET_REGS */ +struct kvm_regs { + __u32 gprs[32]; MIPS64 registers are 64 bits wide. How is this going to work for MIPS64? It seems a little important to answer this question as this is a userspace ABI that really cannot be changed once it is published. + __u32 hi; + __u32 lo; + __u32 pc; + + __u32 cp0reg[N_MIPS_COPROC_REGS][N_MIPS_COPROC_SEL]; Do we really want CP0 regs in here? Other architectures don't have things like this. They use things like KVM_GET_MSRS and KVM_SET_MSRS for this. +}; + +/* for KVM_GET_SREGS and KVM_SET_SREGS */ +struct kvm_sregs { +}; + +/* for KVM_GET_FPU and KVM_SET_FPU */ +struct kvm_fpu { This is a userspace ABI, and MIPS definitely has a FPU. That means that we cannot change the definition after it is merged, but we know this must have the FPU registers in it. So it cannot be both present and empty. 
+}; + +struct kvm_debug_exit_arch { +}; + +/* for KVM_SET_GUEST_DEBUG */ +struct kvm_guest_debug_arch { +}; + +struct kvm_mips_interrupt { + /* in */ + __u32 cpu; + __u32 irq; +}; + +/* definition of registers in kvm_run */ +struct kvm_sync_regs { +}; + +#endif /* __LINUX_KVM_MIPS_H */ [...] -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] tcm_vhost: Multi-target support
On Thu, 2013-01-31 at 17:28 +0800, Asias He wrote: Hello Nicholas, On 01/31/2013 03:33 PM, Asias He wrote: In order to take advantages of Paolo's multi-queue virito-scsi, we need multi-target support in tcm_vhost first. Otherwise all the requests go to one queue and other queues are idle. This patch makes: 1. All the targets under the wwpn is seen and can be used by guest. 2. No need to pass the tpgt number in struct vhost_scsi_target to tcm_vhost.ko. Only wwpn is needed. 3. We can always pass max_target = 255 to guest now, since we abort the request who's target id does not exist. Signed-off-by: Asias He as...@redhat.com --- drivers/vhost/tcm_vhost.c | 115 -- drivers/vhost/tcm_vhost.h | 4 +- 2 files changed, 74 insertions(+), 45 deletions(-) diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/tcm_vhost.c index 218deb6..d50cb95 100644 --- a/drivers/vhost/tcm_vhost.c +++ b/drivers/vhost/tcm_vhost.c @@ -59,13 +59,18 @@ enum { VHOST_SCSI_VQ_IO = 2, }; +#define VHOST_SCSI_MAX_TARGET 256 + struct vhost_scsi { - struct tcm_vhost_tpg *vs_tpg; /* Protected by vhost_scsi-dev.mutex */ + /* Protected by vhost_scsi-dev.mutex */ + struct tcm_vhost_tpg *vs_tpg[VHOST_SCSI_MAX_TARGET]; struct vhost_dev dev; struct vhost_virtqueue vqs[3]; struct vhost_work vs_completion_work; /* cmd completion work item */ struct llist_head vs_completion_list; /* cmd completion queue */ + char vs_vhost_wwpn[TRANSPORT_IQN_LEN]; + int vs_num_target; }; /* Local pointer to allocated TCM configfs fabric module */ @@ -564,13 +569,7 @@ static void vhost_scsi_handle_vq(struct vhost_scsi *vs) u32 exp_data_len, data_first, data_num, data_direction; unsigned out, in, i; int head, ret; - - /* Must use ioctl VHOST_SCSI_SET_ENDPOINT */ - tv_tpg = vs-vs_tpg; - if (unlikely(!tv_tpg)) { - pr_err(%s endpoint not set\n, __func__); - return; - } + u8 target; mutex_lock(vq-mutex); vhost_disable_notify(vs-dev, vq); @@ -637,6 +636,35 @@ static void vhost_scsi_handle_vq(struct vhost_scsi *vs) break; } + /* Extract the 
tpgt */ + target = v_req.lun[1]; + + /* Target does not exit, fail the request */ + if (unlikely(target = vs-vs_num_target)) { + struct virtio_scsi_cmd_resp __user *resp; + struct virtio_scsi_cmd_resp rsp; + + memset(rsp, 0, sizeof(rsp)); + rsp.response = VIRTIO_SCSI_S_BAD_TARGET; + resp = vq-iov[out].iov_base; + ret = copy_to_user(resp, rsp, sizeof(rsp)); + if (!ret) + vhost_add_used_and_signal(vs-dev, + vs-vqs[2], head, 0); + else + pr_err(Faulted on virtio_scsi_cmd_resp\n); + + continue; + } + + tv_tpg = vs-vs_tpg[target]; + if (unlikely(!tv_tpg)) { + /* Must use ioctl VHOST_SCSI_SET_ENDPOINT */ + pr_err(endpoint not set, target = %d\n, target); + vhost_discard_vq_desc(vq, 1); + break; + } + exp_data_len = 0; for (i = 0; i data_num; i++) exp_data_len += vq-iov[data_first + i].iov_len; @@ -771,14 +799,11 @@ static int vhost_scsi_set_endpoint( } tv_tport = tv_tpg-tport; - if (!strcmp(tv_tport-tport_name, t-vhost_wwpn) - (tv_tpg-tport_tpgt == t-vhost_tpgt)) { + if (!strcmp(tv_tport-tport_name, t-vhost_wwpn)) { tv_tpg-tv_tpg_vhost_count++; - mutex_unlock(tv_tpg-tv_tpg_mutex); - mutex_unlock(tcm_vhost_mutex); mutex_lock(vs-dev.mutex); - if (vs-vs_tpg) { + if (vs-vs_tpg[tv_tpg-tport_tpgt - 1]) { mutex_unlock(vs-dev.mutex); mutex_lock(tv_tpg-tv_tpg_mutex); tv_tpg-tv_tpg_vhost_count--; @@ -786,15 +811,17 @@ static int vhost_scsi_set_endpoint( return -EEXIST; } - vs-vs_tpg = tv_tpg; + vs-vs_tpg[tv_tpg-tport_tpgt - 1] = tv_tpg; tv_tpg-tport_tpgt starts from 0, right? I thought it starts from 1, because I always got it starts from 1 in targetcli. o- vhost o- naa.6001405bd4e8476d o- tpg1 o- luns o- lun0 o- tpg2 o- luns o- lun0 o- tpg3 o- luns o- lun0
Re: [Qemu-devel] KVM call minutes 2013-01-29 - Port I/O
On Thu, Jan 31, 2013 at 09:34:03AM -0700, Alex Williamson wrote: On Thu, 2013-01-31 at 12:49 +0200, Michael S. Tsirkin wrote: On Wed, Jan 30, 2013 at 04:28:30PM -0700, Alex Williamson wrote: On Thu, 2013-01-31 at 10:02 +1100, Benjamin Herrenschmidt wrote: On Thu, 2013-01-31 at 00:49 +0200, Michael S. Tsirkin wrote: In practice they do (VGA at least) From a SW modelling standpoint, I don't think it's worth differentiating PCI and PCIE. Cheers, Ben. Interesting. Do you have such hardware? Could you please dump the output of lspci -vv? Any ATI or nVidia card still supports hard decoding of VGA regions for the sake of legacy operating systems and BIOSes :-) I don't know about Intel but I suppose it's the same. For example: -[:00]-+-00.0 Advanced Micro Devices [AMD] nee ATI RD890 PCI to PCI bridge (external gfx0 p +-04.0-[02]--+-00.0 Advanced Micro Devices [AMD] nee ATI Cedar PRO [Radeon HD 5450/6350] 00:04.0 PCI bridge: Advanced Micro Devices [AMD] nee ATI RD890 PCI to PCI bridge (PCI express gpp port D) (prog-if 00 [Normal decode]) Control: I/O+ Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR- FastB2B- DisINTx- Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast TAbort- TAbort- MAbort- SERR- PERR- INTx- Latency: 0, Cache Line Size: 64 bytes Bus: primary=00, secondary=02, subordinate=02, sec-latency=0 I/O behind bridge: c000-cfff Memory behind bridge: fd10-fd1f Prefetchable memory behind bridge: d000-dfff Secondary status: 66MHz- FastB2B- ParErr- DEVSEL=fast TAbort- TAbort- MAbort+ SERR- PERR- BridgeCtl: Parity- SERR- NoISA- VGA+ MAbort- Reset- FastB2B- VGA+ (VGA Enable) indicates positive decode of 0x3b0 - 0x3bb, 0x3c0 - 0x3df, and 0xa - 0xbfff. Device 2:00.0 of course doesn't report these ISA ranges as they're implicit in the VGA class code. OK but this appears behind a bridge. So the bridge configuration tells the root complex where to send accesses to the VGA. But qemu currently puts devices directly on root bus. 
And as far as I can tell when we present devices directly on bus 0, we pretend these are integrated in the root complex. The spec seems to say explicitly that root complex integrated devices should not use legacy addresses or support hotplug. So I would be surprised if such one appears in real world. Luckily guests do not seem to be worried as long as we use ACPI. Yes, in fact I just figured out last night that Windows is unhappy with assigned PCI devices on bus 0 that claim to be an endpoint in their PCIe capability rather than an integrated endpoint. We'll need to do extra mangling of the PCIe capability to massage it into the guest visible topology. For now, just put you device behind an express bridge. This breaks acpi hotplug for now, but I'm looking into hotplug with bridges anyway. If you really need it I can give you a hack for hotplug too. Of course express does not allow hotplug of root complex parts but happens to work because we use ACPI. Section 1.3.2.3 of the 3.0 spec says integrated endpoints must not require I/O resources claimed through BAR(s). VGA skirts around this by not having the legacy resources claimed by BARs, but instead being implicit. Aha. I missed this point. Are there other sections restricting legacy I/O? One other interesting things is that VGA enable bit (for bridge control register) does not appear in express spec at all. It's common that a plugin VGA card sits behind a root port where the bridge registers tell us about VGA routing, but integrated VGA devices are often on bus 0 though, here's an example: -[:00]-+-00.0 Intel Corporation 2nd Generation Core Processor Family DRAM Controller +-02.0 Intel Corporation 2nd Generation Core Processor Family Integrated Graphics Controller Often these systems will disable the integrated graphics when a plugin graphics is installed below a root port. I'm not sure how the system knows to route VGA to the integrated device vs the root port otherwise. 
I am guessing it disables the integrated graphics? Here's a more interesting example: -+-[:01]-+-00.0 NVIDIA Corporation GT218 [GeForce G210M] | \-00.1 NVIDIA Corporation High Definition Audio Controller \-[:00]-+-00.0 Intel Corporation Mobile 4 Series Chipset Memory Controller Hub +-01.0 Intel Corporation Mobile 4 Series Chipset PCI Express Graphics Port This system seems to have two host bridges with VGA behind each of them. There's no bridge to control VGA routing, so I don't know how the selection is done. Is IO space disabled for the inactive card? Maybe that is how. It's possible the g210m never sees
Re: [Qemu-devel] KVM call minutes 2013-01-29 - Port I/O
On Thu, 2013-01-31 at 23:11 +0200, Michael S. Tsirkin wrote: On Thu, Jan 31, 2013 at 09:34:03AM -0700, Alex Williamson wrote: On Thu, 2013-01-31 at 12:49 +0200, Michael S. Tsirkin wrote: On Wed, Jan 30, 2013 at 04:28:30PM -0700, Alex Williamson wrote: On Thu, 2013-01-31 at 10:02 +1100, Benjamin Herrenschmidt wrote: On Thu, 2013-01-31 at 00:49 +0200, Michael S. Tsirkin wrote: In practice they do (VGA at least) From a SW modelling standpoint, I don't think it's worth differentiating PCI and PCIE. Cheers, Ben. Interesting. Do you have such hardware? Could you please dump the output of lspci -vv? Any ATI or nVidia card still supports hard decoding of VGA regions for the sake of legacy operating systems and BIOSes :-) I don't know about Intel but I suppose it's the same. For example: -[:00]-+-00.0 Advanced Micro Devices [AMD] nee ATI RD890 PCI to PCI bridge (external gfx0 p +-04.0-[02]--+-00.0 Advanced Micro Devices [AMD] nee ATI Cedar PRO [Radeon HD 5450/6350] 00:04.0 PCI bridge: Advanced Micro Devices [AMD] nee ATI RD890 PCI to PCI bridge (PCI express gpp port D) (prog-if 00 [Normal decode]) Control: I/O+ Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR- FastB2B- DisINTx- Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast TAbort- TAbort- MAbort- SERR- PERR- INTx- Latency: 0, Cache Line Size: 64 bytes Bus: primary=00, secondary=02, subordinate=02, sec-latency=0 I/O behind bridge: c000-cfff Memory behind bridge: fd10-fd1f Prefetchable memory behind bridge: d000-dfff Secondary status: 66MHz- FastB2B- ParErr- DEVSEL=fast TAbort- TAbort- MAbort+ SERR- PERR- BridgeCtl: Parity- SERR- NoISA- VGA+ MAbort- Reset- FastB2B- VGA+ (VGA Enable) indicates positive decode of 0x3b0 - 0x3bb, 0x3c0 - 0x3df, and 0xa - 0xbfff. Device 2:00.0 of course doesn't report these ISA ranges as they're implicit in the VGA class code. OK but this appears behind a bridge. So the bridge configuration tells the root complex where to send accesses to the VGA. 
But qemu currently puts devices directly on root bus. And as far as I can tell when we present devices directly on bus 0, we pretend these are integrated in the root complex. The spec seems to say explicitly that root complex integrated devices should not use legacy addresses or support hotplug. So I would be surprised if such one appears in real world. Luckily guests do not seem to be worried as long as we use ACPI. Yes, in fact I just figured out last night that Windows is unhappy with assigned PCI devices on bus 0 that claim to be an endpoint in their PCIe capability rather than an integrated endpoint. We'll need to do extra mangling of the PCIe capability to massage it into the guest visible topology. For now, just put you device behind an express bridge. This breaks acpi hotplug for now, but I'm looking into hotplug with bridges anyway. We have the problem in both directions though, Endpoints that should be Integrated Endpoints and Integrated Endpoints that should be Endpoints. So I think we need to mangle the type. If you really need it I can give you a hack for hotplug too. Of course express does not allow hotplug of root complex parts but happens to work because we use ACPI. That's a little odd. Section 1.3.2.3 of the 3.0 spec says integrated endpoints must not require I/O resources claimed through BAR(s). VGA skirts around this by not having the legacy resources claimed by BARs, but instead being implicit. Aha. I missed this point. Are there other sections restricting legacy I/O? One other interesting things is that VGA enable bit (for bridge control register) does not appear in express spec at all. Yep, but it appears on hardware. 
It's common that a plugin VGA card sits behind a root port where the bridge registers tell us about VGA routing, but integrated VGA devices are often on bus 0 though, here's an example: -[:00]-+-00.0 Intel Corporation 2nd Generation Core Processor Family DRAM Controller +-02.0 Intel Corporation 2nd Generation Core Processor Family Integrated Graphics Controller Often these systems will disable the integrated graphics when a plugin graphics is installed below a root port. I'm not sure how the system knows to route VGA to the integrated device vs the root port otherwise. I am guessing it disables the integrated graphics? Here's a more interesting example: -+-[:01]-+-00.0 NVIDIA Corporation GT218 [GeForce G210M] | \-00.1 NVIDIA Corporation High Definition
Re: [Qemu-devel] KVM call minutes 2013-01-29 - Port I/O
On Thu, 2013-01-31 at 12:49 +0200, Michael S. Tsirkin wrote: OK but this appears behind a bridge. So the bridge configuration tells the root complex where to send accesses to the VGA. Sort-of, again the root complex isn't sending anything targeted here. PCIe is point to point and any device is behind a bridge, real or virtual. But qemu currently puts devices directly on root bus. Sure, because qemu doesn't specifically model PCIe but something else And as far as I can tell when we present devices directly on bus 0, we pretend these are integrated in the root complex. Right, it's a bit gross. The spec seems to say explicitly that root complex integrated devices should not use legacy addresses or support hotplug. So I would be surprised if such one appears in real world. Sure but that doesn't change the fact that there's no point in treating things differently between PCI and PCIe for the sake of address range decoding. The high level model remains the same. Luckily guests do not seem to be worried as long as we use ACPI. Right, it all just looks like PCI to the guest anyway and is mostly treated as such for the sake of routing and decoding (until you turn on ARI but that's a different can of worms). BTW, I've been working on vfio-pci support of VGA assignment which makes use of the VGA arbiter in the host to manipulate the VGA Enable control register, allowing us to select which device to access. The qemu side is simply registering memory regions for the VGA areas and expecting to be used with -vga none, but I'll adopt whatever strategy we choose for hard coded address range support. Current base patches at the links below. 
Thanks, Alex https://github.com/awilliam/qemu-vfio/commit/ea2befa59010a429dcf13c10dbccdf8b64e82fbd https://github.com/awilliam/linux-vfio/commit/bae182d929229cbf1eaeb01e5fad4f77f81a4c61 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] KVM call minutes 2013-01-29 - Port I/O
On Thu, 2013-01-31 at 09:34 -0700, Alex Williamson wrote: Luckily guests do not seem to be worried as long as we use ACPI. Yes, in fact I just figured out last night that Windows is unhappy with assigned PCI devices on bus 0 that claim to be an endpoint in their PCIe capability rather than an integrated endpoint. We'll need to do extra mangling of the PCIe capability to massage it into the guest visible topology. If you are on bus 0, you need to either not have the capability, or if you do, have it be root complex or RC integrated endpoint. It's fair game for any OS to assume that an endpoint will have a parent bridge (either a RC or a downstream port) and to muck around with link control etc... Typically on my laptop with intel chipset, bus 0 has devices that just don't have any PCIe capabilities. Section 1.3.2.3 of the 3.0 spec says integrated endpoints must not require I/O resources claimed through BAR(s). VGA skirts around this by not having the legacy resources claimed by BARs, but instead being implicit. Are there other sections restricting legacy I/O? Right this is odd, I don't know why they put that in. Legacy endpoints don't have that limitation and I doubt system software actually cares. On the other hand, I suspect that doesn't apply if you simply don't have the PCIe capability at all :-) IE, that's basically what my laptop looks like here. The Intel graphics appears on bus 0 and has IO ports mapped with a BAR and no PCIe cap. Same with the on-chip SATA. In fact they have a PCI Advanced features capability, but not PCIe. Then they have a bunch of root complexes as siblings. 
It's common that a plugin VGA card sits behind a root port where the bridge registers tell us about VGA routing, but integrated VGA devices are often on bus 0 though, here's an example: -[:00]-+-00.0 Intel Corporation 2nd Generation Core Processor Family DRAM Controller +-02.0 Intel Corporation 2nd Generation Core Processor Family Integrated Graphics Controller Often these systems will disable the integrated graphics when a plugin graphics is installed below a root port. I'm not sure how the system knows to route VGA to the integrated device vs the root port otherwise. It's a good question... I would say the cleanest way is to use the VGA Enable bit of the root complex. If the RC is set to forward downstream, then the plug-in card gets the VGA cycles, else, they go to the integrated one (subtractive-decoding style). However, the PCI-E spec has removed that bit from the bridge control register definition :-) So whatever mechanism those chipsets use has to be somewhat proprietary. On the other hand, I don't see it hurting to make our own proprietary mechanism consist of using ... the bridge control VGA enable bit. IE. The bit is not used in the PCIe spec and probably never will be so we can use it for its original purpose. Here's a more interesting example: -+-[:01]-+-00.0 NVIDIA Corporation GT218 [GeForce G210M] | \-00.1 NVIDIA Corporation High Definition Audio Controller \-[:00]-+-00.0 Intel Corporation Mobile 4 Series Chipset Memory Controller Hub +-01.0 Intel Corporation Mobile 4 Series Chipset PCI Express Graphics Port This system seems to have two host bridges with VGA behind each of them. There's no bridge to control VGA routing, so I don't know how the selection is done. It's possible the g210m never sees legacy VGA accesses in this mode. This bios has another mode which makes the g210m the primary graphics and hides the integrated graphics, essentially the same as I mention above with hiding integrated endpoint graphics when plugin graphics are used. 
Thanks, Wait, those are two different busses ... and there's no bridge ? Is that the funky x86 multi domain crackpot where you have multiple roots with non overlapping bus numbers in the same domain ? Ben. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] KVM call minutes 2013-01-29 - Port I/O
On Thu, Jan 31, 2013 at 02:21:50PM -0700, Alex Williamson wrote: On Thu, 2013-01-31 at 23:11 +0200, Michael S. Tsirkin wrote: On Thu, Jan 31, 2013 at 09:34:03AM -0700, Alex Williamson wrote: On Thu, 2013-01-31 at 12:49 +0200, Michael S. Tsirkin wrote: On Wed, Jan 30, 2013 at 04:28:30PM -0700, Alex Williamson wrote: On Thu, 2013-01-31 at 10:02 +1100, Benjamin Herrenschmidt wrote: On Thu, 2013-01-31 at 00:49 +0200, Michael S. Tsirkin wrote: In practice they do (VGA at least) From a SW modelling standpoint, I don't think it's worth differentiating PCI and PCIE. Cheers, Ben. Interesting. Do you have such hardware? Could you please dump the output of lspci -vv? Any ATI or nVidia card still supports hard decoding of VGA regions for the sake of legacy operating systems and BIOSes :-) I don't know about Intel but I suppose it's the same. For example: -[:00]-+-00.0 Advanced Micro Devices [AMD] nee ATI RD890 PCI to PCI bridge (external gfx0 p +-04.0-[02]--+-00.0 Advanced Micro Devices [AMD] nee ATI Cedar PRO [Radeon HD 5450/6350] 00:04.0 PCI bridge: Advanced Micro Devices [AMD] nee ATI RD890 PCI to PCI bridge (PCI express gpp port D) (prog-if 00 [Normal decode]) Control: I/O+ Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR- FastB2B- DisINTx- Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast TAbort- TAbort- MAbort- SERR- PERR- INTx- Latency: 0, Cache Line Size: 64 bytes Bus: primary=00, secondary=02, subordinate=02, sec-latency=0 I/O behind bridge: c000-cfff Memory behind bridge: fd10-fd1f Prefetchable memory behind bridge: d000-dfff Secondary status: 66MHz- FastB2B- ParErr- DEVSEL=fast TAbort- TAbort- MAbort+ SERR- PERR- BridgeCtl: Parity- SERR- NoISA- VGA+ MAbort- Reset- FastB2B- VGA+ (VGA Enable) indicates positive decode of 0x3b0 - 0x3bb, 0x3c0 - 0x3df, and 0xa - 0xbfff. Device 2:00.0 of course doesn't report these ISA ranges as they're implicit in the VGA class code. OK but this appears behind a bridge. 
So the bridge configuration tells the root complex where to send accesses to the VGA. But qemu currently puts devices directly on root bus. And as far as I can tell when we present devices directly on bus 0, we pretend these are integrated in the root complex. The spec seems to say explicitly that root complex integrated devices should not use legacy addresses or support hotplug. So I would be surprised if such one appears in real world. Luckily guests do not seem to be worried as long as we use ACPI. Yes, in fact I just figured out last night that Windows is unhappy with assigned PCI devices on bus 0 that claim to be an endpoint in their PCIe capability rather than an integrated endpoint. We'll need to do extra mangling of the PCIe capability to massage it into the guest visible topology. For now, just put you device behind an express bridge. This breaks acpi hotplug for now, but I'm looking into hotplug with bridges anyway. We have the problem in both directions though, Endpoints that should be Integrated Endpoints and Integrated Endpoints that should be Endpoints. So I think we need to mangle the type. If you really need it I can give you a hack for hotplug too. Of course express does not allow hotplug of root complex parts but happens to work because we use ACPI. That's a little odd. Section 1.3.2.3 of the 3.0 spec says integrated endpoints must not require I/O resources claimed through BAR(s). VGA skirts around this by not having the legacy resources claimed by BARs, but instead being implicit. Aha. I missed this point. Are there other sections restricting legacy I/O? One other interesting things is that VGA enable bit (for bridge control register) does not appear in express spec at all. Yep, but it appears on hardware. 
It's common that a plugin VGA card sits behind a root port where the bridge registers tell us about VGA routing, but integrated VGA devices are often on bus 0 though, here's an example: -[:00]-+-00.0 Intel Corporation 2nd Generation Core Processor Family DRAM Controller +-02.0 Intel Corporation 2nd Generation Core Processor Family Integrated Graphics Controller Often these systems will disable the integrated graphics when a plugin graphics is installed below a root port. I'm not sure how the system knows to route VGA to the integrated device vs the root port otherwise. I am guessing it disables the
Re: [Qemu-devel] KVM call minutes 2013-01-29 - Port I/O
On Fri, Feb 01, 2013 at 08:22:33AM +1100, Benjamin Herrenschmidt wrote: On Thu, 2013-01-31 at 12:49 +0200, Michael S. Tsirkin wrote: OK but this appears behind a bridge. So the bridge configuration tells the root complex where to send accesses to the VGA. Sort-of, again the root complex isn't sending anything targeted here. PCIe is point to point and any device is behind a bridge, real or virtual. I think we are arguing about terminology here. root complex has a virtual bridge for each port, presumably it examines bridge control for each port to know which link to use for a VGA access. I say presumably because VGA enable bit in bridge control is not listed in spec (but as Alex says some real hardware has it implemented). But qemu currently puts devices directly on root bus. Sure, because qemu doesn't specifically model PCIe but something else And as far as I can tell when we present devices directly on bus 0, we pretend these are integrated in the root complex. Right, it's a bit gross. The spec seems to say explicitly that root complex integrated devices should not use legacy addresses or support hotplug. So I would be surprised if such one appears in real world. Sure but that doesn't change the fact that there's no point in treating things differently between PCI and PCIe for the sake of address range decoding. The high level model remains the same. Yes, and it's not by chance. Luckily guests do not seem to be worried as long as we use ACPI. Right, it all just looks like PCI to the guest anyway and is mostly treated as such for the sake of routing and decoding (until you turn on ARI but that's a different can of worms). Right, ARI only affects config cycles. BTW, I've been working on vfio-pci support of VGA assignment which makes use of the VGA arbiter in the host to manipulate the VGA Enable control register, allowing us to select which device to access. 
The qemu side is simply registering memory regions for the VGA areas and expecting to be used with -vga none, but I'll adopt whatever strategy we choose for hard coded address range support. Current base patches at the links below. Thanks, Alex https://github.com/awilliam/qemu-vfio/commit/ea2befa59010a429dcf13c10dbccdf8b64e82fbd https://github.com/awilliam/linux-vfio/commit/bae182d929229cbf1eaeb01e5fad4f77f81a4c61 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] KVM call minutes 2013-01-29 - Port I/O
Here's a more interesting example: -+-[:01]-+-00.0 NVIDIA Corporation GT218 [GeForce G210M] | \-00.1 NVIDIA Corporation High Definition Audio Controller \-[:00]-+-00.0 Intel Corporation Mobile 4 Series Chipset Memory Controller Hub +-01.0 Intel Corporation Mobile 4 Series Chipset PCI Express Graphics Port This system seems to have two host bridges with VGA behind each of them. There's no bridge to control VGA routing, so I don't know how the selection is done. It's possible the g210m never sees legacy VGA accesses in this mode. This bios has another mode which makes the g210m the primary graphics and hides the integrated graphics, essentially the same as I mention above with hiding integrated endpoint graphics when plugin graphics are used. Thanks, Wait, those are two different busses ... and there's no bridge ? Is that the funky x86 multi domain crackpot where you have multiple roots with non overlapping bus numbers in the same domain ? Ben. Domain numbering on x86 comes from firmware and you know what Linus said about firmware developers ... -- MST -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 8/8] KVM:PPC:booke: Allow debug interrupt injection to guest
On 01/31/2013 01:20:39 PM, Alexander Graf wrote: On 31.01.2013, at 20:05, Alexander Graf wrote: On 31.01.2013, at 19:54, Scott Wood wrote: On 01/31/2013 12:52:41 PM, Alexander Graf wrote: On 31.01.2013, at 19:43, Scott Wood wrote: On 01/31/2013 12:21:07 PM, Alexander Graf wrote: How about something like this? Then both targets at least suck as much :). I'm not sure that should be the goal... Thanks to e500mc's awful hardware design, we don't know who sets the MSR_DE bit. Once we forced it onto the guest, we have no chance to know whether the guest also set it or not. We could only guess. MSRP[DEP] can prevent the guest from modifying MSR[DE] -- but we still need to set it in the first place. According to ISA V2.06B, the hypervisor should set DBCR0[EDM] to let the guest know that the debug resources are not available, and that the value of MSR[DE] is not specified and not modifiable. So what would the guest do then to tell the hypervisor that it actually wants to know about debug events? The guest is out of luck, just as if a JTAG were in use. Hrm. Can we somehow generalize this out of luck behavior? Every time we would set or clear an MSR bit in shadow_msr on e500v2, we would instead set or clear it in the real MSR. That way only e500mc is out of luck, but the code would still be shared. I don't follow. e500v2 is just as out-of-luck. The mechanism simply does not support sharing debug resources. What do you mean by the real MSR? The real MSR is shadow_msr, and MSR_DE must always be set there if the host is debugging the guest. As for reflecting it into the guest MSR, we could, but I don't really see the point. We're never going to actually send a debug exception to the guest when the host owns the debug resources. Speaking of naming issues, guest_debug is very ambiguous... 
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 38a62ef..9bdb845 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -133,6 +133,29 @@ static void kvmppc_vcpu_sync_fpu(struct kvm_vcpu *vcpu) #endif } +static void kvmppc_vcpu_sync_debug(struct kvm_vcpu *vcpu) +{ + u32 is_debug = vcpu->arch.shared->msr & MSR_DE; + + /* Force debug to on in guest space when user space wants to debug */ + if (vcpu->guest_debug) + is_debug = MSR_DE; + +#ifdef CONFIG_KVM_BOOKE_HV + /* +* Since there is no shadow MSR, sync MSR_DE into the guest +* visible MSR. +*/ + vcpu->arch.shared->msr &= ~MSR_DE; + vcpu->arch.shared->msr |= is_debug; +#endif + +#ifndef CONFIG_KVM_BOOKE_HV + vcpu->arch.shadow_msr &= ~MSR_DE; + vcpu->arch.shadow_msr |= is_debug; +#endif +} The &= ~MSR_DE line is pointless on bookehv, and makes it harder to read. I had to stare at it a while before noticing that you initially set is_debug from the guest MSR and that you'd never really clear MSR_DE here on bookehv. -Scott -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] KVM call minutes 2013-01-29 - Port I/O
On Fri, 2013-02-01 at 08:44 +1100, Benjamin Herrenschmidt wrote: On Thu, 2013-01-31 at 09:34 -0700, Alex Williamson wrote: Luckily guests do not seem to be worried as long as we use ACPI. Yes, in fact I just figured out last night that Windows is unhappy with assigned PCI devices on bus 0 that claim to be an endpoint in their PCIe capability rather than an integrated endpoint. We'll need to do extra mangling of the PCIe capability to massage it into the guest visible topology. If you are on bus 0, you need to either not have the capability, or if you do, have it be root complex or RC intergrated endpoint. It's fair game for any OS to assume that an endpoint will have a parent bridge (either a RC or a downstream port) and to muck around with link control etc... Yep, converting Endpoint to Integrated Endpoint is just a matter of changing the guest visible type and hiding all the link(2) cap, control, and status. Integrated Endpoint to Endpoint appears to require inventing some link capabilities since it's a required field. Legacy Endpoint to Integrated Endpoint seems incompatible, but I don't think we model anything at a level that would care. We could also take the opportunity to remove the PCIe capability when exposing devices on 440fx, but I'm nervous that would break drivers that are dumb and look for it anyway. Typically on my laptop with intel chipset, bus 0 has devices that just don't have any PCIe capabilities. Oddly the audio device seems to be the only one that consistently has it. Section 1.3.2.3 of the 3.0 spec says integrated endpoints must not require I/O resources claimed through BAR(s). VGA skirts around this by not having the legacy resources claimed by BARs, but instead being implicit. Are there other sections restricting legacy I/O? Right this is odd, I don't know why they put that in. Legacy endpoints don't have that limitation and I doubt system software actually cares. 
On the other hand, I suspect that doesn't apply if you simply doesn't have the PCIe capability at all :-) IE, that's basically what my laptop looks like here. The Intel graphics appears on bus 0 and has IO ports mapped with a BAR and no PCIe cap. Same with the on-chip SATA. In fact they have a PCI Advanced features capability, but not PCIe. Then they have a bunch of root complexes as siblings. It's common that a plugin VGA card sits behind a root port where the bridge registers tell us about VGA routing, but integrated VGA devices are often on bus 0 though, here's an example: -[:00]-+-00.0 Intel Corporation 2nd Generation Core Processor Family DRAM Controller +-02.0 Intel Corporation 2nd Generation Core Processor Family Integrated Graphics Controller Often these systems will disable the integrated graphics when a plugin graphics is installed below a root port. I'm not sure how the system knows to route VGA to the integrated device vs the root port otherwise. It's a good question... I would say the cleanest way is to use the VGA Enable bit of the root complex. If the RC is set to forward downstream, then the plug-in card gets the VGA cycles, else, they go to the integrated one (substractive decoding -style). However, the PCI-E spec has removed that bit from the bridge control register definition :-) So whatever mechanism those chipsets use has to be somewhat proprietary. On the other hand, I don't see it hurting to make our own proprietary mechanism consist of using ... the bridge control VGA enable bit. IE. The bit is not used in the PCIe spec and probably never will be so we can use it for its original purpose. Yes, our emulated root ports should include this, otherwise we have little hope of properly supporting multiple assigned (or emulated) graphics devices, each behind their own root port. So we need the ability for multiple devices to register VGA address (1 per bus?) and change MemoryRegion routing just like hardware does. 
Here's a more interesting example: -+-[:01]-+-00.0 NVIDIA Corporation GT218 [GeForce G210M] | \-00.1 NVIDIA Corporation High Definition Audio Controller \-[:00]-+-00.0 Intel Corporation Mobile 4 Series Chipset Memory Controller Hub +-01.0 Intel Corporation Mobile 4 Series Chipset PCI Express Graphics Port This system seems to have two host bridges with VGA behind each of them. There's no bridge to control VGA routing, so I don't know how the selection is done. It's possible the g210m never sees legacy VGA accesses in this mode. This bios has another mode which makes the g210m the primary graphics and hides the integrated graphics, essentially the same as I mention above with hiding integrated endpoint graphics when plugin graphics are used. Thanks, Wait, those are two different busses ... and there's no bridge ? Is that the funky x86 multi domain crackpot where you have multiple roots
Re: [PATCH 8/8] KVM:PPC:booke: Allow debug interrupt injection to guest
On 31.01.2013, at 23:40, Scott Wood wrote: On 01/31/2013 01:20:39 PM, Alexander Graf wrote: On 31.01.2013, at 20:05, Alexander Graf wrote: On 31.01.2013, at 19:54, Scott Wood wrote: On 01/31/2013 12:52:41 PM, Alexander Graf wrote: On 31.01.2013, at 19:43, Scott Wood wrote: On 01/31/2013 12:21:07 PM, Alexander Graf wrote: How about something like this? Then both targets at least suck as much :). I'm not sure that should be the goal... Thanks to e500mc's awful hardware design, we don't know who sets the MSR_DE bit. Once we forced it onto the guest, we have no change to know whether the guest also set it or not. We could only guess. MSRP[DEP] can prevent the guest from modifying MSR[DE] -- but we still need to set it in the first place. According to ISA V2.06B, the hypervisor should set DBCR0[EDM] to let the guest know that the debug resources are not available, and that the value of MSR[DE] is not specified and not modifiable. So what would the guest do then to tell the hypervisor that it actually wants to know about debug events? The guest is out of luck, just as if a JTAG were in use. Hrm. Can we somehow generalize this out of luck behavior? Every time we would set or clear an MSR bit in shadow_msr on e500v2, we would instead set or clear it in the real MSR. That way only e500mc is out of luck, but the code would still be shared. I don't follow. e500v2 is just as out-of-luck. The mechanism simply does not support sharing debug resources. For e500v2 we have 2 fields * MSR as the guest sees it * MSR as we execute when the guest runs Since we know the MSR when the guest sees it, we can decide what to do when we get an unhandled debug interrupt. We can simulate what hardware would do depending on the guest's MSR_DE setting. For e500mc we only have * MSR as the guest sees it and as we execute when the guest runs Because there is only one field, as soon as we OR MSR_DE into there, we can no longer distinguish whether the guest wanted to have MSR_DE enabled or not. 
What do you mean by the real MSR? The real MSR is shadow_msr, and MSR_DE must always be set there if the host is debugging the guest. As for reflecting it into the guest MSR, we could, but I don't really see the point. We're never going to actually send a debug exception to the guest when the host owns the debug resources. Why not? That's the whole point of jumping through user space. 1) guest exits with debug interrupt 2) QEMU gets a debug exit 3) QEMU checks in its list whether it belongs to its own debug points 4) if not, it reinjects the interrupt into the guest Step 4 is pretty difficult to do when we don't know whether the guest is actually capable of handling debug interrupts at that moment. Speaking of naming issues, guest_debug is very ambiguous... I agree. diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 38a62ef..9bdb845 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -133,6 +133,29 @@ static void kvmppc_vcpu_sync_fpu(struct kvm_vcpu *vcpu) #endif } +static void kvmppc_vcpu_sync_debug(struct kvm_vcpu *vcpu) +{ +u32 is_debug = vcpu-arch.shared-msr MSR_DE; + +/* Force debug to on in guest space when user space wants to debug */ +if (vcpu-guest_debug) +is_debug = MSR_DE; + +#ifdef CONFIG_KVM_BOOKE_HV +/* + * Since there is no shadow MSR, sync MSR_DE into the guest + * visible MSR. + */ +vcpu-arch.shared-msr = ~MSR_DE; +vcpu-arch.shared-msr |= is_debug; +#endif + +#ifndef CONFIG_KVM_BOOKE_HV +vcpu-arch.shadow_msr = ~MSR_DE; +vcpu-arch.shadow_msr |= is_debug; +#endif +} The = ~MSR_DE line is pointless on bookehv, and makes it harder to read. I had to stare at it a while before noticing that you initially set is_debug from the guest MSR and that you'd never really clear MSR_DE here on bookehv. Well, I'm mostly bouncing ideas here to find a way to express what we're trying to say in a way that someone who hasn't read this email thread would still understand what's going on :). How about this version? 
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 38a62ef..9929c41 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -133,6 +133,28 @@ static void kvmppc_vcpu_sync_fpu(struct kvm_vcpu *vcpu) #endif } +static void kvmppc_vcpu_sync_debug(struct kvm_vcpu *vcpu) +{ +#ifndef CONFIG_KVM_BOOKE_HV + /* Synchronize guest's desire to get debug interrupts into shadow MSR */ + vcpu-arch.shadow_msr = ~MSR_DE; + vcpu-arch.shadow_msr |= vcpu-arch.shared-msr MSR_DE; +#endif + + /* Force enable debug interrupts when user space wants to debug */ + if (vcpu-guest_debug) { +#ifdef CONFIG_KVM_BOOKE_HV + /* +* Since there is no shadow MSR, sync MSR_DE into the guest +* visible
KVM Test report, kernel 3f0c3d0b... qemu 4d9367b7...
Hi All, This is the KVM upstream test result against the kvm.git next branch and the qemu-kvm.git master branch. kvm.git next branch: 3f0c3d0bb2bcc4b88b22452a7cf0073ee9a0f1e6 based on kernel 3.7.0 qemu-kvm.git master branch: 4d9367b76f71c6d938cf8201392abe4bfb1136cb We found no new bugs, and no bugs were fixed in the past two weeks. New issue (0): Fixed issue (0): Old issues (6): -- 1. Nested-virt: L1 (kvm on kvm) guest panic with parameter -cpu host in qemu command line. https://bugs.launchpad.net/qemu/+bug/994378 2. Can't install or boot up 32bit win8 guest. https://bugs.launchpad.net/qemu/+bug/1007269 3. vCPU hot-add makes the guest abort. https://bugs.launchpad.net/qemu/+bug/1019179 4. Nested Virt: VMX can't be initialized in L1 Xen (Xen on KVM) https://bugzilla.kernel.org/show_bug.cgi?id=45931 5. Guest has no xsave feature with parameter -cpu qemu64,+xsave in qemu command line. https://bugs.launchpad.net/qemu/+bug/1042561 6. Guest hang when doing kernel build and writing data in guest. https://bugs.launchpad.net/qemu/+bug/1096814 Test environment: == Platform Westmere-EP Sandybridge-EP CPU Cores 24 32 Memory size 24G 32G Regards Yongjie Ren (Jay) -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH 0/8] KVM: BOOKE/BOOKEHV : Added debug stub support
-Original Message- From: kvm-ppc-ow...@vger.kernel.org [mailto:kvm-ppc-ow...@vger.kernel.org] On Behalf Of Alexander Graf Sent: Friday, January 25, 2013 6:08 PM To: Bhushan Bharat-R65777 Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Bhushan Bharat-R65777 Subject: Re: [PATCH 0/8] KVM: BOOKE/BOOKEHV : Added debug stub support On 16.01.2013, at 09:20, Bharat Bhushan wrote: This patchset adds the QEMU debug stub support for powerpc (booke/bookehv). [1/8] KVM: PPC: booke: use vcpu reference from thread_struct - This is a cleanup patch to use vcpu reference from thread struct [2/8] KVM: PPC: booke: Allow multiple exception types [3/8] KVM: PPC: booke: Added debug handler - These two patches install the KVM debug handler. [4/8] Added ONE_REG interface for debug instruction - Add the ioctl interface to get the debug instruction for setting software breakpoint from QEMU debug stub. [5/8] KVM: PPC: debug stub interface parameter defined [6/8] booke: Added DBCR4 SPR number [7/8] KVM: booke/bookehv: Add debug stub support - Add the debug stub interface on booke/bookehv [8/8] KVM:PPC:booke: Allow debug interrupt injection to guest -- with this qemu can inject debug interrupt to guest Thanks, applied 1/8, 2/8, 6/8. Alex I cannot see these 3 patches on kvm-ppc-next branch. Are those applied on some other branch ? Thanks -Bharat -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] tcm_vhost: Multi-target support
On 01/31/2013 07:13 PM, Michael S. Tsirkin wrote: On Thu, Jan 31, 2013 at 05:28:21PM +0800, Asias He wrote: Hello Nicholas, On 01/31/2013 03:33 PM, Asias He wrote: In order to take advantages of Paolo's multi-queue virito-scsi, we need multi-target support in tcm_vhost first. Otherwise all the requests go to one queue and other queues are idle. This patch makes: 1. All the targets under the wwpn is seen and can be used by guest. 2. No need to pass the tpgt number in struct vhost_scsi_target to tcm_vhost.ko. Only wwpn is needed. 3. We can always pass max_target = 255 to guest now, since we abort the request who's target id does not exist. Signed-off-by: Asias He as...@redhat.com --- drivers/vhost/tcm_vhost.c | 115 -- drivers/vhost/tcm_vhost.h | 4 +- 2 files changed, 74 insertions(+), 45 deletions(-) diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/tcm_vhost.c index 218deb6..d50cb95 100644 --- a/drivers/vhost/tcm_vhost.c +++ b/drivers/vhost/tcm_vhost.c @@ -59,13 +59,18 @@ enum { VHOST_SCSI_VQ_IO = 2, }; +#define VHOST_SCSI_MAX_TARGET 256 + struct vhost_scsi { - struct tcm_vhost_tpg *vs_tpg; /* Protected by vhost_scsi-dev.mutex */ + /* Protected by vhost_scsi-dev.mutex */ + struct tcm_vhost_tpg *vs_tpg[VHOST_SCSI_MAX_TARGET]; struct vhost_dev dev; struct vhost_virtqueue vqs[3]; struct vhost_work vs_completion_work; /* cmd completion work item */ struct llist_head vs_completion_list; /* cmd completion queue */ + char vs_vhost_wwpn[TRANSPORT_IQN_LEN]; + int vs_num_target; }; /* Local pointer to allocated TCM configfs fabric module */ @@ -564,13 +569,7 @@ static void vhost_scsi_handle_vq(struct vhost_scsi *vs) u32 exp_data_len, data_first, data_num, data_direction; unsigned out, in, i; int head, ret; - - /* Must use ioctl VHOST_SCSI_SET_ENDPOINT */ - tv_tpg = vs-vs_tpg; - if (unlikely(!tv_tpg)) { - pr_err(%s endpoint not set\n, __func__); - return; - } + u8 target; mutex_lock(vq-mutex); vhost_disable_notify(vs-dev, vq); @@ -637,6 +636,35 @@ static void 
vhost_scsi_handle_vq(struct vhost_scsi *vs) break; } + /* Extract the tpgt */ + target = v_req.lun[1]; + + /* Target does not exit, fail the request */ + if (unlikely(target = vs-vs_num_target)) { + struct virtio_scsi_cmd_resp __user *resp; + struct virtio_scsi_cmd_resp rsp; + + memset(rsp, 0, sizeof(rsp)); + rsp.response = VIRTIO_SCSI_S_BAD_TARGET; + resp = vq-iov[out].iov_base; + ret = copy_to_user(resp, rsp, sizeof(rsp)); + if (!ret) + vhost_add_used_and_signal(vs-dev, + vs-vqs[2], head, 0); + else + pr_err(Faulted on virtio_scsi_cmd_resp\n); + + continue; + } + + tv_tpg = vs-vs_tpg[target]; + if (unlikely(!tv_tpg)) { + /* Must use ioctl VHOST_SCSI_SET_ENDPOINT */ + pr_err(endpoint not set, target = %d\n, target); + vhost_discard_vq_desc(vq, 1); + break; + } + exp_data_len = 0; for (i = 0; i data_num; i++) exp_data_len += vq-iov[data_first + i].iov_len; @@ -771,14 +799,11 @@ static int vhost_scsi_set_endpoint( } tv_tport = tv_tpg-tport; - if (!strcmp(tv_tport-tport_name, t-vhost_wwpn) - (tv_tpg-tport_tpgt == t-vhost_tpgt)) { + if (!strcmp(tv_tport-tport_name, t-vhost_wwpn)) { tv_tpg-tv_tpg_vhost_count++; - mutex_unlock(tv_tpg-tv_tpg_mutex); - mutex_unlock(tcm_vhost_mutex); mutex_lock(vs-dev.mutex); - if (vs-vs_tpg) { + if (vs-vs_tpg[tv_tpg-tport_tpgt - 1]) { mutex_unlock(vs-dev.mutex); mutex_lock(tv_tpg-tv_tpg_mutex); tv_tpg-tv_tpg_vhost_count--; @@ -786,15 +811,17 @@ static int vhost_scsi_set_endpoint( return -EEXIST; } - vs-vs_tpg = tv_tpg; + vs-vs_tpg[tv_tpg-tport_tpgt - 1] = tv_tpg; tv_tpg-tport_tpgt starts from 0, right? I thought it starts from 1, because I always got it starts from 1 in targetcli. o- vhost o- naa.6001405bd4e8476d o- tpg1 o- luns o- lun0 o- tpg2 o- luns o- lun0 o- tpg3 o- luns o- lun0 o- tpg4 o- luns o- lun0 If it is
Re: [PATCH] tcm_vhost: Multi-target support
On 02/01/2013 04:59 AM, Nicholas A. Bellinger wrote: On Thu, 2013-01-31 at 17:28 +0800, Asias He wrote: Hello Nicholas, On 01/31/2013 03:33 PM, Asias He wrote: In order to take advantages of Paolo's multi-queue virito-scsi, we need multi-target support in tcm_vhost first. Otherwise all the requests go to one queue and other queues are idle. This patch makes: 1. All the targets under the wwpn is seen and can be used by guest. 2. No need to pass the tpgt number in struct vhost_scsi_target to tcm_vhost.ko. Only wwpn is needed. 3. We can always pass max_target = 255 to guest now, since we abort the request who's target id does not exist. Signed-off-by: Asias He as...@redhat.com --- drivers/vhost/tcm_vhost.c | 115 -- drivers/vhost/tcm_vhost.h | 4 +- 2 files changed, 74 insertions(+), 45 deletions(-) diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/tcm_vhost.c index 218deb6..d50cb95 100644 --- a/drivers/vhost/tcm_vhost.c +++ b/drivers/vhost/tcm_vhost.c @@ -59,13 +59,18 @@ enum { VHOST_SCSI_VQ_IO = 2, }; +#define VHOST_SCSI_MAX_TARGET 256 + struct vhost_scsi { - struct tcm_vhost_tpg *vs_tpg; /* Protected by vhost_scsi-dev.mutex */ + /* Protected by vhost_scsi-dev.mutex */ + struct tcm_vhost_tpg *vs_tpg[VHOST_SCSI_MAX_TARGET]; struct vhost_dev dev; struct vhost_virtqueue vqs[3]; struct vhost_work vs_completion_work; /* cmd completion work item */ struct llist_head vs_completion_list; /* cmd completion queue */ + char vs_vhost_wwpn[TRANSPORT_IQN_LEN]; + int vs_num_target; }; /* Local pointer to allocated TCM configfs fabric module */ @@ -564,13 +569,7 @@ static void vhost_scsi_handle_vq(struct vhost_scsi *vs) u32 exp_data_len, data_first, data_num, data_direction; unsigned out, in, i; int head, ret; - - /* Must use ioctl VHOST_SCSI_SET_ENDPOINT */ - tv_tpg = vs-vs_tpg; - if (unlikely(!tv_tpg)) { - pr_err(%s endpoint not set\n, __func__); - return; - } + u8 target; mutex_lock(vq-mutex); vhost_disable_notify(vs-dev, vq); @@ -637,6 +636,35 @@ static void 
vhost_scsi_handle_vq(struct vhost_scsi *vs) break; } + /* Extract the tpgt */ + target = v_req.lun[1]; + + /* Target does not exit, fail the request */ + if (unlikely(target = vs-vs_num_target)) { + struct virtio_scsi_cmd_resp __user *resp; + struct virtio_scsi_cmd_resp rsp; + + memset(rsp, 0, sizeof(rsp)); + rsp.response = VIRTIO_SCSI_S_BAD_TARGET; + resp = vq-iov[out].iov_base; + ret = copy_to_user(resp, rsp, sizeof(rsp)); + if (!ret) + vhost_add_used_and_signal(vs-dev, + vs-vqs[2], head, 0); + else + pr_err(Faulted on virtio_scsi_cmd_resp\n); + + continue; + } + + tv_tpg = vs-vs_tpg[target]; + if (unlikely(!tv_tpg)) { + /* Must use ioctl VHOST_SCSI_SET_ENDPOINT */ + pr_err(endpoint not set, target = %d\n, target); + vhost_discard_vq_desc(vq, 1); + break; + } + exp_data_len = 0; for (i = 0; i data_num; i++) exp_data_len += vq-iov[data_first + i].iov_len; @@ -771,14 +799,11 @@ static int vhost_scsi_set_endpoint( } tv_tport = tv_tpg-tport; - if (!strcmp(tv_tport-tport_name, t-vhost_wwpn) - (tv_tpg-tport_tpgt == t-vhost_tpgt)) { + if (!strcmp(tv_tport-tport_name, t-vhost_wwpn)) { tv_tpg-tv_tpg_vhost_count++; - mutex_unlock(tv_tpg-tv_tpg_mutex); - mutex_unlock(tcm_vhost_mutex); mutex_lock(vs-dev.mutex); - if (vs-vs_tpg) { + if (vs-vs_tpg[tv_tpg-tport_tpgt - 1]) { mutex_unlock(vs-dev.mutex); mutex_lock(tv_tpg-tv_tpg_mutex); tv_tpg-tv_tpg_vhost_count--; @@ -786,15 +811,17 @@ static int vhost_scsi_set_endpoint( return -EEXIST; } - vs-vs_tpg = tv_tpg; + vs-vs_tpg[tv_tpg-tport_tpgt - 1] = tv_tpg; tv_tpg-tport_tpgt starts from 0, right? I thought it starts from 1, because I always got it starts from 1 in targetcli. o- vhost o- naa.6001405bd4e8476d o- tpg1 o- luns o- lun0 o- tpg2 o- luns o- lun0 o- tpg3 o- luns o- lun0 o- tpg4 o- luns o- lun0 So at least
RE: [PATCH 3/8] KVM: PPC: booke: Added debug handler
-Original Message- From: kvm-ppc-ow...@vger.kernel.org [mailto:kvm-ppc-ow...@vger.kernel.org] On Behalf Of Alexander Graf Sent: Thursday, January 31, 2013 10:38 PM To: Bhushan Bharat-R65777 Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org Subject: Re: [PATCH 3/8] KVM: PPC: booke: Added debug handler On 31.01.2013, at 17:58, Bhushan Bharat-R65777 wrote: -Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Thursday, January 31, 2013 5:47 PM To: Bhushan Bharat-R65777 Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org Subject: Re: [PATCH 3/8] KVM: PPC: booke: Added debug handler On 30.01.2013, at 12:30, Bhushan Bharat-R65777 wrote: -Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Friday, January 25, 2013 5:13 PM To: Bhushan Bharat-R65777 Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Bhushan Bharat-R65777 Subject: Re: [PATCH 3/8] KVM: PPC: booke: Added debug handler On 16.01.2013, at 09:24, Bharat Bhushan wrote: From: Bharat Bhushan bharat.bhus...@freescale.com Installed debug handler will be used for guest debug support and debug facility emulation features (patches for these features will follow this patch). 
Signed-off-by: Liu Yu yu@freescale.com [bharat.bhus...@freescale.com: Substantial changes] Signed-off-by: Bharat Bhushan bharat.bhus...@freescale.com --- arch/powerpc/include/asm/kvm_host.h |1 + arch/powerpc/kernel/asm-offsets.c |1 + arch/powerpc/kvm/booke_interrupts.S | 49 ++- -- -- 3 files changed, 44 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 8a72d59..f4ba881 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -503,6 +503,7 @@ struct kvm_vcpu_arch { u32 tlbcfg[4]; u32 mmucfg; u32 epr; + u32 crit_save; struct kvmppc_booke_debug_reg dbg_reg; #endif gpa_t paddr_accessed; diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 46f6afd..02048f3 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -562,6 +562,7 @@ int main(void) DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst)); DEFINE(VCPU_FAULT_DEAR, offsetof(struct kvm_vcpu, arch.fault_dear)); DEFINE(VCPU_FAULT_ESR, offsetof(struct kvm_vcpu, arch.fault_esr)); + DEFINE(VCPU_CRIT_SAVE, offsetof(struct kvm_vcpu, +arch.crit_save)); #endif /* CONFIG_PPC_BOOK3S */ #endif /* CONFIG_KVM */ diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S index eae8483..dd9c5d4 100644 --- a/arch/powerpc/kvm/booke_interrupts.S +++ b/arch/powerpc/kvm/booke_interrupts.S @@ -52,12 +52,7 @@ (1BOOKE_INTERRUPT_PROGRAM) | \ (1BOOKE_INTERRUPT_DTLB_MISS)) -.macro KVM_HANDLER ivor_nr scratch srr0 -_GLOBAL(kvmppc_handler_\ivor_nr) - /* Get pointer to vcpu and record exit number. 
*/ - mtspr \scratch , r4 - mfspr r4, SPRN_SPRG_THREAD - lwz r4, THREAD_KVM_VCPU(r4) +.macro __KVM_HANDLER ivor_nr scratch srr0 stw r3, VCPU_GPR(R3)(r4) stw r5, VCPU_GPR(R5)(r4) stw r6, VCPU_GPR(R6)(r4) @@ -74,6 +69,46 @@ _GLOBAL(kvmppc_handler_\ivor_nr) bctr .endm +.macro KVM_HANDLER ivor_nr scratch srr0 +_GLOBAL(kvmppc_handler_\ivor_nr) + /* Get pointer to vcpu and record exit number. */ + mtspr \scratch , r4 + mfspr r4, SPRN_SPRG_THREAD + lwz r4, THREAD_KVM_VCPU(r4) + __KVM_HANDLER \ivor_nr \scratch \srr0 .endm + +.macro KVM_DBG_HANDLER ivor_nr scratch srr0 +_GLOBAL(kvmppc_handler_\ivor_nr) + mtspr \scratch, r4 + mfspr r4, SPRN_SPRG_THREAD + lwz r4, THREAD_KVM_VCPU(r4) + stw r3, VCPU_CRIT_SAVE(r4) + mfcrr3 + mfspr r4, SPRN_CSRR1 + andi. r4, r4, MSR_PR + bne 1f + /* debug interrupt happened in enter/exit path */ + mfspr r4, SPRN_CSRR1 + rlwinm r4, r4, 0, ~MSR_DE + mtspr SPRN_CSRR1, r4 + lis r4, 0x + ori r4, r4, 0x + mtspr SPRN_DBSR, r4 + mfspr r4, SPRN_SPRG_THREAD + lwz r4, THREAD_KVM_VCPU(r4) + mtcrr3 + lwz r3, VCPU_CRIT_SAVE(r4) + mfspr r4, \scratch + rfci What is this part doing? Try to ignore the debug exit? As BOOKE doesn't have hardware support for virtualization, hardware never know current pc is in guest or in host. So when enable hardware single step for guest, it cannot be disabled at the time guest exit. Thus, we'll see that an single step interrupt happens
RE: [PATCH 7/8] KVM: PPC: booke/bookehv: Add debug stub support
-Original Message- From: Alexander Graf [mailto:ag...@suse.de] Sent: Friday, January 25, 2013 5:37 PM To: Bhushan Bharat-R65777 Cc: kvm-...@vger.kernel.org; kvm@vger.kernel.org; Bhushan Bharat-R65777 Subject: Re: [PATCH 7/8] KVM: PPC: booke/bookehv: Add debug stub support On 16.01.2013, at 09:24, Bharat Bhushan wrote: This patch adds the debug stub support on booke/bookehv. Now QEMU debug stub can use hw breakpoint, watchpoint and software breakpoint to debug guest. Signed-off-by: Bharat Bhushan bharat.bhus...@freescale.com --- arch/powerpc/include/asm/kvm_host.h |5 + arch/powerpc/include/asm/kvm_ppc.h|2 + arch/powerpc/include/uapi/asm/kvm.h | 22 - arch/powerpc/kernel/asm-offsets.c | 26 ++ arch/powerpc/kvm/booke.c | 124 + arch/powerpc/kvm/booke_interrupts.S | 114 ++ arch/powerpc/kvm/bookehv_interrupts.S | 145 - arch/powerpc/kvm/e500_emulate.c |6 ++ arch/powerpc/kvm/e500mc.c |3 +- 9 files changed, 422 insertions(+), 25 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index f4ba881..a9feeb0 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -504,7 +504,12 @@ struct kvm_vcpu_arch { u32 mmucfg; u32 epr; u32 crit_save; + /* guest debug registers*/ struct kvmppc_booke_debug_reg dbg_reg; + /* shadow debug registers */ + struct kvmppc_booke_debug_reg shadow_dbg_reg; + /* host debug registers*/ + struct kvmppc_booke_debug_reg host_dbg_reg; #endif gpa_t paddr_accessed; gva_t vaddr_accessed; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index b3c481e..e4b3398 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -45,6 +45,8 @@ enum emulation_result { EMULATE_FAIL, /* can't emulate this instruction */ EMULATE_AGAIN,/* something went wrong. 
go again */ EMULATE_DO_PAPR, /* kvm_run filled with PAPR request */ + EMULATE_DEBUG_INST, /* debug instruction for software +breakpoint, exit to userspace */ Does this do something different from DO_PAPR? Maybe it makes sense to have an exit code EMULATE_EXIT_USER? I think EMULATE_DO_PAPR does something similar but the name is confusing. May be we can rename EMULATE_DO_PAPR to EMULATE_EXIT_USER. Thanks -Bharat }; extern int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu); diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index e8842ed..a81ab29 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -25,6 +25,7 @@ /* Select powerpc specific features in linux/kvm.h */ #define __KVM_HAVE_SPAPR_TCE #define __KVM_HAVE_PPC_SMT +#define __KVM_HAVE_GUEST_DEBUG struct kvm_regs { __u64 pc; @@ -267,7 +268,24 @@ struct kvm_fpu { __u64 fpr[32]; }; +/* + * Defines for h/w breakpoint, watchpoint (read, write or both) and + * software breakpoint. + * These are used as type in KVM_SET_GUEST_DEBUG ioctl and status + * for KVM_DEBUG_EXIT. + */ +#define KVMPPC_DEBUG_NONE 0x0 +#define KVMPPC_DEBUG_BREAKPOINT(1UL 1) +#define KVMPPC_DEBUG_WATCH_WRITE (1UL 2) +#define KVMPPC_DEBUG_WATCH_READ(1UL 3) struct kvm_debug_exit_arch { + __u64 address; + /* +* exiting to userspace because of h/w breakpoint, watchpoint +* (read, write or both) and software breakpoint. +*/ + __u32 status; + __u32 reserved; }; /* for KVM_SET_GUEST_DEBUG */ @@ -279,10 +297,6 @@ struct kvm_guest_debug_arch { * Type denotes h/w breakpoint, read watchpoint, write * watchpoint or watchpoint (both read and write). 
*/ -#define KVMPPC_DEBUG_NOTYPE0x0 -#define KVMPPC_DEBUG_BREAKPOINT(1UL 1) -#define KVMPPC_DEBUG_WATCH_WRITE (1UL 2) -#define KVMPPC_DEBUG_WATCH_READ(1UL 3) __u32 type; __u32 reserved; } bp[16]; diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 02048f3..22deda7 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -563,6 +563,32 @@ int main(void) DEFINE(VCPU_FAULT_DEAR, offsetof(struct kvm_vcpu, arch.fault_dear)); DEFINE(VCPU_FAULT_ESR, offsetof(struct kvm_vcpu, arch.fault_esr)); DEFINE(VCPU_CRIT_SAVE, offsetof(struct kvm_vcpu, arch.crit_save)); + DEFINE(VCPU_DBSR, offsetof(struct kvm_vcpu, arch.dbsr)); + DEFINE(VCPU_SHADOW_DBG, offsetof(struct kvm_vcpu, arch.shadow_dbg_reg)); + DEFINE(VCPU_HOST_DBG,
Re: [PATCH] tcm_vhost: Multi-target support
On Fri, 2013-02-01 at 12:03 +0800, Asias He wrote: On 02/01/2013 04:59 AM, Nicholas A. Bellinger wrote: On Thu, 2013-01-31 at 17:28 +0800, Asias He wrote: Hello Nicholas, On 01/31/2013 03:33 PM, Asias He wrote: In order to take advantages of Paolo's multi-queue virito-scsi, we need multi-target support in tcm_vhost first. Otherwise all the requests go to one queue and other queues are idle. SNIP @@ -771,14 +799,11 @@ static int vhost_scsi_set_endpoint( } tv_tport = tv_tpg-tport; - if (!strcmp(tv_tport-tport_name, t-vhost_wwpn) - (tv_tpg-tport_tpgt == t-vhost_tpgt)) { + if (!strcmp(tv_tport-tport_name, t-vhost_wwpn)) { tv_tpg-tv_tpg_vhost_count++; - mutex_unlock(tv_tpg-tv_tpg_mutex); - mutex_unlock(tcm_vhost_mutex); mutex_lock(vs-dev.mutex); - if (vs-vs_tpg) { + if (vs-vs_tpg[tv_tpg-tport_tpgt - 1]) { mutex_unlock(vs-dev.mutex); mutex_lock(tv_tpg-tv_tpg_mutex); tv_tpg-tv_tpg_vhost_count--; @@ -786,15 +811,17 @@ static int vhost_scsi_set_endpoint( return -EEXIST; } - vs-vs_tpg = tv_tpg; + vs-vs_tpg[tv_tpg-tport_tpgt - 1] = tv_tpg; tv_tpg-tport_tpgt starts from 0, right? I thought it starts from 1, because I always got it starts from 1 in targetcli. o- vhost o- naa.6001405bd4e8476d o- tpg1 o- luns o- lun0 o- tpg2 o- luns o- lun0 o- tpg3 o- luns o- lun0 o- tpg4 o- luns o- lun0 So at least with iscsi-target, we start from tpgt=1 to avoid some legacy initiators that have issues handling tgpt=0. Given that rtslib/targetcli currently expect this with the tpgs feature is enabled, starting from tpgt=1 with tcm_vhost probably makes the most sense. Okay. But tgpt can be 0, right? Most certainly, in the end it's totally up to the fabric. 
;) I saw this setup: cd /sys/kernel/config/target mkdir -p core/fileio_0/fileio echo 'fd_dev_name=/home/pbonzini/test.img,fd_dev_size=5905580032' core/fileio_0/fileio/control echo 1 core/fileio_0/fileio/enable mkdir -p vhost/naa.600140554cf3a18e/tpgt_0/lun/lun_0 cd vhost/naa.600140554cf3a18e/tpgt_0 ln -sf ../../../../../core/fileio_0/fileio/ lun/lun_0/virtual_scsi_port echo naa.60014053226f0388 nexus And this: ** Setup wwpn and tpgt $ wwpn=naa.0 $ tpgt=/sys/kernel/config/target/vhost/$wwpn/tpgt_0 $ nexus=$tpgt/nexus $ mkdir -p $tpgt $ echo -n $wwpn $nexus OK, I think you'll want to avoid the extra vs-vs_tpg[tpgt - 1] offset above to properly support this. --nab If it is true. I will cook v2 of this patch. Also, the tv_tpg-tport_tpgt can be none-continuous. e.g. o- vhost o- naa.6001405bd4e8476d o- tpg1 o- luns o- lun0 o- tpg2 o- luns o- lun0 o- tpg4 o- luns o- lun0 I will handle this in v2. Correct, tpgt values may be optionally non-contiguous up to unsigned short. ok. --nab -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V4 RESEND 00/22] Multiqueue virtio-net
Hello all: This series is an update of the last version of multiqueue virtio-net support. This series tries to bring multiqueue support to virtio-net through a multiqueue-capable tap backend and multiple vhost threads. Patch 1 converts the bitfields in TAPState to bool. Patch 2 replaces assert(0) with abort() in tap. To support this, multiqueue nic support was added to qemu. This is done by introducing an array of NetClientStates in NICState, and making each pair of peers a queue of the nic. This is done in patches 3-9. Tap was also converted to be able to create a multiple queue backend. Currently, only linux supports this, by issuing TUNSETIFF N times with the same device name to create N queues. Each fd returned by TUNSETIFF is a queue supported by the kernel. Three new command line options were introduced: queues is used to tell how many queues will be created by qemu; fds is used to pass multiple pre-created tap file descriptors to qemu; vhostfds is used to pass multiple pre-created vhost descriptors to qemu. This is done in patches 10-15. A method of deleting a queue and queue_index were also introduced for virtio; this is done in patches 16-17. Vhost was also changed to support multiqueue by introducing a start vq index which tracks the first virtqueue that will be used by vhost, instead of assuming that vhost always uses virtqueues from index 0. This is done in patch 18. The last part is the multiqueue userspace changes; this is done in patches 19-22. 
With this changes, user could start a multiqueue virtio-net device through ./qemu -netdev tap,id=hn0,queues=2,vhost=on -device virtio-net-pci,netdev=hn0 Management tools such as libvirt can pass multiple pre-created fds/vhostfds through ./qemu -netdev tap,id=hn0,fds=X:Y,vhostfds=M:N -device virtio-net-pci,netdev=hn0 For the one who wants to try, a git tree is available at: git://github.com/jasowang/qemu.git Changes from V4: - fix the conflict with Michael's tree and rebase to the latest (Michael) Changes from V3: - convert bitfield to bool in TAPState (Blue) - use abort() instead of assert(0) in tap code (Blue) - rebase to the latest - fix a bug that breaks the non-tap network Changes from V2: - Don't start/stop vhost threads when changing queues and simplify the interface between virtio-net and vhost further. Changes from V1: - silent checkpatch (Blue) - use fds/vhostfds instead of fd/vhostfd (Stefan) - use fds=X:Y:Z instead of fd=X,fd=Y,fd=Z (Anthony) - split patches (Stefan) - typos in commit log (Stefan) - Warn 'queues=' when fds/vhostfds is used (Stefan) - rename __net_init_tap to net_init_tap_one (Stefan) - check the consistency of vnet_hdr of multiple tap fds (Stefan) - disable multiqueue support for bridge-helper (Stefan) - rename tap_attach()/tap_detach() to tap_enable()/tap_disable() (Stefan) - fix booting with legacy guest (WanLong) - don't bump the version when doing migration (Michael) - simplify the interface between virtio-net and multiqueue vhost_net (Michael) - rebase the patches to latest - re-order the patches that let the net part comes first to simplify the reviewing - simplify the interface between virtio-net and multiqueue vhost_net - move the guest notifiers setup from vhost to vhost_net - fix a build issue of hw/mcf_fce.c Changes from RFC v2: - rebase the codes to latest qemu - align the multiqueue virtio-net implementation to virtio spec - split the patches into more smaller patches - set_link and hotplug support Changes from RFC V1: - 
rebase to the latest - fix memory leak in parse_netdev - fix guest notifiers assignment/de-assignment - changes the command lines to: qemu -netdev tap,queues=2 -device virtio-net-pci,queues=2 Reference: V1: http://lists.nongnu.org/archive/html/qemu-devel/2012-12/msg03558.html RFC v2: http://lists.gnu.org/archive/html/qemu-devel/2012-06/msg04108.html RFC v1: http://comments.gmane.org/gmane.comp.emulators.qemu/100481 Perf Numbers: - norm is short for normalize result - trans.rate is short for transaction rate Two Intel Xeon 5620 with direct connected intel 82599EB Host/Guest kernel: David net tree vhost enabled - lots of improvents of both latency and cpu utilization in request-reponse test - get regression of guest sending small packets which because TCP tends to batch less when the latency were improved 1q/2q/4q TCP_RR size #sessions trans.rate norm trans.rate norm trans.rate norm 1 1 9393.26 595.64 9408.18 597.34 9375.19 584.12 1 2072162.1 2214.24 129880.22 2456.13 196949.81 2298.13 1 50107513.38 2653.99 139721.93 2490.58 259713.82 2873.57 1 100 126734.63 2676.54 145553.5 2406.63 265252.68 2943 64 19453.42 632.33 9371.37 616.13 9338.19 615.97 64 20 70620.03 2093.68 125155.75 2409.15 191239.91 2253.32 64 50 1069662448.29 146518.67 2514.47 242134.07 2720.91 64 100 117046.35 2394.56 190153.09 2696.82 238881.29 2704.41 256 1 8733.29 736.36 8701.07 680.83 8608.92 530.1 256 20 69279.89 2274.45 115103.07
[PATCH V4 RESEND 01/22] net: tap: using bool instead of bitfield
Signed-off-by: Jason Wang jasow...@redhat.com Signed-off-by: Michael S. Tsirkin m...@redhat.com --- hw/virtio-net.c |2 +- include/net/tap.h |4 ++-- net/tap-win32.c |6 +++--- net/tap.c | 38 ++ 4 files changed, 24 insertions(+), 26 deletions(-) diff --git a/hw/virtio-net.c b/hw/virtio-net.c index dfb9687..b5579b4 100644 --- a/hw/virtio-net.c +++ b/hw/virtio-net.c @@ -1102,7 +1102,7 @@ VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf, n-nic = qemu_new_nic(net_virtio_info, conf, object_get_typename(OBJECT(dev)), dev-id, n); peer_test_vnet_hdr(n); if (peer_has_vnet_hdr(n)) { -tap_using_vnet_hdr(n-nic-nc.peer, 1); +tap_using_vnet_hdr(n-nic-nc.peer, true); n-host_hdr_len = sizeof(struct virtio_net_hdr); } else { n-host_hdr_len = 0; diff --git a/include/net/tap.h b/include/net/tap.h index bb7efb5..883cebf 100644 --- a/include/net/tap.h +++ b/include/net/tap.h @@ -29,10 +29,10 @@ #include qemu-common.h #include qapi-types.h -int tap_has_ufo(NetClientState *nc); +bool tap_has_ufo(NetClientState *nc); int tap_has_vnet_hdr(NetClientState *nc); int tap_has_vnet_hdr_len(NetClientState *nc, int len); -void tap_using_vnet_hdr(NetClientState *nc, int using_vnet_hdr); +void tap_using_vnet_hdr(NetClientState *nc, bool using_vnet_hdr); void tap_set_offload(NetClientState *nc, int csum, int tso4, int tso6, int ecn, int ufo); void tap_set_vnet_hdr_len(NetClientState *nc, int len); diff --git a/net/tap-win32.c b/net/tap-win32.c index 265369c..3052bba 100644 --- a/net/tap-win32.c +++ b/net/tap-win32.c @@ -722,9 +722,9 @@ int net_init_tap(const NetClientOptions *opts, const char *name, return 0; } -int tap_has_ufo(NetClientState *nc) +bool tap_has_ufo(NetClientState *nc) { -return 0; +return false; } int tap_has_vnet_hdr(NetClientState *nc) @@ -741,7 +741,7 @@ void tap_fd_set_vnet_hdr_len(int fd, int len) { } -void tap_using_vnet_hdr(NetClientState *nc, int using_vnet_hdr) +void tap_using_vnet_hdr(NetClientState *nc, bool using_vnet_hdr) { } diff --git a/net/tap.c b/net/tap.c 
index eb40c42..5542c98 100644 --- a/net/tap.c +++ b/net/tap.c @@ -55,10 +55,10 @@ typedef struct TAPState { char down_script[1024]; char down_script_arg[128]; uint8_t buf[TAP_BUFSIZE]; -unsigned int read_poll : 1; -unsigned int write_poll : 1; -unsigned int using_vnet_hdr : 1; -unsigned int has_ufo: 1; +bool read_poll; +bool write_poll; +bool using_vnet_hdr; +bool has_ufo; VHostNetState *vhost_net; unsigned host_vnet_hdr_len; } TAPState; @@ -78,15 +78,15 @@ static void tap_update_fd_handler(TAPState *s) s); } -static void tap_read_poll(TAPState *s, int enable) +static void tap_read_poll(TAPState *s, bool enable) { -s-read_poll = !!enable; +s-read_poll = enable; tap_update_fd_handler(s); } -static void tap_write_poll(TAPState *s, int enable) +static void tap_write_poll(TAPState *s, bool enable) { -s-write_poll = !!enable; +s-write_poll = enable; tap_update_fd_handler(s); } @@ -94,7 +94,7 @@ static void tap_writable(void *opaque) { TAPState *s = opaque; -tap_write_poll(s, 0); +tap_write_poll(s, false); qemu_flush_queued_packets(s-nc); } @@ -108,7 +108,7 @@ static ssize_t tap_write_packet(TAPState *s, const struct iovec *iov, int iovcnt } while (len == -1 errno == EINTR); if (len == -1 errno == EAGAIN) { -tap_write_poll(s, 1); +tap_write_poll(s, true); return 0; } @@ -186,7 +186,7 @@ ssize_t tap_read_packet(int tapfd, uint8_t *buf, int maxlen) static void tap_send_completed(NetClientState *nc, ssize_t len) { TAPState *s = DO_UPCAST(TAPState, nc, nc); -tap_read_poll(s, 1); +tap_read_poll(s, true); } static void tap_send(void *opaque) @@ -209,12 +209,12 @@ static void tap_send(void *opaque) size = qemu_send_packet_async(s-nc, buf, size, tap_send_completed); if (size == 0) { -tap_read_poll(s, 0); +tap_read_poll(s, false); } } while (size 0 qemu_can_send_packet(s-nc)); } -int tap_has_ufo(NetClientState *nc) +bool tap_has_ufo(NetClientState *nc) { TAPState *s = DO_UPCAST(TAPState, nc, nc); @@ -253,12 +253,10 @@ void tap_set_vnet_hdr_len(NetClientState *nc, int len) 
s-host_vnet_hdr_len = len; } -void tap_using_vnet_hdr(NetClientState *nc, int using_vnet_hdr) +void tap_using_vnet_hdr(NetClientState *nc, bool using_vnet_hdr) { TAPState *s = DO_UPCAST(TAPState, nc, nc); -using_vnet_hdr = using_vnet_hdr != 0; - assert(nc-info-type == NET_CLIENT_OPTIONS_KIND_TAP); assert(!!s-host_vnet_hdr_len == using_vnet_hdr); @@ -290,8 +288,8 @@ static void tap_cleanup(NetClientState *nc) if (s-down_script[0]) launch_script(s-down_script, s-down_script_arg, s-fd); -tap_read_poll(s,
[PATCH V4 RESEND 02/22] net: tap: use abort() instead of assert(0)
Signed-off-by: Jason Wang jasow...@redhat.com Signed-off-by: Michael S. Tsirkin m...@redhat.com --- net/tap-linux.c |4 ++-- net/tap-win32.c |2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/net/tap-linux.c b/net/tap-linux.c index 059f5f3..0a6acc7 100644 --- a/net/tap-linux.c +++ b/net/tap-linux.c @@ -164,7 +164,7 @@ int tap_probe_vnet_hdr_len(int fd, int len) if (ioctl(fd, TUNSETVNETHDRSZ, orig) == -1) { fprintf(stderr, TUNGETVNETHDRSZ ioctl() failed: %s. Exiting.\n, strerror(errno)); -assert(0); +abort(); return -errno; } return 1; @@ -175,7 +175,7 @@ void tap_fd_set_vnet_hdr_len(int fd, int len) if (ioctl(fd, TUNSETVNETHDRSZ, len) == -1) { fprintf(stderr, TUNSETVNETHDRSZ ioctl() failed: %s. Exiting.\n, strerror(errno)); -assert(0); +abort(); } } diff --git a/net/tap-win32.c b/net/tap-win32.c index 3052bba..601437e 100644 --- a/net/tap-win32.c +++ b/net/tap-win32.c @@ -762,5 +762,5 @@ int tap_has_vnet_hdr_len(NetClientState *nc, int len) void tap_set_vnet_hdr_len(NetClientState *nc, int len) { -assert(0); +abort(); } -- 1.7.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V4 RESEND 03/22] net: introduce qemu_get_queue()
To support multiqueue, the patch introduce a helper qemu_get_queue() which is used to get the NetClientState of a device. The following patches would refactor this helper to support multiqueue. Signed-off-by: Jason Wang jasow...@redhat.com Signed-off-by: Michael S. Tsirkin m...@redhat.com --- hw/cadence_gem.c|9 +++-- hw/dp8393x.c|9 +++-- hw/e1000.c | 24 -- hw/eepro100.c | 12 +++--- hw/etraxfs_eth.c|5 ++- hw/lan9118.c| 10 +++--- hw/mcf_fec.c|4 +- hw/milkymist-minimac2.c |4 +- hw/mipsnet.c|4 +- hw/musicpal.c |2 +- hw/ne2000-isa.c |2 +- hw/ne2000.c |7 ++-- hw/opencores_eth.c |6 ++-- hw/pcnet-pci.c |2 +- hw/pcnet.c |7 ++-- hw/rtl8139.c| 14 hw/smc91c111.c |4 +- hw/spapr_llan.c |4 +- hw/stellaris_enet.c |5 ++- hw/usb/dev-network.c| 10 +++--- hw/virtio-net.c | 78 ++- hw/xen_nic.c| 13 +--- hw/xgmac.c |4 +- hw/xilinx_axienet.c |4 +- hw/xilinx_ethlite.c |6 ++-- include/net/net.h |1 + net/net.c |5 +++ savevm.c|2 +- 28 files changed, 141 insertions(+), 116 deletions(-) diff --git a/hw/cadence_gem.c b/hw/cadence_gem.c index b77423d..b8071a4 100644 --- a/hw/cadence_gem.c +++ b/hw/cadence_gem.c @@ -389,10 +389,10 @@ static void gem_init_register_masks(GemState *s) */ static void phy_update_link(GemState *s) { -DB_PRINT(down %d\n, s-nic-nc.link_down); +DB_PRINT(down %d\n, qemu_get_queue(s-nic)-link_down); /* Autonegotiation status mirrors link status. 
*/ -if (s-nic-nc.link_down) { +if (qemu_get_queue(s-nic)-link_down) { s-phy_regs[PHY_REG_STATUS] = ~(PHY_REG_STATUS_ANEGCMPL | PHY_REG_STATUS_LINK); s-phy_regs[PHY_REG_INT_ST] |= PHY_REG_INT_ST_LINKC; @@ -908,9 +908,10 @@ static void gem_transmit(GemState *s) /* Send the packet somewhere */ if (s-phy_loop) { -gem_receive(s-nic-nc, tx_packet, total_bytes); +gem_receive(qemu_get_queue(s-nic), tx_packet, total_bytes); } else { -qemu_send_packet(s-nic-nc, tx_packet, total_bytes); +qemu_send_packet(qemu_get_queue(s-nic), tx_packet, + total_bytes); } /* Prepare for next packet */ diff --git a/hw/dp8393x.c b/hw/dp8393x.c index b501450..c2d0bc8 100644 --- a/hw/dp8393x.c +++ b/hw/dp8393x.c @@ -339,6 +339,7 @@ static void do_receiver_disable(dp8393xState *s) static void do_transmit_packets(dp8393xState *s) { +NetClientState *nc = qemu_get_queue(s-nic); uint16_t data[12]; int width, size; int tx_len, len; @@ -408,13 +409,13 @@ static void do_transmit_packets(dp8393xState *s) if (s-regs[SONIC_RCR] (SONIC_RCR_LB1 | SONIC_RCR_LB0)) { /* Loopback */ s-regs[SONIC_TCR] |= SONIC_TCR_CRSL; -if (s-nic-nc.info-can_receive(s-nic-nc)) { +if (nc-info-can_receive(nc)) { s-loopback_packet = 1; -s-nic-nc.info-receive(s-nic-nc, s-tx_buffer, tx_len); +nc-info-receive(nc, s-tx_buffer, tx_len); } } else { /* Transmit packet */ -qemu_send_packet(s-nic-nc, s-tx_buffer, tx_len); +qemu_send_packet(nc, s-tx_buffer, tx_len); } s-regs[SONIC_TCR] |= SONIC_TCR_PTX; @@ -903,7 +904,7 @@ void dp83932_init(NICInfo *nd, hwaddr base, int it_shift, s-nic = qemu_new_nic(net_dp83932_info, s-conf, nd-model, nd-name, s); -qemu_format_nic_info_str(s-nic-nc, s-conf.macaddr.a); +qemu_format_nic_info_str(qemu_get_queue(s-nic), s-conf.macaddr.a); qemu_register_reset(nic_reset, s); nic_reset(s); diff --git a/hw/e1000.c b/hw/e1000.c index ee85c53..3622392 100644 --- a/hw/e1000.c +++ b/hw/e1000.c @@ -167,11 +167,11 @@ set_phy_ctrl(E1000State *s, int index, uint16_t val) { if ((val MII_CR_AUTO_NEG_EN) (val 
MII_CR_RESTART_AUTO_NEG)) { /* no need auto-negotiation if link was down */ -if (s-nic-nc.link_down) { +if (qemu_get_queue(s-nic)-link_down) { s-phy_reg[PHY_STATUS] |= MII_SR_AUTONEG_COMPLETE; return; } -s-nic-nc.link_down = true; +qemu_get_queue(s-nic)-link_down = true; e1000_link_down(s); s-phy_reg[PHY_STATUS] = ~MII_SR_AUTONEG_COMPLETE; DBGOUT(PHY, Start link auto negotiation\n); @@ -183,7 +183,7 @@ static void e1000_autoneg_timer(void *opaque) { E1000State *s = opaque; -s-nic-nc.link_down = false; +qemu_get_queue(s-nic)-link_down = false; e1000_link_up(s);
[PATCH V4 RESEND 04/22] net: introduce qemu_get_nic()
To support multiqueue, this patch introduces a helper qemu_get_nic() to get NICState from a NetClientState. The following patches would refactor this helper to support multiqueue. Signed-off-by: Jason Wang jasow...@redhat.com Signed-off-by: Michael S. Tsirkin m...@redhat.com --- hw/cadence_gem.c|8 hw/dp8393x.c|6 +++--- hw/e1000.c |8 hw/eepro100.c |6 +++--- hw/etraxfs_eth.c|6 +++--- hw/lan9118.c|6 +++--- hw/lance.c |2 +- hw/mcf_fec.c|6 +++--- hw/milkymist-minimac2.c |6 +++--- hw/mipsnet.c|6 +++--- hw/musicpal.c |4 ++-- hw/ne2000-isa.c |2 +- hw/ne2000.c |6 +++--- hw/opencores_eth.c |6 +++--- hw/pcnet-pci.c |2 +- hw/pcnet.c |6 +++--- hw/rtl8139.c|8 hw/smc91c111.c |6 +++--- hw/spapr_llan.c |4 ++-- hw/stellaris_enet.c |6 +++--- hw/usb/dev-network.c|6 +++--- hw/virtio-net.c | 10 +- hw/xen_nic.c|4 ++-- hw/xgmac.c |6 +++--- hw/xilinx_axienet.c |6 +++--- hw/xilinx_ethlite.c |6 +++--- include/net/net.h |2 ++ net/net.c | 20 28 files changed, 92 insertions(+), 78 deletions(-) diff --git a/hw/cadence_gem.c b/hw/cadence_gem.c index b8071a4..ab86c17 100644 --- a/hw/cadence_gem.c +++ b/hw/cadence_gem.c @@ -409,7 +409,7 @@ static int gem_can_receive(NetClientState *nc) { GemState *s; -s = DO_UPCAST(NICState, nc, nc)-opaque; +s = qemu_get_nic_opaque(nc); DB_PRINT(\n); @@ -612,7 +612,7 @@ static ssize_t gem_receive(NetClientState *nc, const uint8_t *buf, size_t size) uint8_trxbuf[2048]; uint8_t *rxbuf_ptr; -s = DO_UPCAST(NICState, nc, nc)-opaque; +s = qemu_get_nic_opaque(nc); /* Do nothing if receive is not enabled. 
*/ if (!(s-regs[GEM_NWCTRL] GEM_NWCTRL_RXENA)) { @@ -1152,7 +1152,7 @@ static const MemoryRegionOps gem_ops = { static void gem_cleanup(NetClientState *nc) { -GemState *s = DO_UPCAST(NICState, nc, nc)-opaque; +GemState *s = qemu_get_nic_opaque(nc); DB_PRINT(\n); s-nic = NULL; @@ -1161,7 +1161,7 @@ static void gem_cleanup(NetClientState *nc) static void gem_set_link(NetClientState *nc) { DB_PRINT(\n); -phy_update_link(DO_UPCAST(NICState, nc, nc)-opaque); +phy_update_link(qemu_get_nic_opaque(nc)); } static NetClientInfo net_gem_info = { diff --git a/hw/dp8393x.c b/hw/dp8393x.c index c2d0bc8..0273fad 100644 --- a/hw/dp8393x.c +++ b/hw/dp8393x.c @@ -676,7 +676,7 @@ static const MemoryRegionOps dp8393x_ops = { static int nic_can_receive(NetClientState *nc) { -dp8393xState *s = DO_UPCAST(NICState, nc, nc)-opaque; +dp8393xState *s = qemu_get_nic_opaque(nc); if (!(s-regs[SONIC_CR] SONIC_CR_RXEN)) return 0; @@ -725,7 +725,7 @@ static int receive_filter(dp8393xState *s, const uint8_t * buf, int size) static ssize_t nic_receive(NetClientState *nc, const uint8_t * buf, size_t size) { -dp8393xState *s = DO_UPCAST(NICState, nc, nc)-opaque; +dp8393xState *s = qemu_get_nic_opaque(nc); uint16_t data[10]; int packet_type; uint32_t available, address; @@ -861,7 +861,7 @@ static void nic_reset(void *opaque) static void nic_cleanup(NetClientState *nc) { -dp8393xState *s = DO_UPCAST(NICState, nc, nc)-opaque; +dp8393xState *s = qemu_get_nic_opaque(nc); memory_region_del_subregion(s-address_space, s-mmio); memory_region_destroy(s-mmio); diff --git a/hw/e1000.c b/hw/e1000.c index 3622392..df6c693 100644 --- a/hw/e1000.c +++ b/hw/e1000.c @@ -753,7 +753,7 @@ receive_filter(E1000State *s, const uint8_t *buf, int size) static void e1000_set_link_status(NetClientState *nc) { -E1000State *s = DO_UPCAST(NICState, nc, nc)-opaque; +E1000State *s = qemu_get_nic_opaque(nc); uint32_t old_status = s-mac_reg[STATUS]; if (nc-link_down) { @@ -787,7 +787,7 @@ static bool e1000_has_rxbufs(E1000State *s, 
size_t total_size) static int e1000_can_receive(NetClientState *nc) { -E1000State *s = DO_UPCAST(NICState, nc, nc)-opaque; +E1000State *s = qemu_get_nic_opaque(nc); return (s-mac_reg[RCTL] E1000_RCTL_EN) e1000_has_rxbufs(s, 1); } @@ -803,7 +803,7 @@ static uint64_t rx_desc_base(E1000State *s) static ssize_t e1000_receive(NetClientState *nc, const uint8_t *buf, size_t size) { -E1000State *s = DO_UPCAST(NICState, nc, nc)-opaque; +E1000State *s = qemu_get_nic_opaque(nc); struct e1000_rx_desc desc; dma_addr_t base; unsigned int n, rdt; @@ -1240,7 +1240,7 @@ e1000_mmio_setup(E1000State *d) static void e1000_cleanup(NetClientState *nc) { -E1000State *s = DO_UPCAST(NICState, nc, nc)-opaque; +E1000State *s = qemu_get_nic_opaque(nc); s-nic = NULL;
[PATCH V4 RESEND 05/22] net: introduce qemu_del_nic()
To support multiqueue nic, this patch separates the nic destructor from qemu_del_net_client() into a new helper qemu_del_nic(), since the mapping between NICState and NetClientState is not 1:1 in multiqueue. The following patches would refactor this function to support multiqueue nic. Signed-off-by: Jason Wang jasow...@redhat.com Signed-off-by: Michael S. Tsirkin m...@redhat.com --- hw/e1000.c |2 +- hw/eepro100.c|2 +- hw/ne2000.c |2 +- hw/pcnet-pci.c |2 +- hw/rtl8139.c |2 +- hw/usb/dev-network.c |2 +- hw/virtio-net.c |2 +- hw/xen_nic.c |2 +- include/net/net.h|1 + net/net.c| 15 ++- 10 files changed, 23 insertions(+), 9 deletions(-) diff --git a/hw/e1000.c b/hw/e1000.c index df6c693..7dd0455 100644 --- a/hw/e1000.c +++ b/hw/e1000.c @@ -1254,7 +1254,7 @@ pci_e1000_uninit(PCIDevice *dev) qemu_free_timer(d-autoneg_timer); memory_region_destroy(d-mmio); memory_region_destroy(d-io); -qemu_del_net_client(qemu_get_queue(d-nic)); +qemu_del_nic(d-nic); } static NetClientInfo net_e1000_info = { diff --git a/hw/eepro100.c b/hw/eepro100.c index f9856ae..5d23796 100644 --- a/hw/eepro100.c +++ b/hw/eepro100.c @@ -1849,7 +1849,7 @@ static void pci_nic_uninit(PCIDevice *pci_dev) memory_region_destroy(s-flash_bar); vmstate_unregister(pci_dev-qdev, s-vmstate, s); eeprom93xx_free(pci_dev-qdev, s-eeprom); -qemu_del_net_client(qemu_get_queue(s-nic)); +qemu_del_nic(s-nic); } static NetClientInfo net_eepro100_info = { diff --git a/hw/ne2000.c b/hw/ne2000.c index c989190..3dd1c84 100644 --- a/hw/ne2000.c +++ b/hw/ne2000.c @@ -751,7 +751,7 @@ static void pci_ne2000_exit(PCIDevice *pci_dev) NE2000State *s = d-ne2000; memory_region_destroy(s-io); -qemu_del_net_client(qemu_get_queue(s-nic)); +qemu_del_nic(s-nic); } static Property ne2000_properties[] = { diff --git a/hw/pcnet-pci.c b/hw/pcnet-pci.c index 26c90bf..df63b22 100644 --- a/hw/pcnet-pci.c +++ b/hw/pcnet-pci.c @@ -279,7 +279,7 @@ static void pci_pcnet_uninit(PCIDevice *dev) memory_region_destroy(d-io_bar); 
qemu_del_timer(d-state.poll_timer); qemu_free_timer(d-state.poll_timer); -qemu_del_net_client(qemu_get_queue(d-state.nic)); +qemu_del_nic(d-state.nic); } static NetClientInfo net_pci_pcnet_info = { diff --git a/hw/rtl8139.c b/hw/rtl8139.c index b825e83..d7716be 100644 --- a/hw/rtl8139.c +++ b/hw/rtl8139.c @@ -3446,7 +3446,7 @@ static void pci_rtl8139_uninit(PCIDevice *dev) } qemu_del_timer(s-timer); qemu_free_timer(s-timer); -qemu_del_net_client(qemu_get_queue(s-nic)); +qemu_del_nic(s-nic); } static void rtl8139_set_link_status(NetClientState *nc) diff --git a/hw/usb/dev-network.c b/hw/usb/dev-network.c index abc6eac..a01a5e7 100644 --- a/hw/usb/dev-network.c +++ b/hw/usb/dev-network.c @@ -1330,7 +1330,7 @@ static void usb_net_handle_destroy(USBDevice *dev) /* TODO: remove the nd_table[] entry */ rndis_clear_responsequeue(s); -qemu_del_net_client(qemu_get_queue(s-nic)); +qemu_del_nic(s-nic); } static NetClientInfo net_usbnet_info = { diff --git a/hw/virtio-net.c b/hw/virtio-net.c index e69313b..a967006 100644 --- a/hw/virtio-net.c +++ b/hw/virtio-net.c @@ -1157,6 +1157,6 @@ void virtio_net_exit(VirtIODevice *vdev) qemu_bh_delete(n-tx_bh); } -qemu_del_net_client(qemu_get_queue(n-nic)); +qemu_del_nic(n-nic); virtio_cleanup(n-vdev); } diff --git a/hw/xen_nic.c b/hw/xen_nic.c index 55b7960..4be077d 100644 --- a/hw/xen_nic.c +++ b/hw/xen_nic.c @@ -408,7 +408,7 @@ static void net_disconnect(struct XenDevice *xendev) netdev-rxs = NULL; } if (netdev-nic) { -qemu_del_net_client(qemu_get_queue(netdev-nic)); +qemu_del_nic(netdev-nic); netdev-nic = NULL; } } diff --git a/include/net/net.h b/include/net/net.h index 96e05c4..f0d1aa2 100644 --- a/include/net/net.h +++ b/include/net/net.h @@ -77,6 +77,7 @@ NICState *qemu_new_nic(NetClientInfo *info, const char *model, const char *name, void *opaque); +void qemu_del_nic(NICState *nic); NetClientState *qemu_get_queue(NICState *nic); NICState *qemu_get_nic(NetClientState *nc); void *qemu_get_nic_opaque(NetClientState *nc); diff --git 
a/net/net.c b/net/net.c index 606e860..47d56e3 100644 --- a/net/net.c +++ b/net/net.c @@ -291,6 +291,15 @@ void qemu_del_net_client(NetClientState *nc) return; } +assert(nc-info-type != NET_CLIENT_OPTIONS_KIND_NIC); + +qemu_cleanup_net_client(nc); +qemu_free_net_client(nc); +} + +void qemu_del_nic(NICState *nic) +{ +NetClientState *nc = qemu_get_queue(nic); /* If this is a peer NIC and peer has already been deleted, free it now. */ if (nc-peer nc-info-type == NET_CLIENT_OPTIONS_KIND_NIC) { NICState *nic = qemu_get_nic(nc); @@
[PATCH V4 RESEND 06/22] net: introduce qemu_find_net_clients_except()
In multiqueue, all NetClientStates that belong to the same netdev or nic have the same id. So this patch introduces a helper, qemu_find_net_clients_except(), which finds all NetClientStates with the same id, except those of the given type. This will be used by multiqueue networking. Signed-off-by: Jason Wang jasow...@redhat.com Signed-off-by: Michael S. Tsirkin m...@redhat.com --- include/net/net.h |2 ++ net/net.c | 21 + 2 files changed, 23 insertions(+), 0 deletions(-) diff --git a/include/net/net.h b/include/net/net.h index f0d1aa2..995df5c 100644 --- a/include/net/net.h +++ b/include/net/net.h @@ -68,6 +68,8 @@ typedef struct NICState { } NICState; NetClientState *qemu_find_netdev(const char *id); +int qemu_find_net_clients_except(const char *id, NetClientState **ncs, + NetClientOptionsKind type, int max); NetClientState *qemu_new_net_client(NetClientInfo *info, NetClientState *peer, const char *model, diff --git a/net/net.c b/net/net.c index 47d56e3..16dd327 100644 --- a/net/net.c +++ b/net/net.c @@ -508,6 +508,27 @@ NetClientState *qemu_find_netdev(const char *id) return NULL; } +int qemu_find_net_clients_except(const char *id, NetClientState **ncs, + NetClientOptionsKind type, int max) +{ +NetClientState *nc; +int ret = 0; + +QTAILQ_FOREACH(nc, net_clients, next) { +if (nc-info-type == type) { +continue; +} +if (!strcmp(nc-name, id)) { +if (ret max) { +ncs[ret] = nc; +} +ret++; +} +} + +return ret; +} + static int nic_get_free_idx(void) { int index; -- 1.7.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V4 RESEND 07/22] net: introduce qemu_net_client_setup()
This patch separates the setup of NetClientState from its allocation. This will allow allocating an array of NetClientState and doing the initialization one by one, which is what multiqueue needs. Signed-off-by: Jason Wang jasow...@redhat.com Signed-off-by: Michael S. Tsirkin m...@redhat.com --- net/net.c | 29 +++-- 1 files changed, 19 insertions(+), 10 deletions(-) diff --git a/net/net.c b/net/net.c index 16dd327..3a5bdf6 100644 --- a/net/net.c +++ b/net/net.c @@ -182,17 +182,12 @@ static char *assign_name(NetClientState *nc1, const char *model) return g_strdup(buf); } -NetClientState *qemu_new_net_client(NetClientInfo *info, -NetClientState *peer, -const char *model, -const char *name) +static void qemu_net_client_setup(NetClientState *nc, + NetClientInfo *info, + NetClientState *peer, + const char *model, + const char *name) { -NetClientState *nc; - -assert(info-size = sizeof(NetClientState)); - -nc = g_malloc0(info-size); - nc-info = info; nc-model = g_strdup(model); if (name) { @@ -210,6 +205,20 @@ NetClientState *qemu_new_net_client(NetClientInfo *info, nc-send_queue = qemu_new_net_queue(nc); +} + +NetClientState *qemu_new_net_client(NetClientInfo *info, +NetClientState *peer, +const char *model, +const char *name) +{ +NetClientState *nc; + +assert(info-size = sizeof(NetClientState)); + +nc = g_malloc0(info-size); +qemu_net_client_setup(nc, info, peer, model, name); + return nc; } -- 1.7.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V4 RESEND 08/22] net: introduce NetClientState destructor
To allow allocating an array of NetClientState and freeing it once, this patch introduces a destructor for NetClientState. The destructor can do a type-specific free, which multiqueue can use to free the array once. Signed-off-by: Jason Wang jasow...@redhat.com Signed-off-by: Michael S. Tsirkin m...@redhat.com --- include/net/net.h |2 ++ net/net.c | 17 + 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/include/net/net.h b/include/net/net.h index 995df5c..22adc99 100644 --- a/include/net/net.h +++ b/include/net/net.h @@ -35,6 +35,7 @@ typedef ssize_t (NetReceive)(NetClientState *, const uint8_t *, size_t); typedef ssize_t (NetReceiveIOV)(NetClientState *, const struct iovec *, int); typedef void (NetCleanup) (NetClientState *); typedef void (LinkStatusChanged)(NetClientState *); +typedef void (NetClientDestructor)(NetClientState *); typedef struct NetClientInfo { NetClientOptionsKind type; @@ -58,6 +59,7 @@ struct NetClientState { char *name; char info_str[256]; unsigned receive_disabled : 1; +NetClientDestructor *destructor; }; typedef struct NICState { diff --git a/net/net.c b/net/net.c index 3a5bdf6..98a1934 100644 --- a/net/net.c +++ b/net/net.c @@ -182,11 +182,17 @@ static char *assign_name(NetClientState *nc1, const char *model) return g_strdup(buf); } +static void qemu_net_client_destructor(NetClientState *nc) +{ +g_free(nc); +} + static void qemu_net_client_setup(NetClientState *nc, NetClientInfo *info, NetClientState *peer, const char *model, - const char *name) + const char *name, + NetClientDestructor *destructor) { nc-info = info; nc-model = g_strdup(model); @@ -204,7 +210,7 @@ static void qemu_net_client_setup(NetClientState *nc, QTAILQ_INSERT_TAIL(net_clients, nc, next); nc-send_queue = qemu_new_net_queue(nc); - +nc-destructor = destructor; } NetClientState *qemu_new_net_client(NetClientInfo *info, @@ -217,7 +223,8 @@ NetClientState *qemu_new_net_client(NetClientInfo *info, assert(info-size = sizeof(NetClientState)); nc = g_malloc0(info-size); 
-qemu_net_client_setup(nc, info, peer, model, name); +qemu_net_client_setup(nc, info, peer, model, name, + qemu_net_client_destructor); return nc; } @@ -279,7 +286,9 @@ static void qemu_free_net_client(NetClientState *nc) } g_free(nc-name); g_free(nc-model); -g_free(nc); +if (nc-destructor) { +nc-destructor(nc); +} } void qemu_del_net_client(NetClientState *nc) -- 1.7.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V4 RESEND 09/22] net: multiqueue support
This patch adds basic multiqueue support for qemu. The idea is simple, an array of NetClientStates were introduced in NICState, parse_netdev() were extended to find and match all NetClientStates belongs to the backend and place their pointers in NICConf. Then qemu_new_nic can setup a N:N mapping between NICStates that belongs to a nic and NICStates belongs to the netdev. And a queue_index were introduced in NetClientState to track its index. After this, each peers of a NICState were abstracted as a queue. After this change, all NetClientState that belongs to the same backend/nic has the same id. When use want to change the link status, all NetClientStates that belongs to the same backend/nic will be also changed. When user want to delete a device or netdev, all NetClientStates that belongs to the same backend/nic will be deleted also. Changing or deleting an specific queue is not allowed. Signed-off-by: Jason Wang jasow...@redhat.com Signed-off-by: Michael S. Tsirkin m...@redhat.com --- hw/dp8393x.c|2 +- hw/mcf_fec.c|2 +- hw/qdev-properties-system.c | 46 +++--- hw/qdev-properties.h|6 +- include/net/net.h | 18 +-- net/net.c | 113 +++ 6 files changed, 139 insertions(+), 48 deletions(-) diff --git a/hw/dp8393x.c b/hw/dp8393x.c index 0273fad..808157b 100644 --- a/hw/dp8393x.c +++ b/hw/dp8393x.c @@ -900,7 +900,7 @@ void dp83932_init(NICInfo *nd, hwaddr base, int it_shift, s-regs[SONIC_SR] = 0x0004; /* only revision recognized by Linux */ s-conf.macaddr = nd-macaddr; -s-conf.peer = nd-netdev; +s-conf.peers.ncs[0] = nd-netdev; s-nic = qemu_new_nic(net_dp83932_info, s-conf, nd-model, nd-name, s); diff --git a/hw/mcf_fec.c b/hw/mcf_fec.c index 909e32b..8e60f09 100644 --- a/hw/mcf_fec.c +++ b/hw/mcf_fec.c @@ -472,7 +472,7 @@ void mcf_fec_init(MemoryRegion *sysmem, NICInfo *nd, memory_region_add_subregion(sysmem, base, s-iomem); s-conf.macaddr = nd-macaddr; -s-conf.peer = nd-netdev; +s-conf.peers.ncs[0] = nd-netdev; s-nic = qemu_new_nic(net_mcf_fec_info, s-conf, nd-model, 
nd-name, s); diff --git a/hw/qdev-properties-system.c b/hw/qdev-properties-system.c index ce0f793..ce3af22 100644 --- a/hw/qdev-properties-system.c +++ b/hw/qdev-properties-system.c @@ -173,16 +173,47 @@ PropertyInfo qdev_prop_chr = { static int parse_netdev(DeviceState *dev, const char *str, void **ptr) { -NetClientState *netdev = qemu_find_netdev(str); +NICPeers *peers_ptr = (NICPeers *)ptr; +NICConf *conf = container_of(peers_ptr, NICConf, peers); +NetClientState **ncs = peers_ptr-ncs; +NetClientState *peers[MAX_QUEUE_NUM]; +int queues, i = 0; +int ret; -if (netdev == NULL) { -return -ENOENT; +queues = qemu_find_net_clients_except(str, peers, + NET_CLIENT_OPTIONS_KIND_NIC, + MAX_QUEUE_NUM); +if (queues == 0) { +ret = -ENOENT; +goto err; } -if (netdev-peer) { -return -EEXIST; + +if (queues MAX_QUEUE_NUM) { +ret = -E2BIG; +goto err; +} + +for (i = 0; i queues; i++) { +if (peers[i] == NULL) { +ret = -ENOENT; +goto err; +} + +if (peers[i]-peer) { +ret = -EEXIST; +goto err; +} + +ncs[i] = peers[i]; +ncs[i]-queue_index = i; } -*ptr = netdev; + +conf-queues = queues; + return 0; + +err: +return ret; } static const char *print_netdev(void *ptr) @@ -249,7 +280,8 @@ static void set_vlan(Object *obj, Visitor *v, void *opaque, { DeviceState *dev = DEVICE(obj); Property *prop = opaque; -NetClientState **ptr = qdev_get_prop_ptr(dev, prop); +NICPeers *peers_ptr = qdev_get_prop_ptr(dev, prop); +NetClientState **ptr = peers_ptr-ncs[0]; Error *local_err = NULL; int32_t id; NetClientState *hubport; diff --git a/hw/qdev-properties.h b/hw/qdev-properties.h index ddcf774..20c67f3 100644 --- a/hw/qdev-properties.h +++ b/hw/qdev-properties.h @@ -31,7 +31,7 @@ extern PropertyInfo qdev_prop_pci_host_devaddr; .name = (_name),\ .info = (_prop), \ .offset= offsetof(_state, _field)\ -+ type_check(_type,typeof_field(_state, _field)),\ ++ type_check(_type, typeof_field(_state, _field)), \ } #define DEFINE_PROP_DEFAULT(_name, _state, _field, _defval, _prop, _type) { \ .name = (_name), \ @@ 
-77,9 +77,9 @@ extern PropertyInfo qdev_prop_pci_host_devaddr; #define DEFINE_PROP_STRING(_n, _s, _f) \ DEFINE_PROP(_n, _s, _f, qdev_prop_string, char*) #define DEFINE_PROP_NETDEV(_n, _s, _f)
[PATCH V4 RESEND 10/22] tap: import linux multiqueue constants
Import multiqueue constants from if_tun.h in Linux 3.8-rc3. A new ifr flag, IFF_MULTI_QUEUE, was introduced to create a multiqueue backend by calling TUNSETIFF with this flag and the same interface name multiple times. A new ioctl, TUNSETQUEUE, was introduced. When this ioctl is issued with IFF_DETACH_QUEUE, the queue is disabled in the Linux kernel. When it is issued with IFF_ATTACH_QUEUE, the queue is enabled in the Linux kernel. Signed-off-by: Jason Wang jasow...@redhat.com Signed-off-by: Michael S. Tsirkin m...@redhat.com --- net/tap-linux.h |4 1 files changed, 4 insertions(+), 0 deletions(-) diff --git a/net/tap-linux.h b/net/tap-linux.h index cb2a6d4..65087e1 100644 --- a/net/tap-linux.h +++ b/net/tap-linux.h @@ -29,6 +29,7 @@ #define TUNSETSNDBUF _IOW('T', 212, int) #define TUNGETVNETHDRSZ _IOR('T', 215, int) #define TUNSETVNETHDRSZ _IOW('T', 216, int) +#define TUNSETQUEUE _IOW('T', 217, int) #endif @@ -36,6 +37,9 @@ #define IFF_TAP0x0002 #define IFF_NO_PI 0x1000 #define IFF_VNET_HDR 0x4000 +#define IFF_MULTI_QUEUE 0x0100 +#define IFF_ATTACH_QUEUE 0x0200 +#define IFF_DETACH_QUEUE 0x0400 /* Features for GSO (TUNSETOFFLOAD). */ #define TUN_F_CSUM 0x01/* You can hand me unchecksummed packets. */ -- 1.7.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V4 RESEND 11/22] tap: factor out common tap initialization
This patch factors out the common initialization of tap into a new helper net_init_tap_one(). This will be used by multiqueue tap patches. Signed-off-by: Jason Wang jasow...@redhat.com Signed-off-by: Michael S. Tsirkin m...@redhat.com --- net/tap.c | 130 ++--- 1 files changed, 73 insertions(+), 57 deletions(-) diff --git a/net/tap.c b/net/tap.c index 5542c98..23fb6e0 100644 --- a/net/tap.c +++ b/net/tap.c @@ -591,6 +591,73 @@ static int net_tap_init(const NetdevTapOptions *tap, int *vnet_hdr, return fd; } +static int net_init_tap_one(const NetdevTapOptions *tap, NetClientState *peer, +const char *model, const char *name, +const char *ifname, const char *script, +const char *downscript, const char *vhostfdname, +int vnet_hdr, int fd) +{ +TAPState *s; + +s = net_tap_fd_init(peer, model, name, fd, vnet_hdr); +if (!s) { +close(fd); +return -1; +} + +if (tap_set_sndbuf(s-fd, tap) 0) { +return -1; +} + +if (tap-has_fd) { +snprintf(s-nc.info_str, sizeof(s-nc.info_str), fd=%d, fd); +} else if (tap-has_helper) { +snprintf(s-nc.info_str, sizeof(s-nc.info_str), helper=%s, + tap-helper); +} else { +const char *downscript; + +downscript = tap-has_downscript ? tap-downscript : +DEFAULT_NETWORK_DOWN_SCRIPT; + +snprintf(s-nc.info_str, sizeof(s-nc.info_str), + ifname=%s,script=%s,downscript=%s, ifname, script, + downscript); + +if (strcmp(downscript, no) != 0) { +snprintf(s-down_script, sizeof(s-down_script), %s, downscript); +snprintf(s-down_script_arg, sizeof(s-down_script_arg), + %s, ifname); +} +} + +if (tap-has_vhost ? 
tap-vhost : +vhostfdname || (tap-has_vhostforce tap-vhostforce)) { +int vhostfd; + +if (tap-has_vhostfd) { +vhostfd = monitor_handle_fd_param(cur_mon, vhostfdname); +if (vhostfd == -1) { +return -1; +} +} else { +vhostfd = -1; +} + +s-vhost_net = vhost_net_init(s-nc, vhostfd, + tap-has_vhostforce tap-vhostforce); +if (!s-vhost_net) { +error_report(vhost-net requested but could not be initialized); +return -1; +} +} else if (tap-has_vhostfd) { +error_report(vhostfd= is not valid without vhost); +return -1; +} + +return 0; +} + int net_init_tap(const NetClientOptions *opts, const char *name, NetClientState *peer) { @@ -598,10 +665,10 @@ int net_init_tap(const NetClientOptions *opts, const char *name, int fd, vnet_hdr = 0; const char *model; -TAPState *s; /* for the no-fd, no-helper case */ const char *script = NULL; /* suppress wrong uninit'd use gcc warning */ +const char *downscript = NULL; char ifname[128]; assert(opts-kind == NET_CLIENT_OPTIONS_KIND_TAP); @@ -647,6 +714,8 @@ int net_init_tap(const NetClientOptions *opts, const char *name, } else { script = tap-has_script ? tap-script : DEFAULT_NETWORK_SCRIPT; +downscript = tap-has_downscript ? tap-downscript : +DEFAULT_NETWORK_DOWN_SCRIPT; fd = net_tap_init(tap, vnet_hdr, script, ifname, sizeof ifname); if (fd == -1) { return -1; @@ -655,62 +724,9 @@ int net_init_tap(const NetClientOptions *opts, const char *name, model = tap; } -s = net_tap_fd_init(peer, model, name, fd, vnet_hdr); -if (!s) { -close(fd); -return -1; -} - -if (tap_set_sndbuf(s-fd, tap) 0) { -return -1; -} - -if (tap-has_fd) { -snprintf(s-nc.info_str, sizeof(s-nc.info_str), fd=%d, fd); -} else if (tap-has_helper) { -snprintf(s-nc.info_str, sizeof(s-nc.info_str), helper=%s, - tap-helper); -} else { -const char *downscript; - -downscript = tap-has_downscript ? 
tap-downscript : - DEFAULT_NETWORK_DOWN_SCRIPT; - -snprintf(s-nc.info_str, sizeof(s-nc.info_str), - ifname=%s,script=%s,downscript=%s, ifname, script, - downscript); - -if (strcmp(downscript, no) != 0) { -snprintf(s-down_script, sizeof(s-down_script), %s, downscript); -snprintf(s-down_script_arg, sizeof(s-down_script_arg), %s, ifname); -} -} - -if (tap-has_vhost ? tap-vhost : -tap-has_vhostfd || (tap-has_vhostforce tap-vhostforce)) { -int vhostfd; - -if (tap-has_vhostfd) { -vhostfd = monitor_handle_fd_param(cur_mon, tap-vhostfd); -if (vhostfd == -1) { -return -1; -
[PATCH V4 RESEND 12/22] tap: add Linux multiqueue support
This patch add basic multiqueue support for Linux. When multiqueue is needed, we will first check whether kernel support multiqueue tap before creating more queues. Two new functions tap_fd_enable() and tap_fd_disable() were introduced to enable and disable a specific queue. Since the multiqueue is only supported in Linux, return error on other platforms. Signed-off-by: Jason Wang jasow...@redhat.com Signed-off-by: Michael S. Tsirkin m...@redhat.com --- net/tap-aix.c | 10 ++ net/tap-bsd.c | 10 ++ net/tap-haiku.c | 10 ++ net/tap-linux.c | 51 +++ net/tap-solaris.c | 10 ++ net/tap_int.h |2 ++ 6 files changed, 93 insertions(+), 0 deletions(-) diff --git a/net/tap-aix.c b/net/tap-aix.c index aff6c52..66e0574 100644 --- a/net/tap-aix.c +++ b/net/tap-aix.c @@ -59,3 +59,13 @@ void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo) { } + +int tap_fd_enable(int fd) +{ +return -1; +} + +int tap_fd_disable(int fd) +{ +return -1; +} diff --git a/net/tap-bsd.c b/net/tap-bsd.c index 01c705b..5ed2d16 100644 --- a/net/tap-bsd.c +++ b/net/tap-bsd.c @@ -145,3 +145,13 @@ void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo) { } + +int tap_fd_enable(int fd) +{ +return -1; +} + +int tap_fd_disable(int fd) +{ +return -1; +} diff --git a/net/tap-haiku.c b/net/tap-haiku.c index 08cc034..0f1b1fe 100644 --- a/net/tap-haiku.c +++ b/net/tap-haiku.c @@ -59,3 +59,13 @@ void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo) { } + +int tap_fd_enable(int fd) +{ +return -1; +} + +int tap_fd_disable(int fd) +{ +return -1; +} diff --git a/net/tap-linux.c b/net/tap-linux.c index 0a6acc7..42376cc 100644 --- a/net/tap-linux.c +++ b/net/tap-linux.c @@ -41,6 +41,7 @@ int tap_open(char *ifname, int ifname_size, int *vnet_hdr, int vnet_hdr_required struct ifreq ifr; int fd, ret; int len = sizeof(struct virtio_net_hdr); +int mq_required = 0; TFR(fd = open(PATH_NET_TUN, O_RDWR)); if (fd 0) { @@ -76,6 +77,20 @@ int tap_open(char *ifname, 
int ifname_size, int *vnet_hdr, int vnet_hdr_required ioctl(fd, TUNSETVNETHDRSZ, len); } +if (mq_required) { +unsigned int features; + +if ((ioctl(fd, TUNGETFEATURES, features) != 0) || +!(features IFF_MULTI_QUEUE)) { +error_report(multiqueue required, but no kernel + support for IFF_MULTI_QUEUE available); +close(fd); +return -1; +} else { +ifr.ifr_flags |= IFF_MULTI_QUEUE; +} +} + if (ifname[0] != '\0') pstrcpy(ifr.ifr_name, IFNAMSIZ, ifname); else @@ -209,3 +224,39 @@ void tap_fd_set_offload(int fd, int csum, int tso4, } } } + +/* Enable a specific queue of tap. */ +int tap_fd_enable(int fd) +{ +struct ifreq ifr; +int ret; + +memset(ifr, 0, sizeof(ifr)); + +ifr.ifr_flags = IFF_ATTACH_QUEUE; +ret = ioctl(fd, TUNSETQUEUE, (void *) ifr); + +if (ret != 0) { +error_report(could not enable queue); +} + +return ret; +} + +/* Disable a specific queue of tap/ */ +int tap_fd_disable(int fd) +{ +struct ifreq ifr; +int ret; + +memset(ifr, 0, sizeof(ifr)); + +ifr.ifr_flags = IFF_DETACH_QUEUE; +ret = ioctl(fd, TUNSETQUEUE, (void *) ifr); + +if (ret != 0) { +error_report(could not disable queue); +} + +return ret; +} diff --git a/net/tap-solaris.c b/net/tap-solaris.c index 486a7ea..cc08e9e 100644 --- a/net/tap-solaris.c +++ b/net/tap-solaris.c @@ -225,3 +225,13 @@ void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo) { } + +int tap_fd_enable(int fd) +{ +return -1; +} + +int tap_fd_disable(int fd) +{ +return -1; +} diff --git a/net/tap_int.h b/net/tap_int.h index 1dffe12..ca1c21b 100644 --- a/net/tap_int.h +++ b/net/tap_int.h @@ -42,5 +42,7 @@ int tap_probe_vnet_hdr_len(int fd, int len); int tap_probe_has_ufo(int fd); void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo); void tap_fd_set_vnet_hdr_len(int fd, int len); +int tap_fd_enable(int fd); +int tap_fd_disable(int fd); #endif /* QEMU_TAP_H */ -- 1.7.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More 
majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V4 RESEND 13/22] tap: support enabling or disabling a queue
This patch introduces a new bit, enabled, in TAPState which tracks whether a specific queue/fd is enabled. The tap/fd is enabled during initialization and can be enabled/disabled by tap_enable() and tap_disable(), which call platform-specific helpers to do the real work. Polling of a tap fd can only be done when the tap is enabled. Signed-off-by: Jason Wang jasow...@redhat.com Signed-off-by: Michael S. Tsirkin m...@redhat.com --- include/net/tap.h |2 ++ net/tap-win32.c | 10 ++ net/tap.c | 43 --- 3 files changed, 52 insertions(+), 3 deletions(-) diff --git a/include/net/tap.h b/include/net/tap.h index 883cebf..a994f20 100644 --- a/include/net/tap.h +++ b/include/net/tap.h @@ -35,6 +35,8 @@ int tap_has_vnet_hdr_len(NetClientState *nc, int len); void tap_using_vnet_hdr(NetClientState *nc, bool using_vnet_hdr); void tap_set_offload(NetClientState *nc, int csum, int tso4, int tso6, int ecn, int ufo); void tap_set_vnet_hdr_len(NetClientState *nc, int len); +int tap_enable(NetClientState *nc); +int tap_disable(NetClientState *nc); int tap_get_fd(NetClientState *nc); diff --git a/net/tap-win32.c b/net/tap-win32.c index 601437e..d0b89f6 100644 --- a/net/tap-win32.c +++ b/net/tap-win32.c @@ -764,3 +764,13 @@ void tap_set_vnet_hdr_len(NetClientState *nc, int len) { abort(); } + +int tap_enable(NetClientState *nc) +{ +return 0; +} + +int tap_disable(NetClientState *nc) +{ +abort(); +} diff --git a/net/tap.c b/net/tap.c index 23fb6e0..8610ba2 100644 --- a/net/tap.c +++ b/net/tap.c @@ -59,6 +59,7 @@ typedef struct TAPState { bool write_poll; bool using_vnet_hdr; bool has_ufo; +bool enabled; VHostNetState *vhost_net; unsigned host_vnet_hdr_len; } TAPState; @@ -72,9 +73,9 @@ static void tap_writable(void *opaque); static void tap_update_fd_handler(TAPState *s) { qemu_set_fd_handler2(s-fd, - s-read_poll ? tap_can_send : NULL, - s-read_poll ? tap_send : NULL, - s-write_poll ? tap_writable : NULL, + s-read_poll s-enabled ? tap_can_send : NULL, + s-read_poll s-enabled ? 
tap_send : NULL, + s-write_poll s-enabled ? tap_writable : NULL, s); } @@ -337,6 +338,7 @@ static TAPState *net_tap_fd_init(NetClientState *peer, s-host_vnet_hdr_len = vnet_hdr ? sizeof(struct virtio_net_hdr) : 0; s-using_vnet_hdr = false; s-has_ufo = tap_probe_has_ufo(s-fd); +s-enabled = true; tap_set_offload(s-nc, 0, 0, 0, 0, 0); /* * Make sure host header length is set correctly in tap: @@ -735,3 +737,38 @@ VHostNetState *tap_get_vhost_net(NetClientState *nc) assert(nc-info-type == NET_CLIENT_OPTIONS_KIND_TAP); return s-vhost_net; } + +int tap_enable(NetClientState *nc) +{ +TAPState *s = DO_UPCAST(TAPState, nc, nc); +int ret; + +if (s-enabled) { +return 0; +} else { +ret = tap_fd_enable(s-fd); +if (ret == 0) { +s-enabled = true; +tap_update_fd_handler(s); +} +return ret; +} +} + +int tap_disable(NetClientState *nc) +{ +TAPState *s = DO_UPCAST(TAPState, nc, nc); +int ret; + +if (s-enabled == 0) { +return 0; +} else { +ret = tap_fd_disable(s-fd); +if (ret == 0) { +qemu_purge_queued_packets(nc); +s-enabled = false; +tap_update_fd_handler(s); +} +return ret; +} +} -- 1.7.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V4 RESEND 14/22] tap: introduce a helper to get the name of an interface
This patch introduces a helper tap_get_ifname() to get the device name of a tap device. This is needed when ifname is unspecified on the command line and qemu is asked to create the tap device by itself. In this situation, the name is allocated by the kernel, so if multiqueue is requested, we need to fetch its name after creating the first queue. Only Linux has this support since it's the only platform that supports multiqueue tap. Signed-off-by: Jason Wang jasow...@redhat.com Signed-off-by: Michael S. Tsirkin m...@redhat.com --- include/net/tap.h |1 + net/tap-aix.c |5 + net/tap-bsd.c |5 + net/tap-haiku.c |5 + net/tap-linux.c | 14 ++ net/tap-solaris.c |5 + net/tap_int.h |1 + 7 files changed, 36 insertions(+), 0 deletions(-) diff --git a/include/net/tap.h b/include/net/tap.h index a994f20..c3eb85a 100644 --- a/include/net/tap.h +++ b/include/net/tap.h @@ -37,6 +37,7 @@ void tap_set_offload(NetClientState *nc, int csum, int tso4, int tso6, int ecn, void tap_set_vnet_hdr_len(NetClientState *nc, int len); int tap_enable(NetClientState *nc); int tap_disable(NetClientState *nc); +int tap_get_ifname(NetClientState *nc, char *ifname); int tap_get_fd(NetClientState *nc); diff --git a/net/tap-aix.c b/net/tap-aix.c index 66e0574..0e1eac3 100644 --- a/net/tap-aix.c +++ b/net/tap-aix.c @@ -69,3 +69,8 @@ int tap_fd_disable(int fd) { return -1; } + +int tap_fd_get_ifname(int fd, char *ifname) +{ +return -1; +} diff --git a/net/tap-bsd.c b/net/tap-bsd.c index 5ed2d16..4f22109 100644 --- a/net/tap-bsd.c +++ b/net/tap-bsd.c @@ -155,3 +155,8 @@ int tap_fd_disable(int fd) { return -1; } + +int tap_fd_get_ifname(int fd, char *ifname) +{ +return -1; +} diff --git a/net/tap-haiku.c b/net/tap-haiku.c index 0f1b1fe..b3b5fbb 100644 --- a/net/tap-haiku.c +++ b/net/tap-haiku.c @@ -69,3 +69,8 @@ int tap_fd_disable(int fd) { return -1; } + +int tap_fd_get_ifname(int fd, char *ifname) +{ +return -1; +} diff --git a/net/tap-linux.c b/net/tap-linux.c index 42376cc..3b21662 100644 --- a/net/tap-linux.c +++ 
b/net/tap-linux.c @@ -260,3 +260,17 @@ int tap_fd_disable(int fd) return ret; } + +int tap_fd_get_ifname(int fd, char *ifname) +{ +struct ifreq ifr; + +if (ioctl(fd, TUNGETIFF, ifr) != 0) { +error_report(TUNGETIFF ioctl() failed: %s, + strerror(errno)); +return -1; +} + +pstrcpy(ifname, sizeof(ifr.ifr_name), ifr.ifr_name); +return 0; +} diff --git a/net/tap-solaris.c b/net/tap-solaris.c index cc08e9e..214d95e 100644 --- a/net/tap-solaris.c +++ b/net/tap-solaris.c @@ -235,3 +235,8 @@ int tap_fd_disable(int fd) { return -1; } + +int tap_fd_get_ifname(int fd, char *ifname) +{ +return -1; +} diff --git a/net/tap_int.h b/net/tap_int.h index ca1c21b..125f83d 100644 --- a/net/tap_int.h +++ b/net/tap_int.h @@ -44,5 +44,6 @@ void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo); void tap_fd_set_vnet_hdr_len(int fd, int len); int tap_fd_enable(int fd); int tap_fd_disable(int fd); +int tap_fd_get_ifname(int fd, char *ifname); #endif /* QEMU_TAP_H */ -- 1.7.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V4 RESEND 15/22] tap: multiqueue support
Recently, Linux gained support for multiqueue tap, which lets userspace call TUNSETIFF for a single device many times to create multiple file descriptors as independent queues. Users can also enable/disable a specific queue through TUNSETQUEUE. The patch adds the generic infrastructure to create multiqueue taps. To achieve this, a new parameter, queues, is introduced to specify how many queues are expected to be created for the tap by qemu itself. Alternatively, management can also pass multiple pre-created tap file descriptors separated with ':' through a new parameter, fds, like -netdev tap,id=hn0,fds=X:Y:..:Z. Multiple vhost file descriptors can also be passed in this way. Each TAPState is still associated with one tap fd, which means multiple TAPStates are created when the user needs multiqueue taps. Since each TAPState contains one NetClientState, with the multiqueue nic support, N peers of NetClientState are built up. A new parameter, mq_required, is introduced in tap_open() to create multiqueue tap fds. Signed-off-by: Jason Wang jasow...@redhat.com Signed-off-by: Michael S. 
Tsirkin m...@redhat.com --- include/net/tap.h |1 - net/tap-aix.c |3 +- net/tap-bsd.c |3 +- net/tap-haiku.c |3 +- net/tap-linux.c |4 +- net/tap-solaris.c |3 +- net/tap.c | 158 + net/tap_int.h |3 +- qapi-schema.json |5 +- 9 files changed, 139 insertions(+), 44 deletions(-) diff --git a/include/net/tap.h b/include/net/tap.h index c3eb85a..a994f20 100644 --- a/include/net/tap.h +++ b/include/net/tap.h @@ -37,7 +37,6 @@ void tap_set_offload(NetClientState *nc, int csum, int tso4, int tso6, int ecn, void tap_set_vnet_hdr_len(NetClientState *nc, int len); int tap_enable(NetClientState *nc); int tap_disable(NetClientState *nc); -int tap_get_ifname(NetClientState *nc, char *ifname); int tap_get_fd(NetClientState *nc); diff --git a/net/tap-aix.c b/net/tap-aix.c index 0e1eac3..3953b60 100644 --- a/net/tap-aix.c +++ b/net/tap-aix.c @@ -25,7 +25,8 @@ #include tap_int.h #include stdio.h -int tap_open(char *ifname, int ifname_size, int *vnet_hdr, int vnet_hdr_required) +int tap_open(char *ifname, int ifname_size, int *vnet_hdr, + int vnet_hdr_required, int mq_required) { fprintf(stderr, no tap on AIX\n); return -1; diff --git a/net/tap-bsd.c b/net/tap-bsd.c index 4f22109..bcdb268 100644 --- a/net/tap-bsd.c +++ b/net/tap-bsd.c @@ -33,7 +33,8 @@ #include net/if_tap.h #endif -int tap_open(char *ifname, int ifname_size, int *vnet_hdr, int vnet_hdr_required) +int tap_open(char *ifname, int ifname_size, int *vnet_hdr, + int vnet_hdr_required, int mq_required) { int fd; #ifdef TAPGIFNAME diff --git a/net/tap-haiku.c b/net/tap-haiku.c index b3b5fbb..e5ce436 100644 --- a/net/tap-haiku.c +++ b/net/tap-haiku.c @@ -25,7 +25,8 @@ #include tap_int.h #include stdio.h -int tap_open(char *ifname, int ifname_size, int *vnet_hdr, int vnet_hdr_required) +int tap_open(char *ifname, int ifname_size, int *vnet_hdr, + int vnet_hdr_required, int mq_required) { fprintf(stderr, no tap on Haiku\n); return -1; diff --git a/net/tap-linux.c b/net/tap-linux.c index 3b21662..a953189 100644 --- a/net/tap-linux.c 
+++ b/net/tap-linux.c @@ -36,12 +36,12 @@ #define PATH_NET_TUN /dev/net/tun -int tap_open(char *ifname, int ifname_size, int *vnet_hdr, int vnet_hdr_required) +int tap_open(char *ifname, int ifname_size, int *vnet_hdr, + int vnet_hdr_required, int mq_required) { struct ifreq ifr; int fd, ret; int len = sizeof(struct virtio_net_hdr); -int mq_required = 0; TFR(fd = open(PATH_NET_TUN, O_RDWR)); if (fd 0) { diff --git a/net/tap-solaris.c b/net/tap-solaris.c index 214d95e..9c7278f 100644 --- a/net/tap-solaris.c +++ b/net/tap-solaris.c @@ -173,7 +173,8 @@ static int tap_alloc(char *dev, size_t dev_size) return tap_fd; } -int tap_open(char *ifname, int ifname_size, int *vnet_hdr, int vnet_hdr_required) +int tap_open(char *ifname, int ifname_size, int *vnet_hdr, + int vnet_hdr_required, int mq_required) { char dev[10]=; int fd; diff --git a/net/tap.c b/net/tap.c index 8610ba2..1bf7609 100644 --- a/net/tap.c +++ b/net/tap.c @@ -558,17 +558,10 @@ int net_init_bridge(const NetClientOptions *opts, const char *name, static int net_tap_init(const NetdevTapOptions *tap, int *vnet_hdr, const char *setup_script, char *ifname, -size_t ifname_sz) +size_t ifname_sz, int mq_required) { int fd, vnet_hdr_required; -if (tap-has_ifname) { -pstrcpy(ifname, ifname_sz, tap-ifname); -} else { -assert(ifname_sz 0); -ifname[0] = '\0'; -} - if (tap-has_vnet_hdr) { *vnet_hdr = tap-vnet_hdr; vnet_hdr_required = *vnet_hdr; @@ -577,7 +570,8 @@ static int net_tap_init(const
[PATCH V4 RESEND 18/22] virtio: add a queue_index to VirtQueue
Add a queue_index field to VirtQueue and a helper, virtio_get_queue_index(), to fetch it; this can be used by devices that support multiqueue. Signed-off-by: Jason Wang jasow...@redhat.com Signed-off-by: Michael S. Tsirkin m...@redhat.com --- hw/virtio.c |8 hw/virtio.h |1 + 2 files changed, 9 insertions(+), 0 deletions(-) diff --git a/hw/virtio.c b/hw/virtio.c index d8c77b0..e259348 100644 --- a/hw/virtio.c +++ b/hw/virtio.c @@ -73,6 +73,8 @@ struct VirtQueue /* Notification enabled? */ bool notification; +uint16_t queue_index; + int inuse; uint16_t vector; @@ -931,6 +933,7 @@ void virtio_init(VirtIODevice *vdev, const char *name, for (i = 0; i VIRTIO_PCI_QUEUE_MAX; i++) { vdev-vq[i].vector = VIRTIO_NO_VECTOR; vdev-vq[i].vdev = vdev; +vdev-vq[i].queue_index = i; } vdev-name = name; @@ -1018,6 +1021,11 @@ VirtQueue *virtio_get_queue(VirtIODevice *vdev, int n) return vdev-vq + n; } +uint16_t virtio_get_queue_index(VirtQueue *vq) +{ +return vq-queue_index; +} + static void virtio_queue_guest_notifier_read(EventNotifier *n) { VirtQueue *vq = container_of(n, VirtQueue, guest_notifier); diff --git a/hw/virtio.h b/hw/virtio.h index d3da1d2..a29a54d 100644 --- a/hw/virtio.h +++ b/hw/virtio.h @@ -280,6 +280,7 @@ hwaddr virtio_queue_get_ring_size(VirtIODevice *vdev, int n); uint16_t virtio_queue_get_last_avail_idx(VirtIODevice *vdev, int n); void virtio_queue_set_last_avail_idx(VirtIODevice *vdev, int n, uint16_t idx); VirtQueue *virtio_get_queue(VirtIODevice *vdev, int n); +uint16_t virtio_get_queue_index(VirtQueue *vq); int virtio_queue_get_id(VirtQueue *vq); EventNotifier *virtio_queue_get_guest_notifier(VirtQueue *vq); void virtio_queue_set_guest_notifier_fd_handler(VirtQueue *vq, bool assign, -- 1.7.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V4 RESEND 19/22] virtio-net: separate virtqueue from VirtIONet
To support multiqueue virtio-net, the first step is to separate the virtqueue related fields from VirtIONet to a new structure VirtIONetQueue. The following patches will add an array of VirtIONetQueue to VirtIONet based on this patch. Signed-off-by: Jason Wang jasow...@redhat.com Signed-off-by: Michael S. Tsirkin m...@redhat.com --- hw/virtio-net.c | 195 --- 1 files changed, 114 insertions(+), 81 deletions(-) diff --git a/hw/virtio-net.c b/hw/virtio-net.c index f4146aa..4b285c1 100644 --- a/hw/virtio-net.c +++ b/hw/virtio-net.c @@ -26,28 +26,33 @@ #define MAC_TABLE_ENTRIES64 #define MAX_VLAN(1 12) /* Per 802.1Q definition */ +typedef struct VirtIONetQueue { +VirtQueue *rx_vq; +VirtQueue *tx_vq; +QEMUTimer *tx_timer; +QEMUBH *tx_bh; +int tx_waiting; +struct { +VirtQueueElement elem; +ssize_t len; +} async_tx; +struct VirtIONet *n; +} VirtIONetQueue; + typedef struct VirtIONet { VirtIODevice vdev; uint8_t mac[ETH_ALEN]; uint16_t status; -VirtQueue *rx_vq; -VirtQueue *tx_vq; +VirtIONetQueue vq; VirtQueue *ctrl_vq; NICState *nic; -QEMUTimer *tx_timer; -QEMUBH *tx_bh; uint32_t tx_timeout; int32_t tx_burst; -int tx_waiting; uint32_t has_vnet_hdr; size_t host_hdr_len; size_t guest_hdr_len; uint8_t has_ufo; -struct { -VirtQueueElement elem; -ssize_t len; -} async_tx; int mergeable_rx_bufs; uint8_t promisc; uint8_t allmulti; @@ -67,6 +72,12 @@ typedef struct VirtIONet DeviceState *qdev; } VirtIONet; +static VirtIONetQueue *virtio_net_get_queue(NetClientState *nc) +{ +VirtIONet *n = qemu_get_nic_opaque(nc); + +return n-vq; +} /* TODO * - we could suppress RX interrupt if we were so inclined. 
*/ @@ -135,6 +146,8 @@ static void virtio_net_vhost_status(VirtIONet *n, uint8_t status) error_report(unable to start vhost net: %d: falling back on userspace virtio, -r); n-vhost_started = 0; +} else { +n-vhost_started = 1; } } else { vhost_net_stop(n-vdev, nc, 1); @@ -145,25 +158,26 @@ static void virtio_net_vhost_status(VirtIONet *n, uint8_t status) static void virtio_net_set_status(struct VirtIODevice *vdev, uint8_t status) { VirtIONet *n = to_virtio_net(vdev); +VirtIONetQueue *q = n-vq; virtio_net_vhost_status(n, status); -if (!n-tx_waiting) { +if (!q-tx_waiting) { return; } if (virtio_net_started(n, status) !n-vhost_started) { -if (n-tx_timer) { -qemu_mod_timer(n-tx_timer, +if (q-tx_timer) { +qemu_mod_timer(q-tx_timer, qemu_get_clock_ns(vm_clock) + n-tx_timeout); } else { -qemu_bh_schedule(n-tx_bh); +qemu_bh_schedule(q-tx_bh); } } else { -if (n-tx_timer) { -qemu_del_timer(n-tx_timer); +if (q-tx_timer) { +qemu_del_timer(q-tx_timer); } else { -qemu_bh_cancel(n-tx_bh); +qemu_bh_cancel(q-tx_bh); } } } @@ -507,35 +521,40 @@ static void virtio_net_handle_rx(VirtIODevice *vdev, VirtQueue *vq) static int virtio_net_can_receive(NetClientState *nc) { VirtIONet *n = qemu_get_nic_opaque(nc); +VirtIONetQueue *q = virtio_net_get_queue(nc); + if (!n-vdev.vm_running) { return 0; } -if (!virtio_queue_ready(n-rx_vq) || -!(n-vdev.status VIRTIO_CONFIG_S_DRIVER_OK)) +if (!virtio_queue_ready(q-rx_vq) || +!(n-vdev.status VIRTIO_CONFIG_S_DRIVER_OK)) { return 0; +} return 1; } -static int virtio_net_has_buffers(VirtIONet *n, int bufsize) +static int virtio_net_has_buffers(VirtIONetQueue *q, int bufsize) { -if (virtio_queue_empty(n-rx_vq) || +VirtIONet *n = q-n; +if (virtio_queue_empty(q-rx_vq) || (n-mergeable_rx_bufs - !virtqueue_avail_bytes(n-rx_vq, bufsize, 0))) { -virtio_queue_set_notification(n-rx_vq, 1); + !virtqueue_avail_bytes(q-rx_vq, bufsize, 0))) { +virtio_queue_set_notification(q-rx_vq, 1); /* To avoid a race condition where the guest has made some buffers * available 
after the above check but before notification was * enabled, check for available buffers again. */ -if (virtio_queue_empty(n-rx_vq) || +if (virtio_queue_empty(q-rx_vq) || (n-mergeable_rx_bufs - !virtqueue_avail_bytes(n-rx_vq, bufsize, 0))) + !virtqueue_avail_bytes(q-rx_vq, bufsize, 0))) { return 0; +} } -virtio_queue_set_notification(n-rx_vq, 0); +virtio_queue_set_notification(q-rx_vq, 0); return 1; } @@ -638,6 +657,7 @@ static int receive_filter(VirtIONet *n, const uint8_t *buf, int size) static
[PATCH V4 RESEND 20/22] virtio-net: multiqueue support
This patch implements both userspace and vhost support for multiple queue virtio-net (VIRTIO_NET_F_MQ). This is done by introducing an array of VirtIONetQueue to VirtIONet. Signed-off-by: Jason Wang jasow...@redhat.com Signed-off-by: Michael S. Tsirkin m...@redhat.com --- hw/virtio-net.c | 301 +++ hw/virtio-net.h | 27 +- 2 files changed, 261 insertions(+), 67 deletions(-) diff --git a/hw/virtio-net.c b/hw/virtio-net.c index 4b285c1..2067fa7 100644 --- a/hw/virtio-net.c +++ b/hw/virtio-net.c @@ -44,7 +44,7 @@ typedef struct VirtIONet VirtIODevice vdev; uint8_t mac[ETH_ALEN]; uint16_t status; -VirtIONetQueue vq; +VirtIONetQueue vqs[MAX_QUEUE_NUM]; VirtQueue *ctrl_vq; NICState *nic; uint32_t tx_timeout; @@ -70,14 +70,23 @@ typedef struct VirtIONet } mac_table; uint32_t *vlans; DeviceState *qdev; +int multiqueue; +uint16_t max_queues; +uint16_t curr_queues; } VirtIONet; -static VirtIONetQueue *virtio_net_get_queue(NetClientState *nc) +static VirtIONetQueue *virtio_net_get_subqueue(NetClientState *nc) { VirtIONet *n = qemu_get_nic_opaque(nc); -return n-vq; +return n-vqs[nc-queue_index]; } + +static int vq2q(int queue_index) +{ +return queue_index / 2; +} + /* TODO * - we could suppress RX interrupt if we were so inclined. */ @@ -93,6 +102,7 @@ static void virtio_net_get_config(VirtIODevice *vdev, uint8_t *config) struct virtio_net_config netcfg; stw_p(netcfg.status, n-status); +stw_p(netcfg.max_virtqueue_pairs, n-max_queues); memcpy(netcfg.mac, n-mac, ETH_ALEN); memcpy(config, netcfg, sizeof(netcfg)); } @@ -120,6 +130,7 @@ static bool virtio_net_started(VirtIONet *n, uint8_t status) static void virtio_net_vhost_status(VirtIONet *n, uint8_t status) { NetClientState *nc = qemu_get_queue(n-nic); +int queues = n-multiqueue ? 
n-max_queues : 1; if (!nc-peer) { return; @@ -131,6 +142,7 @@ static void virtio_net_vhost_status(VirtIONet *n, uint8_t status) if (!tap_get_vhost_net(nc-peer)) { return; } + if (!!n-vhost_started == virtio_net_started(n, status) !nc-peer-link_down) { return; @@ -141,16 +153,14 @@ static void virtio_net_vhost_status(VirtIONet *n, uint8_t status) return; } n-vhost_started = 1; -r = vhost_net_start(n-vdev, nc, 1); +r = vhost_net_start(n-vdev, n-nic-ncs, queues); if (r 0) { error_report(unable to start vhost net: %d: falling back on userspace virtio, -r); n-vhost_started = 0; -} else { -n-vhost_started = 1; } } else { -vhost_net_stop(n-vdev, nc, 1); +vhost_net_stop(n-vdev, n-nic-ncs, queues); n-vhost_started = 0; } } @@ -158,26 +168,38 @@ static void virtio_net_vhost_status(VirtIONet *n, uint8_t status) static void virtio_net_set_status(struct VirtIODevice *vdev, uint8_t status) { VirtIONet *n = to_virtio_net(vdev); -VirtIONetQueue *q = n-vq; +VirtIONetQueue *q; +int i; +uint8_t queue_status; virtio_net_vhost_status(n, status); -if (!q-tx_waiting) { -return; -} +for (i = 0; i n-max_queues; i++) { +q = n-vqs[i]; -if (virtio_net_started(n, status) !n-vhost_started) { -if (q-tx_timer) { -qemu_mod_timer(q-tx_timer, - qemu_get_clock_ns(vm_clock) + n-tx_timeout); +if ((!n-multiqueue i != 0) || i = n-curr_queues) { +queue_status = 0; } else { -qemu_bh_schedule(q-tx_bh); +queue_status = status; } -} else { -if (q-tx_timer) { -qemu_del_timer(q-tx_timer); + +if (!q-tx_waiting) { +continue; +} + +if (virtio_net_started(n, queue_status) !n-vhost_started) { +if (q-tx_timer) { +qemu_mod_timer(q-tx_timer, + qemu_get_clock_ns(vm_clock) + n-tx_timeout); +} else { +qemu_bh_schedule(q-tx_bh); +} } else { -qemu_bh_cancel(q-tx_bh); +if (q-tx_timer) { +qemu_del_timer(q-tx_timer); +} else { +qemu_bh_cancel(q-tx_bh); +} } } } @@ -209,6 +231,8 @@ static void virtio_net_reset(VirtIODevice *vdev) n-nomulti = 0; n-nouni = 0; n-nobcast = 0; +/* multiqueue is disabled by default */ +n-curr_queues 
= 1; /* Flush any MAC and VLAN filter table state */ n-mac_table.in_use = 0; @@ -251,18 +275,70 @@ static int peer_has_ufo(VirtIONet *n) static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int mergeable_rx_bufs) { +int i; +NetClientState *nc; + n-mergeable_rx_bufs = mergeable_rx_bufs;
[PATCH V4 RESEND 21/22] virtio-net: migration support for multiqueue
This patch adds migration support for multiqueue virtio-net. Instead of bumping the version, we conditionally send the multiqueue state only when the device supports more than one queue, to maintain backward compatibility. Signed-off-by: Jason Wang jasow...@redhat.com Signed-off-by: Michael S. Tsirkin m...@redhat.com --- hw/virtio-net.c | 35 +-- 1 files changed, 29 insertions(+), 6 deletions(-) diff --git a/hw/virtio-net.c b/hw/virtio-net.c index 2067fa7..5699f5e 100644 --- a/hw/virtio-net.c +++ b/hw/virtio-net.c @@ -1093,8 +1093,8 @@ static void virtio_net_set_multiqueue(VirtIONet *n, int multiqueue, int ctrl) static void virtio_net_save(QEMUFile *f, void *opaque) { +int i; VirtIONet *n = opaque; -VirtIONetQueue *q = n-vqs[0]; /* At this point, backend must be stopped, otherwise * it might keep writing to memory. */ @@ -1102,7 +1102,7 @@ static void virtio_net_save(QEMUFile *f, void *opaque) virtio_save(n-vdev, f); qemu_put_buffer(f, n-mac, ETH_ALEN); -qemu_put_be32(f, q-tx_waiting); +qemu_put_be32(f, n-vqs[0].tx_waiting); qemu_put_be32(f, n-mergeable_rx_bufs); qemu_put_be16(f, n-status); qemu_put_byte(f, n-promisc); @@ -1118,13 +1118,19 @@ static void virtio_net_save(QEMUFile *f, void *opaque) qemu_put_byte(f, n-nouni); qemu_put_byte(f, n-nobcast); qemu_put_byte(f, n-has_ufo); +if (n-max_queues 1) { +qemu_put_be16(f, n-max_queues); +qemu_put_be16(f, n-curr_queues); +for (i = 1; i n-curr_queues; i++) { +qemu_put_be32(f, n-vqs[i].tx_waiting); +} +} } static int virtio_net_load(QEMUFile *f, void *opaque, int version_id) { VirtIONet *n = opaque; -VirtIONetQueue *q = n-vqs[0]; -int ret, i; +int ret, i, link_down; if (version_id 2 || version_id VIRTIO_NET_VM_VERSION) return -EINVAL; @@ -1135,7 +1141,7 @@ static int virtio_net_load(QEMUFile *f, void *opaque, int version_id) } qemu_get_buffer(f, n-mac, ETH_ALEN); -q-tx_waiting = qemu_get_be32(f); +n-vqs[0].tx_waiting = qemu_get_be32(f); virtio_net_set_mrg_rx_bufs(n, qemu_get_be32(f)); @@ -1205,6 +1211,20 @@ static int 
virtio_net_load(QEMUFile *f, void *opaque, int version_id) } } +if (n-max_queues 1) { +if (n-max_queues != qemu_get_be16(f)) { +error_report(virtio-net: different max_queues ); +return -1; +} + +n-curr_queues = qemu_get_be16(f); +for (i = 1; i n-curr_queues; i++) { +n-vqs[i].tx_waiting = qemu_get_be32(f); +} +} + +virtio_net_set_queues(n); + /* Find the first multicast entry in the saved MAC filter */ for (i = 0; i n-mac_table.in_use; i++) { if (n-mac_table.macs[i * ETH_ALEN] 1) { @@ -1215,7 +1235,10 @@ static int virtio_net_load(QEMUFile *f, void *opaque, int version_id) /* nc.link_down can't be migrated, so infer link_down according * to link status bit in n-status */ -qemu_get_queue(n-nic)-link_down = (n-status VIRTIO_NET_S_LINK_UP) == 0; +link_down = (n-status VIRTIO_NET_S_LINK_UP) == 0; +for (i = 0; i n-max_queues; i++) { +qemu_get_subqueue(n-nic, i)-link_down = link_down; +} return 0; } -- 1.7.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V4 RESEND 22/22] virtio-net: compat multiqueue support
Disable multiqueue support (mq=off) for pre-1.4 machine types. Signed-off-by: Jason Wang jasow...@redhat.com Signed-off-by: Michael S. Tsirkin m...@redhat.com --- hw/pc_piix.c |4 1 files changed, 4 insertions(+), 0 deletions(-) diff --git a/hw/pc_piix.c b/hw/pc_piix.c index ba09714..0af436c 100644 --- a/hw/pc_piix.c +++ b/hw/pc_piix.c @@ -313,6 +313,10 @@ static QEMUMachine pc_i440fx_machine_v1_4 = { .driver = virtio-net-pci,\ .property = ctrl_mac_addr,\ .value= off, \ +},{ \ +.driver = virtio-net-pci, \ +.property = mq, \ +.value= off, \ } static QEMUMachine pc_machine_v1_3 = { -- 1.7.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html