[no subject]
On Sun, Jan 06, 2013 at 02:36:13PM +0800, Asias He wrote: > This drops the cmd completion list spin lock and makes the cmd > completion queue lock-less. > > Signed-off-by: Asias He Nicholas, any feedback? > --- > drivers/vhost/tcm_vhost.c | 46 +- > drivers/vhost/tcm_vhost.h | 2 +- > 2 files changed, 14 insertions(+), 34 deletions(-) > > diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/tcm_vhost.c > index b20df5c..3720604 100644 > --- a/drivers/vhost/tcm_vhost.c > +++ b/drivers/vhost/tcm_vhost.c > @@ -47,6 +47,7 @@ > #include > #include /* TODO vhost.h currently depends on this */ > #include > +#include > > #include "vhost.c" > #include "vhost.h" > @@ -64,8 +65,7 @@ struct vhost_scsi { > struct vhost_virtqueue vqs[3]; > > struct vhost_work vs_completion_work; /* cmd completion work item */ > - struct list_head vs_completion_list; /* cmd completion queue */ > - spinlock_t vs_completion_lock;/* protects s_completion_list */ > + struct llist_head vs_completion_list; /* cmd completion queue */ > }; > > /* Local pointer to allocated TCM configfs fabric module */ > @@ -301,9 +301,7 @@ static void vhost_scsi_complete_cmd(struct tcm_vhost_cmd > *tv_cmd) > { > struct vhost_scsi *vs = tv_cmd->tvc_vhost; > > - spin_lock_bh(&vs->vs_completion_lock); > - list_add_tail(&tv_cmd->tvc_completion_list, &vs->vs_completion_list); > - spin_unlock_bh(&vs->vs_completion_lock); > + llist_add(&tv_cmd->tvc_completion_list, &vs->vs_completion_list); > > vhost_work_queue(&vs->dev, &vs->vs_completion_work); > } > @@ -347,27 +345,6 @@ static void vhost_scsi_free_cmd(struct tcm_vhost_cmd > *tv_cmd) > kfree(tv_cmd); > } > > -/* Dequeue a command from the completion list */ > -static struct tcm_vhost_cmd *vhost_scsi_get_cmd_from_completion( > - struct vhost_scsi *vs) > -{ > - struct tcm_vhost_cmd *tv_cmd = NULL; > - > - spin_lock_bh(&vs->vs_completion_lock); > - if (list_empty(&vs->vs_completion_list)) { > - spin_unlock_bh(&vs->vs_completion_lock); > - return NULL; > - } > - > - 
list_for_each_entry(tv_cmd, &vs->vs_completion_list, > - tvc_completion_list) { > - list_del(&tv_cmd->tvc_completion_list); > - break; > - } > - spin_unlock_bh(&vs->vs_completion_lock); > - return tv_cmd; > -} > - > /* Fill in status and signal that we are done processing this command > * > * This is scheduled in the vhost work queue so we are called with the owner > @@ -377,12 +354,18 @@ static void vhost_scsi_complete_cmd_work(struct > vhost_work *work) > { > struct vhost_scsi *vs = container_of(work, struct vhost_scsi, > vs_completion_work); > + struct virtio_scsi_cmd_resp v_rsp; > struct tcm_vhost_cmd *tv_cmd; > + struct llist_node *llnode; > + struct se_cmd *se_cmd; > + int ret; > > - while ((tv_cmd = vhost_scsi_get_cmd_from_completion(vs))) { > - struct virtio_scsi_cmd_resp v_rsp; > - struct se_cmd *se_cmd = &tv_cmd->tvc_se_cmd; > - int ret; > + llnode = llist_del_all(&vs->vs_completion_list); > + while (llnode) { > + tv_cmd = llist_entry(llnode, struct tcm_vhost_cmd, > + tvc_completion_list); > + llnode = llist_next(llnode); > + se_cmd = &tv_cmd->tvc_se_cmd; > > pr_debug("%s tv_cmd %p resid %u status %#02x\n", __func__, > tv_cmd, se_cmd->residual_count, se_cmd->scsi_status); > @@ -426,7 +409,6 @@ static struct tcm_vhost_cmd *vhost_scsi_allocate_cmd( > pr_err("Unable to allocate struct tcm_vhost_cmd\n"); > return ERR_PTR(-ENOMEM); > } > - INIT_LIST_HEAD(&tv_cmd->tvc_completion_list); > tv_cmd->tvc_tag = v_req->tag; > tv_cmd->tvc_task_attr = v_req->task_attr; > tv_cmd->tvc_exp_data_len = exp_data_len; > @@ -859,8 +841,6 @@ static int vhost_scsi_open(struct inode *inode, struct > file *f) > return -ENOMEM; > > vhost_work_init(&s->vs_completion_work, vhost_scsi_complete_cmd_work); > - INIT_LIST_HEAD(&s->vs_completion_list); > - spin_lock_init(&s->vs_completion_lock); > > s->vqs[VHOST_SCSI_VQ_CTL].handle_kick = vhost_scsi_ctl_handle_kick; > s->vqs[VHOST_SCSI_VQ_EVT].handle_kick = vhost_scsi_evt_handle_kick; > diff --git a/drivers/vhost/tcm_vhost.h 
b/drivers/vhost/tcm_vhost.h > index 7e87c63..47ee80b 100644 > --- a/drivers/vhost/tcm_vhost.h > +++ b/drivers/vhost/tcm_vhost.h > @@ -34,7 +34,7 @@ struct tcm_vhost_cmd { > /* Sense buffer that will be mapped into outgoing status */ > unsigned char tvc_sense_buf[TRANSPORT_SENSE_BUFFER]; > /* Completed commands list, serviced from vhost worker thread */ > - struct list_head tvc_completion_list; > + struct llist_n
Re: [PATCH v2 1/5] virtio: add functions for piecewise addition of buffers
Paolo Bonzini writes: > Il 07/01/2013 01:02, Rusty Russell ha scritto: >> Paolo Bonzini writes: >>> Il 02/01/2013 06:03, Rusty Russell ha scritto: Paolo Bonzini writes: > The virtqueue_add_buf function has two limitations: > > 1) it requires the caller to provide all the buffers in a single call; > > 2) it does not support chained scatterlists: the buffers must be > provided as an array of struct scatterlist; Chained scatterlists are a horrible interface, but that doesn't mean we shouldn't support them if there's a need. I think I once even had a patch which passed two chained sgs, rather than a combo sg and two length numbers. It's very old, but I've pasted it below. Duplicating the implementation by having another interface is pretty nasty; I think I'd prefer the chained scatterlists, if that's optimal for you. >>> >>> Unfortunately, that cannot work because not all architectures support >>> chained scatterlists. >> >> WHAT? I can't figure out what an arch needs to do to support this? > > It needs to use the iterator functions in its DMA driver. But we don't care for virtio. >> All archs we care about support them, though, so I think we can ignore >> this issue for now. > > Kind of... In principle all QEMU-supported arches can use virtio, and > the speedup can be quite useful. And there is no Kconfig symbol for SG > chains that I can use to disable virtio-scsi on unsupported arches. :/ Well, we #error if it's not supported. Then the lazy architectures can fix it. Cheers, Rusty. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [Xen-devel] [PATCH v3 00/11] xen: Initial kexec/kdump implementation
On 04/01/13 14:22, Daniel Kiper wrote: > On Wed, Jan 02, 2013 at 11:26:43AM +, Andrew Cooper wrote: >> On 27/12/12 18:02, Eric W. Biederman wrote: >>> Andrew Cooper writes: >>> On 27/12/2012 07:53, Eric W. Biederman wrote: > The syscall ABI still has the wrong semantics. > > Aka totally unmaintainable and umergeable. > > The concept of domU support is also strange. What does domU support even > mean, when the dom0 support is loading a kernel to pick up Xen when Xen > falls over. There are two requirements pulling at this patch series, but I agree that we need to clarify them. >>> It probably make sense to split them apart a little even. >>> >>> >> >> Thinking about this split, there might be a way to simply it even more. >> >> /sbin/kexec can load the "Xen" crash kernel itself by issuing >> hypercalls using /dev/xen/privcmd. This would remove the need for >> the dom0 kernel to distinguish between loading a crash kernel for >> itself and loading a kernel for Xen. >> >> Or is this just a silly idea complicating the matter? > > This is impossible with current Xen kexec/kdump interface. > It should be changed to do that. However, I suppose that > Xen community would not be interested in such changes. I don't see why the hypercall ABI cannot be extended with new sub-ops that do the right thing -- the existing ABI is a bit weird. I plan to start prototyping something shortly (hopefully next week) for the Xen kexec case. David ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [Xen-devel] [PATCH v3 00/11] xen: Initial kexec/kdump implementation
On Mon, Jan 07, 2013 at 01:34:04PM +0100, Daniel Kiper wrote: > On Fri, Jan 04, 2013 at 02:11:46PM -0500, Konrad Rzeszutek Wilk wrote: > > On Fri, Jan 04, 2013 at 06:07:51PM +0100, Daniel Kiper wrote: > > > On Fri, Jan 04, 2013 at 02:41:17PM +, Jan Beulich wrote: > > > > >>> On 04.01.13 at 15:22, Daniel Kiper wrote: > > > > > On Wed, Jan 02, 2013 at 11:26:43AM +, Andrew Cooper wrote: > > > > >> /sbin/kexec can load the "Xen" crash kernel itself by issuing > > > > >> hypercalls using /dev/xen/privcmd. This would remove the need for > > > > >> the dom0 kernel to distinguish between loading a crash kernel for > > > > >> itself and loading a kernel for Xen. > > > > >> > > > > >> Or is this just a silly idea complicating the matter? > > > > > > > > > > This is impossible with current Xen kexec/kdump interface. > > > > > > > > Why? > > > > > > Because current KEXEC_CMD_kexec_load does not load kernel > > > image and other things into Xen memory. It means that it > > > should live somewhere in dom0 Linux kernel memory. > > > > We could have a very simple hypercall which would have: > > > > struct fancy_new_hypercall { > > xen_pfn_t payload; // IN > > ssize_t len; // IN > > #define DATA (1<<1) > > #define DATA_EOF (1<<2) > > #define DATA_KERNEL (1<<3) > > #define DATA_RAMDISK (1<<4) > > unsigned int flags; // IN > > unsigned int status; // OUT > > }; > > > > which would in a loop just iterate over the payloads and > > let the hypervisor stick it in the crashkernel space. > > > > This is all hand-waving of course. There probably would be a need > > to figure out how much space you have in the reserved Xen's > > 'crashkernel' memory region too. > > I think that new kexec hypercall function should mimics kexec syscall. > It means that all arguments passed to hypercall should have same types > if it is possible or if it is not possible then conversion should be done > in very easy way. 
Additionally, I think that one call of new hypercall > load function should load all needed thinks in right place and > return relevant status. Last but not least, new functionality should We are not restricted to just _one_ hypercall. And this loading thing could be similar to the microcode hypercall - which just points to a virtual address along with the length - and says 'load me'. > be available through /dev/xen/privcmd or directly from kernel without > bigger effort. Perhaps we should have an email thread on xen-devel where we hash out some ideas. Eric, would you be OK included on this - it would make sense for this mechanism to be as future-proof as possible - and I am not sure what your plans for kexec are in the future? > > Daniel ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH] drivers/xen: avoid out-of-range write in xen_add_device
>>> On 07.01.13 at 16:08, Konrad Rzeszutek Wilk wrote: > On Sat, Jan 05, 2013 at 02:18:46PM -0500, Nickolai Zeldovich wrote: >> xen_add_device() in drivers/xen/pci.c allocates a struct >> physdev_pci_device_add on the stack and then writes to optarr[0]. >> The previous declaration of struct physdev_pci_device_add contained >> a zero-length optarr[] array, presumably assuming it will be allocated >> with kmalloc with a suitable number of trailing elements, but the code in >> xen_add_device() as a result wrote past the end of the (stack-allocated) >> data structure. >> >> Since xen_add_device() is the only use of struct physdev_pci_device_add >> in the kernel, turn optarr[] into a single-element array instead. >> > > Lets include Jan and Xen-devel on this email - as this is also changing > the official header that is used in Xen. Correct - for that reason it is not the header that needs changing here, but the consumer of the header. I have to admit that I find it odd that the compiler allows automatic variables of variable size types without warning - otherwise this wouldn't have gone unnoticed. Jan >> Signed-off-by: Nickolai Zeldovich >> --- >> include/xen/interface/physdev.h |6 +- >> 1 file changed, 1 insertion(+), 5 deletions(-) >> >> diff --git a/include/xen/interface/physdev.h >> b/include/xen/interface/physdev.h >> index 1844d31..24fd218 100644 >> --- a/include/xen/interface/physdev.h >> +++ b/include/xen/interface/physdev.h >> @@ -242,11 +242,7 @@ struct physdev_pci_device_add { >> uint8_t bus; >> uint8_t devfn; >> } physfn; >> -#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L >> -uint32_t optarr[]; >> -#elif defined(__GNUC__) >> -uint32_t optarr[0]; >> -#endif >> +uint32_t optarr[1]; >> }; >> >> #define PHYSDEVOP_pci_device_remove 26 >> -- >> 1.7.10.4 >> ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH] drivers/xen: avoid out-of-range write in xen_add_device
On Sat, Jan 05, 2013 at 02:18:46PM -0500, Nickolai Zeldovich wrote: > xen_add_device() in drivers/xen/pci.c allocates a struct > physdev_pci_device_add on the stack and then writes to optarr[0]. > The previous declaration of struct physdev_pci_device_add contained > a zero-length optarr[] array, presumably assuming it will be allocated > with kmalloc with a suitable number of trailing elements, but the code in > xen_add_device() as a result wrote past the end of the (stack-allocated) > data structure. > > Since xen_add_device() is the only use of struct physdev_pci_device_add > in the kernel, turn optarr[] into a single-element array instead. > Lets include Jan and Xen-devel on this email - as this is also changing the official header that is used in Xen. > Signed-off-by: Nickolai Zeldovich > --- > include/xen/interface/physdev.h |6 +- > 1 file changed, 1 insertion(+), 5 deletions(-) > > diff --git a/include/xen/interface/physdev.h b/include/xen/interface/physdev.h > index 1844d31..24fd218 100644 > --- a/include/xen/interface/physdev.h > +++ b/include/xen/interface/physdev.h > @@ -242,11 +242,7 @@ struct physdev_pci_device_add { > uint8_t bus; > uint8_t devfn; > } physfn; > -#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L > -uint32_t optarr[]; > -#elif defined(__GNUC__) > -uint32_t optarr[0]; > -#endif > +uint32_t optarr[1]; > }; > > #define PHYSDEVOP_pci_device_remove 26 > -- > 1.7.10.4 > ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH V3 2/2] vhost: handle polling errors
On 01/07/2013 10:55 PM, Michael S. Tsirkin wrote: > On Mon, Jan 07, 2013 at 12:38:17PM +0800, Jason Wang wrote: >> On 01/06/2013 09:22 PM, Michael S. Tsirkin wrote: >>> On Sun, Jan 06, 2013 at 03:18:38PM +0800, Jason Wang wrote: Polling errors were ignored by vhost/vhost_net, this may lead to crash when trying to remove vhost from waitqueue when after the polling is failed. Solve this problem by: - checking the poll->wqh before trying to remove from waitqueue - report an error when poll() returns a POLLERR in vhost_start_poll() - report an error when vhost_start_poll() fails in vhost_vring_ioctl()/vhost_net_set_backend() which is used to notify the failure to userspace. - report an error in the data path in vhost_net when meet polling errors. After those changes, we can safely drop the tx polling state in vhost_net since it was replaced by the checking of poll->wqh. Signed-off-by: Jason Wang --- drivers/vhost/net.c | 74 drivers/vhost/vhost.c | 31 +++- drivers/vhost/vhost.h |2 +- 3 files changed, 49 insertions(+), 58 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index d10ad6f..125c1e5 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -64,20 +64,10 @@ enum { VHOST_NET_VQ_MAX = 2, }; -enum vhost_net_poll_state { - VHOST_NET_POLL_DISABLED = 0, - VHOST_NET_POLL_STARTED = 1, - VHOST_NET_POLL_STOPPED = 2, -}; - struct vhost_net { struct vhost_dev dev; struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX]; struct vhost_poll poll[VHOST_NET_VQ_MAX]; - /* Tells us whether we are polling a socket for TX. - * We only do this when socket buffer fills up. - * Protected by tx vq lock. */ - enum vhost_net_poll_state tx_poll_state; /* Number of TX recently submitted. * Protected by tx vq lock. 
*/ unsigned tx_packets; @@ -155,24 +145,6 @@ static void copy_iovec_hdr(const struct iovec *from, struct iovec *to, } } -/* Caller must have TX VQ lock */ -static void tx_poll_stop(struct vhost_net *net) -{ - if (likely(net->tx_poll_state != VHOST_NET_POLL_STARTED)) - return; - vhost_poll_stop(net->poll + VHOST_NET_VQ_TX); - net->tx_poll_state = VHOST_NET_POLL_STOPPED; -} - -/* Caller must have TX VQ lock */ -static void tx_poll_start(struct vhost_net *net, struct socket *sock) -{ - if (unlikely(net->tx_poll_state != VHOST_NET_POLL_STOPPED)) - return; - vhost_poll_start(net->poll + VHOST_NET_VQ_TX, sock->file); - net->tx_poll_state = VHOST_NET_POLL_STARTED; -} - /* In case of DMA done not in order in lower device driver for some reason. * upend_idx is used to track end of used idx, done_idx is used to track head * of used idx. Once lower device DMA done contiguously, we will signal KVM @@ -227,6 +199,7 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success) static void handle_tx(struct vhost_net *net) { struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX]; + struct vhost_poll *poll = net->poll + VHOST_NET_VQ_TX; unsigned out, in, s; int head; struct msghdr msg = { @@ -252,7 +225,8 @@ static void handle_tx(struct vhost_net *net) wmem = atomic_read(&sock->sk->sk_wmem_alloc); if (wmem >= sock->sk->sk_sndbuf) { mutex_lock(&vq->mutex); - tx_poll_start(net, sock); + if (vhost_poll_start(poll, sock->file)) + vq_err(vq, "Fail to start TX polling\n"); >>> s/Fail/Failed/ >>> >>> A question though: how can this happen? Could you clarify please? >>> Maybe we can find a way to prevent this error? >> Two conditions I think this can happen: >> >> 1) a buggy userspace disable a queue through TUNSETQUEUE >> 2) the net device were gone >> >> For 1, looks like we can delay the disabling until the refcnt goes to >> zero. For 2 may needs more changes. > I'd expect keeping a socket reference would prevent both issues. > Doesn't it? 
Doesn't work for 2 I think, the socket didn't hold a refcnt of the device, so the device can go away at anytime. Although we can change this, but it's the behaviour before multiqueue support. > >> Not sure it's worth to do this work, >> maybe a warning is enough just like other failure. > With other failures, you normally can correct the error then > kick to have it restart. This is something that would not > work here. If userspace is written correctly, (e.g passing a fd with correct state) it can also be corrected. > mutex_unlock(&vq->mutex); return;
Re: [PATCH V3 2/2] vhost: handle polling errors
On Mon, Jan 07, 2013 at 12:38:17PM +0800, Jason Wang wrote: > On 01/06/2013 09:22 PM, Michael S. Tsirkin wrote: > > On Sun, Jan 06, 2013 at 03:18:38PM +0800, Jason Wang wrote: > >> Polling errors were ignored by vhost/vhost_net, this may lead to crash when > >> trying to remove vhost from waitqueue when after the polling is failed. > >> Solve > >> this problem by: > >> > >> - checking the poll->wqh before trying to remove from waitqueue > >> - report an error when poll() returns a POLLERR in vhost_start_poll() > >> - report an error when vhost_start_poll() fails in > >> vhost_vring_ioctl()/vhost_net_set_backend() which is used to notify the > >> failure to userspace. > >> - report an error in the data path in vhost_net when meet polling errors. > >> > >> After those changes, we can safely drop the tx polling state in vhost_net > >> since > >> it was replaced by the checking of poll->wqh. > >> > >> Signed-off-by: Jason Wang > >> --- > >> drivers/vhost/net.c | 74 > >> > >> drivers/vhost/vhost.c | 31 +++- > >> drivers/vhost/vhost.h |2 +- > >> 3 files changed, 49 insertions(+), 58 deletions(-) > >> > >> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c > >> index d10ad6f..125c1e5 100644 > >> --- a/drivers/vhost/net.c > >> +++ b/drivers/vhost/net.c > >> @@ -64,20 +64,10 @@ enum { > >>VHOST_NET_VQ_MAX = 2, > >> }; > >> > >> -enum vhost_net_poll_state { > >> - VHOST_NET_POLL_DISABLED = 0, > >> - VHOST_NET_POLL_STARTED = 1, > >> - VHOST_NET_POLL_STOPPED = 2, > >> -}; > >> - > >> struct vhost_net { > >>struct vhost_dev dev; > >>struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX]; > >>struct vhost_poll poll[VHOST_NET_VQ_MAX]; > >> - /* Tells us whether we are polling a socket for TX. > >> - * We only do this when socket buffer fills up. > >> - * Protected by tx vq lock. */ > >> - enum vhost_net_poll_state tx_poll_state; > >>/* Number of TX recently submitted. > >> * Protected by tx vq lock. 
*/ > >>unsigned tx_packets; > >> @@ -155,24 +145,6 @@ static void copy_iovec_hdr(const struct iovec *from, > >> struct iovec *to, > >>} > >> } > >> > >> -/* Caller must have TX VQ lock */ > >> -static void tx_poll_stop(struct vhost_net *net) > >> -{ > >> - if (likely(net->tx_poll_state != VHOST_NET_POLL_STARTED)) > >> - return; > >> - vhost_poll_stop(net->poll + VHOST_NET_VQ_TX); > >> - net->tx_poll_state = VHOST_NET_POLL_STOPPED; > >> -} > >> - > >> -/* Caller must have TX VQ lock */ > >> -static void tx_poll_start(struct vhost_net *net, struct socket *sock) > >> -{ > >> - if (unlikely(net->tx_poll_state != VHOST_NET_POLL_STOPPED)) > >> - return; > >> - vhost_poll_start(net->poll + VHOST_NET_VQ_TX, sock->file); > >> - net->tx_poll_state = VHOST_NET_POLL_STARTED; > >> -} > >> - > >> /* In case of DMA done not in order in lower device driver for some > >> reason. > >> * upend_idx is used to track end of used idx, done_idx is used to track > >> head > >> * of used idx. Once lower device DMA done contiguously, we will signal > >> KVM > >> @@ -227,6 +199,7 @@ static void vhost_zerocopy_callback(struct ubuf_info > >> *ubuf, bool success) > >> static void handle_tx(struct vhost_net *net) > >> { > >>struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX]; > >> + struct vhost_poll *poll = net->poll + VHOST_NET_VQ_TX; > >>unsigned out, in, s; > >>int head; > >>struct msghdr msg = { > >> @@ -252,7 +225,8 @@ static void handle_tx(struct vhost_net *net) > >>wmem = atomic_read(&sock->sk->sk_wmem_alloc); > >>if (wmem >= sock->sk->sk_sndbuf) { > >>mutex_lock(&vq->mutex); > >> - tx_poll_start(net, sock); > >> + if (vhost_poll_start(poll, sock->file)) > >> + vq_err(vq, "Fail to start TX polling\n"); > > s/Fail/Failed/ > > > > A question though: how can this happen? Could you clarify please? > > Maybe we can find a way to prevent this error? 
> > Two conditions I think this can happen: > > 1) a buggy userspace disable a queue through TUNSETQUEUE > 2) the net device were gone > > For 1, looks like we can delay the disabling until the refcnt goes to > zero. For 2 may needs more changes. I'd expect keeping a socket reference would prevent both issues. Doesn't it? > Not sure it's worth to do this work, > maybe a warning is enough just like other failure. With other failures, you normally can correct the error then kick to have it restart. This is soomething thagt would not work here. > > > >>mutex_unlock(&vq->mutex); > >>return; > >>} > >> @@ -261,7 +235,7 @@ static void handle_tx(struct vhost_net *net) > >>vhost_disable_notify(&net->dev, vq); > >> > >>if (wmem < sock->sk->sk_sndbuf / 2) > >> - tx_poll_stop(net); > >> + vhost_poll_stop(poll); > >>hdr_size = vq->vhost_hlen; > >>zcopy = vq->ubufs; > >> > >> @@ -283,8 +257,10 @@ static void ha
Re: [PATCH v2 1/5] virtio: add functions for piecewise addition of buffers
Il 07/01/2013 01:02, Rusty Russell ha scritto: > Paolo Bonzini writes: >> Il 02/01/2013 06:03, Rusty Russell ha scritto: >>> Paolo Bonzini writes: The virtqueue_add_buf function has two limitations: 1) it requires the caller to provide all the buffers in a single call; 2) it does not support chained scatterlists: the buffers must be provided as an array of struct scatterlist; >>> >>> Chained scatterlists are a horrible interface, but that doesn't mean we >>> shouldn't support them if there's a need. >>> >>> I think I once even had a patch which passed two chained sgs, rather >>> than a combo sg and two length numbers. It's very old, but I've pasted >>> it below. >>> >>> Duplicating the implementation by having another interface is pretty >>> nasty; I think I'd prefer the chained scatterlists, if that's optimal >>> for you. >> >> Unfortunately, that cannot work because not all architectures support >> chained scatterlists. > > WHAT? I can't figure out what an arch needs to do to support this? It needs to use the iterator functions in its DMA driver. > All archs we care about support them, though, so I think we can ignore > this issue for now. Kind of... In principle all QEMU-supported arches can use virtio, and the speedup can be quite useful. And there is no Kconfig symbol for SG chains that I can use to disable virtio-scsi on unsupported arches. :/ Paolo >> (Also, as you mention chained scatterlists are horrible. They'd happen >> to work for virtio-scsi, but not for virtio-blk where the response >> status is part of the footer, not the header). > > We lost that debate 5 years ago, so we hack around it as needed. We can > add helpers to append if we need. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [Xen-devel] [PATCH v3 00/11] xen: Initial kexec/kdump implementation
On Mon, 2013-01-07 at 12:34 +, Daniel Kiper wrote: > I think that new kexec hypercall function should mimics kexec syscall. We want to have an interface that can be used by non-Linux domains (both dom0 and domU) as well though, so please bear this in mind. Historically we've not always been good at this when the hypercall interface is strongly tied to a particular guest implementation (in some sense this is the problem with the current kexec hypercall). Also what makes for a good syscall interface does not necessarily make for a good hypercall interface. Ian. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v3 02/11] x86/kexec: Add extra pointers to transition page table PGD, PUD, PMD and PTE
>>> On 07.01.13 at 13:52, Daniel Kiper wrote: > On Mon, Jan 07, 2013 at 09:48:20AM +, Jan Beulich wrote: >> >>> On 04.01.13 at 18:25, Daniel Kiper wrote: >> > Right, so where is virtual mapping of control page established? >> > I could not find relevant code in SLES kernel which does that. >> >> In the hypervisor (xen/arch/x86/machine_kexec.c:machine_kexec_load()). >> xen/arch/x86/machine_kexec.c:machine_kexec() then simply uses >> image->page_list[1]. > > This (xen/arch/x86/machine_kexec.c:machine_kexec_load()) maps relevant > page (allocated earlier by dom0) in hypervisor fixmap area. However, > it does not make relevant mapping in transition page table which > leads to crash when %cr3 is switched from Xen page table to > transition page table. That indeed could explain _random_ failures - the fixmap entries get created with _PAGE_GLOBAL set, i.e. don't get flushed with the CR3 write unless CR4.PGE is clear. And I don't see how your allocation of intermediate page tables would help: You wouldn't know where the mapping of the control page lives until you're actually in the early relocate_kernel code. Or was it that what distinguishes your cloned code from the native original? Jan ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v3 02/11] x86/kexec: Add extra pointers to transition page table PGD, PUD, PMD and PTE
On Mon, Jan 07, 2013 at 09:48:20AM +, Jan Beulich wrote: > >>> On 04.01.13 at 18:25, Daniel Kiper wrote: > > Right, so where is virtual mapping of control page established? > > I could not find relevant code in SLES kernel which does that. > > In the hypervisor (xen/arch/x86/machine_kexec.c:machine_kexec_load()). > xen/arch/x86/machine_kexec.c:machine_kexec() then simply uses > image->page_list[1]. This (xen/arch/x86/machine_kexec.c:machine_kexec_load()) maps relevant page (allocated earlier by dom0) in hypervisor fixmap area. However, it does not make relevant mapping in transition page table which leads to crash when %cr3 is switched from Xen page table to transition page table. Daniel ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [Xen-devel] [PATCH v3 00/11] xen: Initial kexec/kdump implementation
On Fri, Jan 04, 2013 at 02:11:46PM -0500, Konrad Rzeszutek Wilk wrote: > On Fri, Jan 04, 2013 at 06:07:51PM +0100, Daniel Kiper wrote: > > On Fri, Jan 04, 2013 at 02:41:17PM +, Jan Beulich wrote: > > > >>> On 04.01.13 at 15:22, Daniel Kiper wrote: > > > > On Wed, Jan 02, 2013 at 11:26:43AM +, Andrew Cooper wrote: > > > >> /sbin/kexec can load the "Xen" crash kernel itself by issuing > > > >> hypercalls using /dev/xen/privcmd. This would remove the need for > > > >> the dom0 kernel to distinguish between loading a crash kernel for > > > >> itself and loading a kernel for Xen. > > > >> > > > >> Or is this just a silly idea complicating the matter? > > > > > > > > This is impossible with current Xen kexec/kdump interface. > > > > > > Why? > > > > Because current KEXEC_CMD_kexec_load does not load kernel > > image and other things into Xen memory. It means that it > > should live somewhere in dom0 Linux kernel memory. > > We could have a very simple hypercall which would have: > > struct fancy_new_hypercall { > xen_pfn_t payload; // IN > ssize_t len; // IN > #define DATA (1<<1) > #define DATA_EOF (1<<2) > #define DATA_KERNEL (1<<3) > #define DATA_RAMDISK (1<<4) > unsigned int flags; // IN > unsigned int status; // OUT > }; > > which would in a loop just iterate over the payloads and > let the hypervisor stick it in the crashkernel space. > > This is all hand-waving of course. There probably would be a need > to figure out how much space you have in the reserved Xen's > 'crashkernel' memory region too. I think that new kexec hypercall function should mimics kexec syscall. It means that all arguments passed to hypercall should have same types if it is possible or if it is not possible then conversion should be done in very easy way. Additionally, I think that one call of new hypercall load function should load all needed thinks in right place and return relevant status. 
Last but not least, new functionality should be available through /dev/xen/privcmd or directly from kernel without bigger effort. Daniel ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [Xen-devel] [PATCH v3 00/11] xen: Initial kexec/kdump implementation
On Mon, 2013-01-07 at 10:46 +, Andrew Cooper wrote: > Given that /sbin/kexec creates a binary blob in memory, surely the most > simple thing is to get it to suitably mlock() the region and give a list > of VAs to the hypervisor. More than likely. The DOMID_KEXEC thing was just a random musing ;-) > This way, Xen can properly take care of what it does with information > and where. For example, at the moment, allowing dom0 to choose where > gets overwritten in the Xen crash area is a recipe for disaster if a > crash occurs midway through loading/reloading the crash kernel. That's true. I think there is a double buffering scheme in the current thing and we should preserve that in any new implementation. Ian. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [Xen-devel] [PATCH v3 00/11] xen: Initial kexec/kdump implementation
On 07/01/13 10:25, Ian Campbell wrote: On Fri, 2013-01-04 at 19:11 +, Konrad Rzeszutek Wilk wrote: On Fri, Jan 04, 2013 at 06:07:51PM +0100, Daniel Kiper wrote: Because current KEXEC_CMD_kexec_load does not load kernel image and other things into Xen memory. It means that it should live somewhere in dom0 Linux kernel memory. We could have a very simple hypercall which would have: struct fancy_new_hypercall { xen_pfn_t payload; // IN This would have to be XEN_GUEST_HANDLE(something) since userspace cannot figure out what pfns back its memory. In any case since the hypervisor is going to want to copy the data into the crashkernel space a virtual address is convenient to have. ssize_t len; // IN #define DATA (1<<1) #define DATA_EOF (1<<2) #define DATA_KERNEL (1<<3) #define DATA_RAMDISK (1<<4) unsigned int flags; // IN unsigned int status; // OUT }; which would in a loop just iterate over the payloads and let the hypervisor stick it in the crashkernel space. This is all hand-waving of course. There probably would be a need to figure out how much space you have in the reserved Xen's 'crashkernel' memory region too. This is probably a mad idea but it's Monday morning and I'm sleep deprived so I'll throw it out there... What about adding DOMID_KEXEC (similar DOMID_IO etc)? This would allow dom0 to map the kexec memory space with the usual privcmd mmap hypercalls and build things in it directly. OK, I suspect this might not be practical for a variety of reasons (lack of a p2m for such domains so no way to find out the list of mfns, dom0 userspace simply doesn't have sufficient context to write sensible things here, etc) but maybe someone has a better head on today... Ian. Given that /sbin/kexec creates a binary blob in memory, surely the most simple thing is to get it to suitably mlock() the region and give a list of VAs to the hypervisor. This way, Xen can properly take care of what it does with information and where. 
For example, at the moment, allowing dom0 to choose what gets overwritten in the Xen crash area is a recipe for disaster if a crash occurs midway through loading/reloading the crash kernel. ~Andrew ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [Xen-devel] [PATCH v3 00/11] xen: Initial kexec/kdump implementation
On Fri, 2013-01-04 at 19:11 +, Konrad Rzeszutek Wilk wrote: > On Fri, Jan 04, 2013 at 06:07:51PM +0100, Daniel Kiper wrote: > > On Fri, Jan 04, 2013 at 02:41:17PM +, Jan Beulich wrote: > > > >>> On 04.01.13 at 15:22, Daniel Kiper wrote: > > > > On Wed, Jan 02, 2013 at 11:26:43AM +, Andrew Cooper wrote: > > > >> /sbin/kexec can load the "Xen" crash kernel itself by issuing > > > >> hypercalls using /dev/xen/privcmd. This would remove the need for > > > >> the dom0 kernel to distinguish between loading a crash kernel for > > > >> itself and loading a kernel for Xen. > > > >> > > > >> Or is this just a silly idea complicating the matter? > > > > > > > > This is impossible with current Xen kexec/kdump interface. > > > > > > Why? > > > > Because current KEXEC_CMD_kexec_load does not load kernel > > image and other things into Xen memory. It means that it > > should live somewhere in dom0 Linux kernel memory. > > We could have a very simple hypercall which would have: > > struct fancy_new_hypercall { > xen_pfn_t payload; // IN This would have to be XEN_GUEST_HANDLE(something) since userspace cannot figure out what pfns back its memory. In any case since the hypervisor is going to want to copy the data into the crashkernel space a virtual address is convenient to have. > ssize_t len; // IN > #define DATA (1<<1) > #define DATA_EOF (1<<2) > #define DATA_KERNEL (1<<3) > #define DATA_RAMDISK (1<<4) > unsigned int flags; // IN > unsigned int status; // OUT > }; > > which would in a loop just iterate over the payloads and > let the hypervisor stick it in the crashkernel space. > > This is all hand-waving of course. There probably would be a need > to figure out how much space you have in the reserved Xen's > 'crashkernel' memory region too. This is probably a mad idea but it's Monday morning and I'm sleep deprived so I'll throw it out there... What about adding DOMID_KEXEC (similar DOMID_IO etc)? 
This would allow dom0 to map the kexec memory space with the usual privcmd mmap hypercalls and build things in it directly. OK, I suspect this might not be practical for a variety of reasons (lack of a p2m for such domains so no way to find out the list of mfns, dom0 userspace simply doesn't have sufficient context to write sensible things here, etc) but maybe someone has a better head on today... Ian. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v3 02/11] x86/kexec: Add extra pointers to transition page table PGD, PUD, PMD and PTE
>>> On 04.01.13 at 18:25, Daniel Kiper wrote: > Right, so where is virtual mapping of control page established? > I could not find relevant code in SLES kernel which does that. In the hypervisor (xen/arch/x86/machine_kexec.c:machine_kexec_load()). xen/arch/x86/machine_kexec.c:machine_kexec() then simply uses image->page_list[1]. Jan ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH V2 2/2] virtio-net: reset virtqueue affinity when doing cpu hotplug
On 01/07/2013 03:55 PM, Jason Wang wrote: > On 01/07/2013 03:48 PM, Wanlong Gao wrote: >> On 01/07/2013 03:28 PM, Jason Wang wrote: >>> On 01/07/2013 03:15 PM, Wanlong Gao wrote: Add a cpu notifier to virtio-net, so that we can reset the virtqueue affinity if the cpu hotplug happens. It improve the performance through enabling or disabling the virtqueue affinity after doing cpu hotplug. Adding the notifier block to virtnet_info is suggested by Jason, thank you. Cc: Rusty Russell Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: Eric Dumazet Cc: virtualization@lists.linux-foundation.org Cc: net...@vger.kernel.org Signed-off-by: Wanlong Gao --- drivers/net/virtio_net.c | 30 ++ 1 file changed, 30 insertions(+) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index b483fb5..9547b4c 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -26,6 +26,7 @@ #include #include #include +#include static int napi_weight = 128; module_param(napi_weight, int, 0444); @@ -123,6 +124,9 @@ struct virtnet_info { /* Does the affinity hint is set for virtqueues? */ bool affinity_hint_set; + + /* CPU hot plug notifier */ + struct notifier_block nb; }; struct skb_vnet_hdr { @@ -1051,6 +1055,23 @@ static void virtnet_set_affinity(struct virtnet_info *vi, bool set) } } +static int virtnet_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + struct virtnet_info *vi = container_of(nfb, struct virtnet_info, nb); + switch(action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + case CPU_DEAD: + case CPU_DEAD_FROZEN: + virtnet_set_affinity(vi, true); + break; + default: + break; + } + return NOTIFY_OK; +} + >>> I think you'd better fix the .ndo_select_queue() as well (as Michael >>> said in your V1) since it currently uses smp processor id which may not >>> work very well in this case also. >> The bug is we can't get the right txq if the CPU IDs are not consecutive, >> right? Do you have any good idea about fixing this? 
>> >> Thanks, >> Wanlong Gao > > The point is make the virtqueue private to a specific cpu when the > number of queue pairs is equal to the number of cpus. So after you bind > the vq affinity to a specific cpu, you'd better use the reverse mapping > of this affinity to do .ndo_select_queue(). One possible idea, as > Michael suggested, is a per-cpu structure to record the preferable > virtqueue and do both .ndo_select_queue() and affinity hint setting > based on this. Yeah, I think I got it now, will address it in V3. thank you. ;) Regards, Wanlong Gao >> >>> Thanks static void virtnet_get_ringparam(struct net_device *dev, struct ethtool_ringparam *ring) { @@ -1509,6 +1530,13 @@ static int virtnet_probe(struct virtio_device *vdev) } } + vi->nb.notifier_call = &virtnet_cpu_callback; + err = register_hotcpu_notifier(&vi->nb); + if (err) { + pr_debug("virtio_net: registering cpu notifier failed\n"); + goto free_recv_bufs; + } + /* Assume link up if device can't report link status, otherwise get link status from config. */ if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) { @@ -1553,6 +1581,8 @@ static void virtnet_remove(struct virtio_device *vdev) { struct virtnet_info *vi = vdev->priv; + unregister_hotcpu_notifier(&vi->nb); + /* Prevent config work handler from accessing the device. */ mutex_lock(&vi->config_lock); vi->config_enable = false; >>> > > ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization