[no subject]

2013-01-07 Thread Michael S. Tsirkin
On Sun, Jan 06, 2013 at 02:36:13PM +0800, Asias He wrote:
> This drops the cmd completion list spin lock and makes the cmd
> completion queue lock-less.
> 
> Signed-off-by: Asias He 


Nicholas, any feedback?

> ---
>  drivers/vhost/tcm_vhost.c | 46 +-
>  drivers/vhost/tcm_vhost.h |  2 +-
>  2 files changed, 14 insertions(+), 34 deletions(-)
> 
> diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/tcm_vhost.c
> index b20df5c..3720604 100644
> --- a/drivers/vhost/tcm_vhost.c
> +++ b/drivers/vhost/tcm_vhost.c
> @@ -47,6 +47,7 @@
>  #include 
>  #include  /* TODO vhost.h currently depends on this */
>  #include 
> +#include 
>  
>  #include "vhost.c"
>  #include "vhost.h"
> @@ -64,8 +65,7 @@ struct vhost_scsi {
>   struct vhost_virtqueue vqs[3];
>  
>   struct vhost_work vs_completion_work; /* cmd completion work item */
> - struct list_head vs_completion_list;  /* cmd completion queue */
> - spinlock_t vs_completion_lock;/* protects s_completion_list */
> + struct llist_head vs_completion_list; /* cmd completion queue */
>  };
>  
>  /* Local pointer to allocated TCM configfs fabric module */
> @@ -301,9 +301,7 @@ static void vhost_scsi_complete_cmd(struct tcm_vhost_cmd 
> *tv_cmd)
>  {
>   struct vhost_scsi *vs = tv_cmd->tvc_vhost;
>  
> - spin_lock_bh(&vs->vs_completion_lock);
> - list_add_tail(&tv_cmd->tvc_completion_list, &vs->vs_completion_list);
> - spin_unlock_bh(&vs->vs_completion_lock);
> + llist_add(&tv_cmd->tvc_completion_list, &vs->vs_completion_list);
>  
>   vhost_work_queue(&vs->dev, &vs->vs_completion_work);
>  }
> @@ -347,27 +345,6 @@ static void vhost_scsi_free_cmd(struct tcm_vhost_cmd 
> *tv_cmd)
>   kfree(tv_cmd);
>  }
>  
> -/* Dequeue a command from the completion list */
> -static struct tcm_vhost_cmd *vhost_scsi_get_cmd_from_completion(
> - struct vhost_scsi *vs)
> -{
> - struct tcm_vhost_cmd *tv_cmd = NULL;
> -
> - spin_lock_bh(&vs->vs_completion_lock);
> - if (list_empty(&vs->vs_completion_list)) {
> - spin_unlock_bh(&vs->vs_completion_lock);
> - return NULL;
> - }
> -
> - list_for_each_entry(tv_cmd, &vs->vs_completion_list,
> - tvc_completion_list) {
> - list_del(&tv_cmd->tvc_completion_list);
> - break;
> - }
> - spin_unlock_bh(&vs->vs_completion_lock);
> - return tv_cmd;
> -}
> -
>  /* Fill in status and signal that we are done processing this command
>   *
>   * This is scheduled in the vhost work queue so we are called with the owner
> @@ -377,12 +354,18 @@ static void vhost_scsi_complete_cmd_work(struct 
> vhost_work *work)
>  {
>   struct vhost_scsi *vs = container_of(work, struct vhost_scsi,
>   vs_completion_work);
> + struct virtio_scsi_cmd_resp v_rsp;
>   struct tcm_vhost_cmd *tv_cmd;
> + struct llist_node *llnode;
> + struct se_cmd *se_cmd;
> + int ret;
>  
> - while ((tv_cmd = vhost_scsi_get_cmd_from_completion(vs))) {
> - struct virtio_scsi_cmd_resp v_rsp;
> - struct se_cmd *se_cmd = &tv_cmd->tvc_se_cmd;
> - int ret;
> + llnode = llist_del_all(&vs->vs_completion_list);
> + while (llnode) {
> + tv_cmd = llist_entry(llnode, struct tcm_vhost_cmd,
> +  tvc_completion_list);
> + llnode = llist_next(llnode);
> + se_cmd = &tv_cmd->tvc_se_cmd;
>  
>   pr_debug("%s tv_cmd %p resid %u status %#02x\n", __func__,
>   tv_cmd, se_cmd->residual_count, se_cmd->scsi_status);
> @@ -426,7 +409,6 @@ static struct tcm_vhost_cmd *vhost_scsi_allocate_cmd(
>   pr_err("Unable to allocate struct tcm_vhost_cmd\n");
>   return ERR_PTR(-ENOMEM);
>   }
> - INIT_LIST_HEAD(&tv_cmd->tvc_completion_list);
>   tv_cmd->tvc_tag = v_req->tag;
>   tv_cmd->tvc_task_attr = v_req->task_attr;
>   tv_cmd->tvc_exp_data_len = exp_data_len;
> @@ -859,8 +841,6 @@ static int vhost_scsi_open(struct inode *inode, struct 
> file *f)
>   return -ENOMEM;
>  
>   vhost_work_init(&s->vs_completion_work, vhost_scsi_complete_cmd_work);
> - INIT_LIST_HEAD(&s->vs_completion_list);
> - spin_lock_init(&s->vs_completion_lock);
>  
>   s->vqs[VHOST_SCSI_VQ_CTL].handle_kick = vhost_scsi_ctl_handle_kick;
>   s->vqs[VHOST_SCSI_VQ_EVT].handle_kick = vhost_scsi_evt_handle_kick;
> diff --git a/drivers/vhost/tcm_vhost.h b/drivers/vhost/tcm_vhost.h
> index 7e87c63..47ee80b 100644
> --- a/drivers/vhost/tcm_vhost.h
> +++ b/drivers/vhost/tcm_vhost.h
> @@ -34,7 +34,7 @@ struct tcm_vhost_cmd {
>   /* Sense buffer that will be mapped into outgoing status */
>   unsigned char tvc_sense_buf[TRANSPORT_SENSE_BUFFER];
>   /* Completed commands list, serviced from vhost worker thread */
> - struct list_head tvc_completion_list;
> + struct llist_node tvc_completion_list;
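
For readers less familiar with the llist API, here is a minimal sketch of the lock-less pattern the patch switches to: producers push completed commands with llist_add(), and the single vhost worker drains the whole queue at once with llist_del_all(). The stub structure below is simplified for illustration and is not the driver's real tcm_vhost_cmd.

/*
 * Simplified sketch of the llist-based completion queue used above.
 * Producers may run concurrently; the single consumer drains the whole
 * list with one atomic xchg in llist_del_all().
 */
#include <linux/llist.h>

struct cmd_stub {
	struct llist_node completion_node;	/* lock-less list linkage */
	/* ... per-command state ... */
};

static struct llist_head completion_list;

/* Producer side, e.g. the response callback. */
static void stub_complete(struct cmd_stub *cmd)
{
	llist_add(&cmd->completion_node, &completion_list);
	/* then queue the vhost work item, as vhost_scsi_complete_cmd() does */
}

/* Consumer side, runs in the vhost worker. */
static void stub_completion_work(void)
{
	struct llist_node *llnode = llist_del_all(&completion_list);
	struct cmd_stub *cmd;

	while (llnode) {
		cmd = llist_entry(llnode, struct cmd_stub, completion_node);
		llnode = llist_next(llnode);
		/* ... fill in the virtio response and signal the guest ... */
	}
}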

Re: [PATCH v2 1/5] virtio: add functions for piecewise addition of buffers

2013-01-07 Thread Rusty Russell
Paolo Bonzini  writes:
> Il 07/01/2013 01:02, Rusty Russell ha scritto:
>> Paolo Bonzini  writes:
>>> Il 02/01/2013 06:03, Rusty Russell ha scritto:
 Paolo Bonzini  writes:
> The virtqueue_add_buf function has two limitations:
>
> 1) it requires the caller to provide all the buffers in a single call;
>
> 2) it does not support chained scatterlists: the buffers must be
> provided as an array of struct scatterlist;

 Chained scatterlists are a horrible interface, but that doesn't mean we
 shouldn't support them if there's a need.

 I think I once even had a patch which passed two chained sgs, rather
 than a combo sg and two length numbers.  It's very old, but I've pasted
 it below.

 Duplicating the implementation by having another interface is pretty
 nasty; I think I'd prefer the chained scatterlists, if that's optimal
 for you.
>>>
>>> Unfortunately, that cannot work because not all architectures support
>>> chained scatterlists.
>> 
>> WHAT?  I can't figure out what an arch needs to do to support this?
>
> It needs to use the iterator functions in its DMA driver.

But we don't care for virtio.

>> All archs we care about support them, though, so I think we can ignore
>> this issue for now.
>
> Kind of... In principle all QEMU-supported arches can use virtio, and
> the speedup can be quite useful.  And there is no Kconfig symbol for SG
> chains that I can use to disable virtio-scsi on unsupported arches. :/

Well, we #error if it's not supported.  Then the lazy architectures can
fix it.

Cheers,
Rusty.
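
For reference, a minimal sketch of what chained scatterlists look like with the generic lib/scatterlist helpers; it only illustrates the chaining mechanism under discussion (and assumes the architecture defines ARCH_HAS_SG_CHAIN), it is not a proposed virtio interface.

/*
 * Sketch: chain two scatterlist arrays so a consumer can walk them with
 * for_each_sg() as if they were one list.
 */
#include <linux/scatterlist.h>
#include <linux/mm.h>
#include <linux/printk.h>

static void sg_chain_example(struct page *hdr_page, struct page *data_page)
{
	struct scatterlist hdr[2];	/* last slot is consumed by the chain link */
	struct scatterlist data[1];
	struct scatterlist *sg;
	int i;

	sg_init_table(hdr, 2);
	sg_set_page(&hdr[0], hdr_page, 64, 0);

	sg_init_table(data, 1);
	sg_set_page(&data[0], data_page, PAGE_SIZE, 0);

	/* Turn hdr[1] into a chain entry pointing at the data list. */
	sg_chain(hdr, 2, data);

	/* Walk both arrays transparently; 2 is the total number of data entries. */
	for_each_sg(hdr, sg, 2, i)
		pr_info("sg %d: length %u\n", i, sg->length);
}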


Re: [Xen-devel] [PATCH v3 00/11] xen: Initial kexec/kdump implementation

2013-01-07 Thread David Vrabel
On 04/01/13 14:22, Daniel Kiper wrote:
> On Wed, Jan 02, 2013 at 11:26:43AM +, Andrew Cooper wrote:
>> On 27/12/12 18:02, Eric W. Biederman wrote:
>>> Andrew Cooper  writes:
>>>
 On 27/12/2012 07:53, Eric W. Biederman wrote:
> The syscall ABI still has the wrong semantics.
>
> Aka totally unmaintainable and unmergeable.
>
> The concept of domU support is also strange.  What does domU support even 
> mean, when the dom0 support is loading a kernel to pick up Xen when Xen 
> falls over?
 There are two requirements pulling at this patch series, but I agree
 that we need to clarify them.
>>> It probably make sense to split them apart a little even.
>>>
>>>
>>
>> Thinking about this split, there might be a way to simplify it even more.
>>
>> /sbin/kexec can load the "Xen" crash kernel itself by issuing
>> hypercalls using /dev/xen/privcmd.  This would remove the need for
>> the dom0 kernel to distinguish between loading a crash kernel for
>> itself and loading a kernel for Xen.
>>
>> Or is this just a silly idea complicating the matter?
> 
> This is impossible with current Xen kexec/kdump interface.
> It should be changed to do that. However, I suppose that
> Xen community would not be interested in such changes.

I don't see why the hypercall ABI cannot be extended with new sub-ops
that do the right thing -- the existing ABI is a bit weird.

I plan to start prototyping something shortly (hopefully next week) for
the Xen kexec case.

David


Re: [Xen-devel] [PATCH v3 00/11] xen: Initial kexec/kdump implementation

2013-01-07 Thread Konrad Rzeszutek Wilk
On Mon, Jan 07, 2013 at 01:34:04PM +0100, Daniel Kiper wrote:
> On Fri, Jan 04, 2013 at 02:11:46PM -0500, Konrad Rzeszutek Wilk wrote:
> > On Fri, Jan 04, 2013 at 06:07:51PM +0100, Daniel Kiper wrote:
> > > On Fri, Jan 04, 2013 at 02:41:17PM +, Jan Beulich wrote:
> > > > >>> On 04.01.13 at 15:22, Daniel Kiper  wrote:
> > > > > On Wed, Jan 02, 2013 at 11:26:43AM +, Andrew Cooper wrote:
> > > > >> /sbin/kexec can load the "Xen" crash kernel itself by issuing
> > > > >> hypercalls using /dev/xen/privcmd.  This would remove the need for
> > > > >> the dom0 kernel to distinguish between loading a crash kernel for
> > > > >> itself and loading a kernel for Xen.
> > > > >>
> > > > >> Or is this just a silly idea complicating the matter?
> > > > >
> > > > > This is impossible with current Xen kexec/kdump interface.
> > > >
> > > > Why?
> > >
> > > Because current KEXEC_CMD_kexec_load does not load kernel
> > > image and other things into Xen memory. It means that it
> > > should live somewhere in dom0 Linux kernel memory.
> >
> > We could have a very simple hypercall which would have:
> >
> > struct fancy_new_hypercall {
> > xen_pfn_t payload; // IN
> > ssize_t len; // IN
> > #define DATA (1<<1)
> > #define DATA_EOF (1<<2)
> > #define DATA_KERNEL (1<<3)
> > #define DATA_RAMDISK (1<<4)
> > unsigned int flags; // IN
> > unsigned int status; // OUT
> > };
> >
> > which would in a loop just iterate over the payloads and
> > let the hypervisor stick it in the crashkernel space.
> >
> > This is all hand-waving of course. There probably would be a need
> > to figure out how much space you have in the reserved Xen's
> > 'crashkernel' memory region too.
> 
> I think that the new kexec hypercall function should mimic the kexec syscall.
> It means that all arguments passed to the hypercall should have the same types
> if possible, or if that is not possible then the conversion should be done
> in a very easy way. Additionally, I think that one call of the new hypercall
> load function should load all needed things into the right place and
> return a relevant status. Last but not least, the new functionality should

We are not restricted to just _one_ hypercall. And this loading
thing could be similar to the microcode hypercall - which just points
to a virtual address along with the length - and says 'load me'.

> be available through /dev/xen/privcmd or directly from the kernel without
> much effort.

Perhaps we should have an email thread on xen-devel where we hash out
some ideas. Eric, would you be OK being included on this - it would make
sense for this mechanism to be as future-proof as possible - and I am not
sure what your plans for kexec are in the future?
> 
> Daniel
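
Purely to make the hand-waving above concrete, here is a rough sketch of how dom0 could feed an image to such a chunked-load interface. The struct follows Konrad's proposed layout (with a virtual address for the payload); the hypercall wrapper HYPERVISOR_kexec_load_chunk() and the chunk size are made-up names for illustration only.

/*
 * Hypothetical sketch only: the flags and struct mirror the hand-waved
 * proposal above; HYPERVISOR_kexec_load_chunk() does not exist.
 */
#define DATA		(1 << 1)
#define DATA_EOF	(1 << 2)
#define DATA_KERNEL	(1 << 3)
#define DATA_RAMDISK	(1 << 4)

#define LOAD_CHUNK_SIZE	(64 * 1024)	/* arbitrary illustration value */

struct fancy_new_hypercall {
	unsigned long payload;	/* IN: address of this chunk */
	long len;		/* IN: chunk length */
	unsigned int flags;	/* IN: DATA_* flags */
	unsigned int status;	/* OUT: hypervisor status */
};

extern int HYPERVISOR_kexec_load_chunk(struct fancy_new_hypercall *req); /* hypothetical */

static int load_image(const char *image, unsigned long size, unsigned int what)
{
	while (size) {
		unsigned long chunk = size > LOAD_CHUNK_SIZE ? LOAD_CHUNK_SIZE : size;
		struct fancy_new_hypercall req = {
			.payload = (unsigned long)image,
			.len	 = chunk,
			.flags	 = DATA | what | (size == chunk ? DATA_EOF : 0),
		};
		int rc = HYPERVISOR_kexec_load_chunk(&req);

		if (rc)
			return rc;
		if (req.status)
			return -1;	/* hypervisor reported an error */
		image += chunk;
		size  -= chunk;
	}
	return 0;
}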


Re: [PATCH] drivers/xen: avoid out-of-range write in xen_add_device

2013-01-07 Thread Jan Beulich
>>> On 07.01.13 at 16:08, Konrad Rzeszutek Wilk  wrote:
> On Sat, Jan 05, 2013 at 02:18:46PM -0500, Nickolai Zeldovich wrote:
>> xen_add_device() in drivers/xen/pci.c allocates a struct
>> physdev_pci_device_add on the stack and then writes to optarr[0].
>> The previous declaration of struct physdev_pci_device_add contained
>> a zero-length optarr[] array, presumably assuming it will be allocated
>> with kmalloc with a suitable number of trailing elements, but the code in
>> xen_add_device() as a result wrote past the end of the (stack-allocated)
>> data structure.
>> 
>> Since xen_add_device() is the only use of struct physdev_pci_device_add
>> in the kernel, turn optarr[] into a single-element array instead.
>> 
> 
> Lets include Jan and Xen-devel on this email - as this is also changing
> the official header that is used in Xen.

Correct - for that reason it is not the header that needs changing
here, but the consumer of the header. I have to admit that I find
it odd that the compiler allows automatic variables of variable size
types without warning - otherwise this wouldn't have gone
unnoticed.

Jan

>> Signed-off-by: Nickolai Zeldovich 
>> ---
>>  include/xen/interface/physdev.h |6 +-
>>  1 file changed, 1 insertion(+), 5 deletions(-)
>> 
>> diff --git a/include/xen/interface/physdev.h 
>> b/include/xen/interface/physdev.h
>> index 1844d31..24fd218 100644
>> --- a/include/xen/interface/physdev.h
>> +++ b/include/xen/interface/physdev.h
>> @@ -242,11 +242,7 @@ struct physdev_pci_device_add {
>>  uint8_t bus;
>>  uint8_t devfn;
>>  } physfn;
>> -#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
>> -uint32_t optarr[];
>> -#elif defined(__GNUC__)
>> -uint32_t optarr[0];
>> -#endif
>> +uint32_t optarr[1];
>>  };
>>  
>>  #define PHYSDEVOP_pci_device_remove 26
>> -- 
>> 1.7.10.4
>> 
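
To make the bug concrete: a struct whose last member is a flexible (or zero-length) array contributes no storage for that array, so writing optarr[0] on a stack-allocated instance goes past the end of the object. A stand-alone illustration (not the real Xen header), assuming 4-byte ints:

/* Illustration only; the field names merely echo the Xen struct. */
#include <stdio.h>

struct with_flex {
	unsigned int seg;
	unsigned int optarr[];		/* flexible array member: no storage */
};

struct with_one {
	unsigned int seg;
	unsigned int optarr[1];		/* one element actually allocated */
};

int main(void)
{
	struct with_flex a;		/* typically sizeof(a) == 4 */
	struct with_one b;		/* typically sizeof(b) == 8 */

	printf("%zu %zu\n", sizeof(a), sizeof(b));
	/* a.optarr[0] = 1;  would write past the end of 'a' on the stack */
	b.optarr[0] = 1;		/* fine: the element exists */
	return 0;
}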





Re: [PATCH] drivers/xen: avoid out-of-range write in xen_add_device

2013-01-07 Thread Konrad Rzeszutek Wilk
On Sat, Jan 05, 2013 at 02:18:46PM -0500, Nickolai Zeldovich wrote:
> xen_add_device() in drivers/xen/pci.c allocates a struct
> physdev_pci_device_add on the stack and then writes to optarr[0].
> The previous declaration of struct physdev_pci_device_add contained
> a zero-length optarr[] array, presumably assuming it will be allocated
> with kmalloc with a suitable number of trailing elements, but the code in
> xen_add_device() as a result wrote past the end of the (stack-allocated)
> data structure.
> 
> Since xen_add_device() is the only use of struct physdev_pci_device_add
> in the kernel, turn optarr[] into a single-element array instead.
> 

Lets include Jan and Xen-devel on this email - as this is also changing
the official header that is used in Xen.

> Signed-off-by: Nickolai Zeldovich 
> ---
>  include/xen/interface/physdev.h |6 +-
>  1 file changed, 1 insertion(+), 5 deletions(-)
> 
> diff --git a/include/xen/interface/physdev.h b/include/xen/interface/physdev.h
> index 1844d31..24fd218 100644
> --- a/include/xen/interface/physdev.h
> +++ b/include/xen/interface/physdev.h
> @@ -242,11 +242,7 @@ struct physdev_pci_device_add {
>  uint8_t bus;
>  uint8_t devfn;
>  } physfn;
> -#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
> -uint32_t optarr[];
> -#elif defined(__GNUC__)
> -uint32_t optarr[0];
> -#endif
> +uint32_t optarr[1];
>  };
>  
>  #define PHYSDEVOP_pci_device_remove 26
> -- 
> 1.7.10.4
> 


Re: [PATCH V3 2/2] vhost: handle polling errors

2013-01-07 Thread Jason Wang
On 01/07/2013 10:55 PM, Michael S. Tsirkin wrote:
> On Mon, Jan 07, 2013 at 12:38:17PM +0800, Jason Wang wrote:
>> On 01/06/2013 09:22 PM, Michael S. Tsirkin wrote:
>>> On Sun, Jan 06, 2013 at 03:18:38PM +0800, Jason Wang wrote:
 Polling errors were ignored by vhost/vhost_net; this may lead to a crash when
 trying to remove vhost from the waitqueue after the polling has failed. Solve
 this problem by:

 - checking the poll->wqh before trying to remove from waitqueue
 - report an error when poll() returns a POLLERR in vhost_start_poll()
 - report an error when vhost_start_poll() fails in
   vhost_vring_ioctl()/vhost_net_set_backend() which is used to notify the
   failure to userspace.
 - report an error in the data path in vhost_net when polling errors are met.

 After those changes, we can safely drop the tx polling state in vhost_net 
 since
 it was replaced by the checking of poll->wqh.

 Signed-off-by: Jason Wang 
 ---
  drivers/vhost/net.c   |   74 
 
  drivers/vhost/vhost.c |   31 +++-
  drivers/vhost/vhost.h |2 +-
  3 files changed, 49 insertions(+), 58 deletions(-)

 diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
 index d10ad6f..125c1e5 100644
 --- a/drivers/vhost/net.c
 +++ b/drivers/vhost/net.c
 @@ -64,20 +64,10 @@ enum {
VHOST_NET_VQ_MAX = 2,
  };
  
 -enum vhost_net_poll_state {
 -  VHOST_NET_POLL_DISABLED = 0,
 -  VHOST_NET_POLL_STARTED = 1,
 -  VHOST_NET_POLL_STOPPED = 2,
 -};
 -
  struct vhost_net {
struct vhost_dev dev;
struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
struct vhost_poll poll[VHOST_NET_VQ_MAX];
 -  /* Tells us whether we are polling a socket for TX.
 -   * We only do this when socket buffer fills up.
 -   * Protected by tx vq lock. */
 -  enum vhost_net_poll_state tx_poll_state;
/* Number of TX recently submitted.
 * Protected by tx vq lock. */
unsigned tx_packets;
 @@ -155,24 +145,6 @@ static void copy_iovec_hdr(const struct iovec *from, 
 struct iovec *to,
}
  }
  
 -/* Caller must have TX VQ lock */
 -static void tx_poll_stop(struct vhost_net *net)
 -{
 -  if (likely(net->tx_poll_state != VHOST_NET_POLL_STARTED))
 -  return;
 -  vhost_poll_stop(net->poll + VHOST_NET_VQ_TX);
 -  net->tx_poll_state = VHOST_NET_POLL_STOPPED;
 -}
 -
 -/* Caller must have TX VQ lock */
 -static void tx_poll_start(struct vhost_net *net, struct socket *sock)
 -{
 -  if (unlikely(net->tx_poll_state != VHOST_NET_POLL_STOPPED))
 -  return;
 -  vhost_poll_start(net->poll + VHOST_NET_VQ_TX, sock->file);
 -  net->tx_poll_state = VHOST_NET_POLL_STARTED;
 -}
 -
  /* In case of DMA done not in order in lower device driver for some 
 reason.
   * upend_idx is used to track end of used idx, done_idx is used to track 
 head
   * of used idx. Once lower device DMA done contiguously, we will signal 
 KVM
 @@ -227,6 +199,7 @@ static void vhost_zerocopy_callback(struct ubuf_info 
 *ubuf, bool success)
  static void handle_tx(struct vhost_net *net)
  {
struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
 +  struct vhost_poll *poll = net->poll + VHOST_NET_VQ_TX;
unsigned out, in, s;
int head;
struct msghdr msg = {
 @@ -252,7 +225,8 @@ static void handle_tx(struct vhost_net *net)
wmem = atomic_read(&sock->sk->sk_wmem_alloc);
if (wmem >= sock->sk->sk_sndbuf) {
mutex_lock(&vq->mutex);
 -  tx_poll_start(net, sock);
 +  if (vhost_poll_start(poll, sock->file))
 +  vq_err(vq, "Fail to start TX polling\n");
>>> s/Fail/Failed/
>>>
>>> A question though: how can this happen? Could you clarify please?
>>> Maybe we can find a way to prevent this error?
>> Two conditions under which I think this can happen:
>>
>> 1) a buggy userspace disables a queue through TUNSETQUEUE
>> 2) the net device is gone
>>
>> For 1, it looks like we can delay the disabling until the refcnt goes to
>> zero. For 2, more changes may be needed.
> I'd expect keeping a socket reference would prevent both issues.
> Doesn't it?

That doesn't work for 2, I think; the socket doesn't hold a refcnt on the
device, so the device can go away at any time. Although we could change
this, it's the behaviour from before multiqueue support.
>
>> Not sure it's worth doing this work;
>> maybe a warning is enough, just like other failures.
> With other failures, you can normally correct the error and then
> kick to have it restart. This is something that would not
> work here.

If userspace is written correctly (e.g. passing an fd with the correct state),
it can also be corrected.
>
mutex_unlock(&vq->mutex);
return;
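
For context, the shape of the fix being discussed is roughly the following: vhost_poll_start() reports a POLLERR returned by the initial poll() call, and both start and stop use poll->wqh to know whether the poll is actually armed. This is a simplified sketch following the changelog above, not the exact patch.

/* Sketch only; field names follow drivers/vhost/vhost.h, details trimmed. */
int vhost_poll_start(struct vhost_poll *poll, struct file *file)
{
	unsigned long mask;

	if (poll->wqh)			/* already started */
		return 0;

	/* poll_wait() via poll->table fills in poll->wqh on success */
	mask = file->f_op->poll(file, &poll->table);
	if (mask)
		vhost_poll_wakeup(&poll->wait, 0, 0, (void *)mask);
	if (mask & POLLERR) {
		if (poll->wqh)
			remove_wait_queue(poll->wqh, &poll->wait);
		poll->wqh = NULL;
		return -EINVAL;		/* let the caller report the failure */
	}
	return 0;
}

void vhost_poll_stop(struct vhost_poll *poll)
{
	if (poll->wqh) {		/* only detach if we really attached */
		remove_wait_queue(poll->wqh, &poll->wait);
		poll->wqh = NULL;
	}
}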
 

Re: [PATCH V3 2/2] vhost: handle polling errors

2013-01-07 Thread Michael S. Tsirkin
On Mon, Jan 07, 2013 at 12:38:17PM +0800, Jason Wang wrote:
> On 01/06/2013 09:22 PM, Michael S. Tsirkin wrote:
> > On Sun, Jan 06, 2013 at 03:18:38PM +0800, Jason Wang wrote:
> >> Polling errors were ignored by vhost/vhost_net; this may lead to a crash when
> >> trying to remove vhost from the waitqueue after the polling has failed. Solve
> >> this problem by:
> >>
> >> - checking the poll->wqh before trying to remove from waitqueue
> >> - report an error when poll() returns a POLLERR in vhost_start_poll()
> >> - report an error when vhost_start_poll() fails in
> >>   vhost_vring_ioctl()/vhost_net_set_backend() which is used to notify the
> >>   failure to userspace.
> >> - report an error in the data path in vhost_net when polling errors are met.
> >>
> >> After those changes, we can safely drop the tx polling state in vhost_net 
> >> since
> >> it was replaced by the checking of poll->wqh.
> >>
> >> Signed-off-by: Jason Wang 
> >> ---
> >>  drivers/vhost/net.c   |   74 
> >> 
> >>  drivers/vhost/vhost.c |   31 +++-
> >>  drivers/vhost/vhost.h |2 +-
> >>  3 files changed, 49 insertions(+), 58 deletions(-)
> >>
> >> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> >> index d10ad6f..125c1e5 100644
> >> --- a/drivers/vhost/net.c
> >> +++ b/drivers/vhost/net.c
> >> @@ -64,20 +64,10 @@ enum {
> >>VHOST_NET_VQ_MAX = 2,
> >>  };
> >>  
> >> -enum vhost_net_poll_state {
> >> -  VHOST_NET_POLL_DISABLED = 0,
> >> -  VHOST_NET_POLL_STARTED = 1,
> >> -  VHOST_NET_POLL_STOPPED = 2,
> >> -};
> >> -
> >>  struct vhost_net {
> >>struct vhost_dev dev;
> >>struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
> >>struct vhost_poll poll[VHOST_NET_VQ_MAX];
> >> -  /* Tells us whether we are polling a socket for TX.
> >> -   * We only do this when socket buffer fills up.
> >> -   * Protected by tx vq lock. */
> >> -  enum vhost_net_poll_state tx_poll_state;
> >>/* Number of TX recently submitted.
> >> * Protected by tx vq lock. */
> >>unsigned tx_packets;
> >> @@ -155,24 +145,6 @@ static void copy_iovec_hdr(const struct iovec *from, 
> >> struct iovec *to,
> >>}
> >>  }
> >>  
> >> -/* Caller must have TX VQ lock */
> >> -static void tx_poll_stop(struct vhost_net *net)
> >> -{
> >> -  if (likely(net->tx_poll_state != VHOST_NET_POLL_STARTED))
> >> -  return;
> >> -  vhost_poll_stop(net->poll + VHOST_NET_VQ_TX);
> >> -  net->tx_poll_state = VHOST_NET_POLL_STOPPED;
> >> -}
> >> -
> >> -/* Caller must have TX VQ lock */
> >> -static void tx_poll_start(struct vhost_net *net, struct socket *sock)
> >> -{
> >> -  if (unlikely(net->tx_poll_state != VHOST_NET_POLL_STOPPED))
> >> -  return;
> >> -  vhost_poll_start(net->poll + VHOST_NET_VQ_TX, sock->file);
> >> -  net->tx_poll_state = VHOST_NET_POLL_STARTED;
> >> -}
> >> -
> >>  /* In case of DMA done not in order in lower device driver for some 
> >> reason.
> >>   * upend_idx is used to track end of used idx, done_idx is used to track 
> >> head
> >>   * of used idx. Once lower device DMA done contiguously, we will signal 
> >> KVM
> >> @@ -227,6 +199,7 @@ static void vhost_zerocopy_callback(struct ubuf_info 
> >> *ubuf, bool success)
> >>  static void handle_tx(struct vhost_net *net)
> >>  {
> >>struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
> >> +  struct vhost_poll *poll = net->poll + VHOST_NET_VQ_TX;
> >>unsigned out, in, s;
> >>int head;
> >>struct msghdr msg = {
> >> @@ -252,7 +225,8 @@ static void handle_tx(struct vhost_net *net)
> >>wmem = atomic_read(&sock->sk->sk_wmem_alloc);
> >>if (wmem >= sock->sk->sk_sndbuf) {
> >>mutex_lock(&vq->mutex);
> >> -  tx_poll_start(net, sock);
> >> +  if (vhost_poll_start(poll, sock->file))
> >> +  vq_err(vq, "Fail to start TX polling\n");
> > s/Fail/Failed/
> >
> > A question though: how can this happen? Could you clarify please?
> > Maybe we can find a way to prevent this error?
> 
> Two conditions under which I think this can happen:
> 
> 1) a buggy userspace disables a queue through TUNSETQUEUE
> 2) the net device is gone
> 
> For 1, it looks like we can delay the disabling until the refcnt goes to
> zero. For 2, more changes may be needed.

I'd expect keeping a socket reference would prevent both issues.
Doesn't it?

> Not sure it's worth doing this work;
> maybe a warning is enough, just like other failures.

With other failures, you can normally correct the error and then
kick to have it restart. This is something that would not
work here.

> >
> >>mutex_unlock(&vq->mutex);
> >>return;
> >>}
> >> @@ -261,7 +235,7 @@ static void handle_tx(struct vhost_net *net)
> >>vhost_disable_notify(&net->dev, vq);
> >>  
> >>if (wmem < sock->sk->sk_sndbuf / 2)
> >> -  tx_poll_stop(net);
> >> +  vhost_poll_stop(poll);
> >>hdr_size = vq->vhost_hlen;
> >>zcopy = vq->ubufs;
> >>  
> >> @@ -283,8 +257,10 @@ static void ha

Re: [PATCH v2 1/5] virtio: add functions for piecewise addition of buffers

2013-01-07 Thread Paolo Bonzini
Il 07/01/2013 01:02, Rusty Russell ha scritto:
> Paolo Bonzini  writes:
>> Il 02/01/2013 06:03, Rusty Russell ha scritto:
>>> Paolo Bonzini  writes:
 The virtqueue_add_buf function has two limitations:

 1) it requires the caller to provide all the buffers in a single call;

 2) it does not support chained scatterlists: the buffers must be
 provided as an array of struct scatterlist;
>>>
>>> Chained scatterlists are a horrible interface, but that doesn't mean we
>>> shouldn't support them if there's a need.
>>>
>>> I think I once even had a patch which passed two chained sgs, rather
>>> than a combo sg and two length numbers.  It's very old, but I've pasted
>>> it below.
>>>
>>> Duplicating the implementation by having another interface is pretty
>>> nasty; I think I'd prefer the chained scatterlists, if that's optimal
>>> for you.
>>
>> Unfortunately, that cannot work because not all architectures support
>> chained scatterlists.
> 
> WHAT?  I can't figure out what an arch needs to do to support this?

It needs to use the iterator functions in its DMA driver.

> All archs we care about support them, though, so I think we can ignore
> this issue for now.

Kind of... In principle all QEMU-supported arches can use virtio, and
the speedup can be quite useful.  And there is no Kconfig symbol for SG
chains that I can use to disable virtio-scsi on unsupported arches. :/

Paolo

>> (Also, as you mention chained scatterlists are horrible.  They'd happen
>> to work for virtio-scsi, but not for virtio-blk where the response
>> status is part of the footer, not the header).
> 
> We lost that debate 5 years ago, so we hack around it as needed.  We can
> add helpers to append if we need.



Re: [Xen-devel] [PATCH v3 00/11] xen: Initial kexec/kdump implementation

2013-01-07 Thread Ian Campbell
On Mon, 2013-01-07 at 12:34 +, Daniel Kiper wrote:
> I think that the new kexec hypercall function should mimic the kexec syscall.

We want to have an interface that can be used by non-Linux domains (both dom0
and domU) as well though, so please bear this in mind.

Historically we've not always been good at this when the hypercall
interface is strongly tied to a particular guest implementation (in some
sense this is the problem with the current kexec hypercall).

Also what makes for a good syscall interface does not necessarily make
for a good hypercall interface.

Ian.



Re: [PATCH v3 02/11] x86/kexec: Add extra pointers to transition page table PGD, PUD, PMD and PTE

2013-01-07 Thread Jan Beulich
>>> On 07.01.13 at 13:52, Daniel Kiper  wrote:
> On Mon, Jan 07, 2013 at 09:48:20AM +, Jan Beulich wrote:
>> >>> On 04.01.13 at 18:25, Daniel Kiper  wrote:
>> > Right, so where is virtual mapping of control page established?
>> > I could not find relevant code in SLES kernel which does that.
>>
>> In the hypervisor (xen/arch/x86/machine_kexec.c:machine_kexec_load()).
>> xen/arch/x86/machine_kexec.c:machine_kexec() then simply uses
>> image->page_list[1].
> 
> This (xen/arch/x86/machine_kexec.c:machine_kexec_load()) maps the relevant
> page (allocated earlier by dom0) in the hypervisor fixmap area. However,
> it does not create the relevant mapping in the transition page table, which
> leads to a crash when %cr3 is switched from the Xen page table to the
> transition page table.

That indeed could explain _random_ failures - the fixmap entries
get created with _PAGE_GLOBAL set, i.e. don't get flushed with
the CR3 write unless CR4.PGE is clear.

And I don't see how your allocation of intermediate page tables
would help: You wouldn't know where the mapping of the control
page lives until you're actually in the early relocate_kernel code.
Or was that what distinguishes your cloned code from the
native original?

Jan
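
(For readers wondering why the CR3 write is not enough: global mappings are only dropped when CR4.PGE is toggled, which is what the kernel's full TLB flush does. A simplified sketch of that mechanism, assuming interrupts are already disabled:)

/* Sketch: a CR3 write leaves _PAGE_GLOBAL entries in the TLB; clearing
 * and restoring CR4.PGE flushes them, as __native_flush_tlb_global() does.
 */
static inline void flush_tlb_including_global(void)
{
	unsigned long cr4 = native_read_cr4();

	native_write_cr4(cr4 & ~X86_CR4_PGE);	/* dropping PGE flushes everything */
	native_write_cr4(cr4);			/* restore global-page support */
}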



Re: [PATCH v3 02/11] x86/kexec: Add extra pointers to transition page table PGD, PUD, PMD and PTE

2013-01-07 Thread Daniel Kiper
On Mon, Jan 07, 2013 at 09:48:20AM +, Jan Beulich wrote:
> >>> On 04.01.13 at 18:25, Daniel Kiper  wrote:
> > Right, so where is virtual mapping of control page established?
> > I could not find relevant code in SLES kernel which does that.
>
> In the hypervisor (xen/arch/x86/machine_kexec.c:machine_kexec_load()).
> xen/arch/x86/machine_kexec.c:machine_kexec() then simply uses
> image->page_list[1].

This (xen/arch/x86/machine_kexec.c:machine_kexec_load()) maps the relevant
page (allocated earlier by dom0) in the hypervisor fixmap area. However,
it does not create the relevant mapping in the transition page table, which
leads to a crash when %cr3 is switched from the Xen page table to the
transition page table.

Daniel


Re: [Xen-devel] [PATCH v3 00/11] xen: Initial kexec/kdump implementation

2013-01-07 Thread Daniel Kiper
On Fri, Jan 04, 2013 at 02:11:46PM -0500, Konrad Rzeszutek Wilk wrote:
> On Fri, Jan 04, 2013 at 06:07:51PM +0100, Daniel Kiper wrote:
> > On Fri, Jan 04, 2013 at 02:41:17PM +, Jan Beulich wrote:
> > > >>> On 04.01.13 at 15:22, Daniel Kiper  wrote:
> > > > On Wed, Jan 02, 2013 at 11:26:43AM +, Andrew Cooper wrote:
> > > >> /sbin/kexec can load the "Xen" crash kernel itself by issuing
> > > >> hypercalls using /dev/xen/privcmd.  This would remove the need for
> > > >> the dom0 kernel to distinguish between loading a crash kernel for
> > > >> itself and loading a kernel for Xen.
> > > >>
> > > >> Or is this just a silly idea complicating the matter?
> > > >
> > > > This is impossible with current Xen kexec/kdump interface.
> > >
> > > Why?
> >
> > Because current KEXEC_CMD_kexec_load does not load kernel
> > image and other things into Xen memory. It means that it
> > should live somewhere in dom0 Linux kernel memory.
>
> We could have a very simple hypercall which would have:
>
> struct fancy_new_hypercall {
>   xen_pfn_t payload; // IN
>   ssize_t len; // IN
> #define DATA (1<<1)
> #define DATA_EOF (1<<2)
> #define DATA_KERNEL (1<<3)
> #define DATA_RAMDISK (1<<4)
>   unsigned int flags; // IN
>   unsigned int status; // OUT
> };
>
> which would in a loop just iterate over the payloads and
> let the hypervisor stick it in the crashkernel space.
>
> This is all hand-waving of course. There probably would be a need
> to figure out how much space you have in the reserved Xen's
> 'crashkernel' memory region too.

I think that the new kexec hypercall function should mimic the kexec syscall.
It means that all arguments passed to the hypercall should have the same types
if possible, or if that is not possible then the conversion should be done
in a very easy way. Additionally, I think that one call of the new hypercall
load function should load all needed things into the right place and
return a relevant status. Last but not least, the new functionality should
be available through /dev/xen/privcmd or directly from the kernel without
much effort.

Daniel


Re: [Xen-devel] [PATCH v3 00/11] xen: Initial kexec/kdump implementation

2013-01-07 Thread Ian Campbell
On Mon, 2013-01-07 at 10:46 +, Andrew Cooper wrote:

> Given that /sbin/kexec creates a binary blob in memory, surely the most 
> simple thing is to get it to suitably mlock() the region and give a list 
> of VAs to the hypervisor.

More than likely. The DOMID_KEXEC thing was just a random musing ;-)

> This way, Xen can properly take care of what it does with the information 
> and where.  For example, at the moment, allowing dom0 to choose where 
> things get overwritten in the Xen crash area is a recipe for disaster if a 
> crash occurs midway through loading/reloading the crash kernel.

That's true. I think there is a double buffering scheme in the current
thing and we should preserve that in any new implementation.

Ian.



Re: [Xen-devel] [PATCH v3 00/11] xen: Initial kexec/kdump implementation

2013-01-07 Thread Andrew Cooper

On 07/01/13 10:25, Ian Campbell wrote:

On Fri, 2013-01-04 at 19:11 +, Konrad Rzeszutek Wilk wrote:

On Fri, Jan 04, 2013 at 06:07:51PM +0100, Daniel Kiper wrote:

Because current KEXEC_CMD_kexec_load does not load kernel
image and other things into Xen memory. It means that it
should live somewhere in dom0 Linux kernel memory.

We could have a very simple hypercall which would have:

struct fancy_new_hypercall {
xen_pfn_t payload; // IN

This would have to be XEN_GUEST_HANDLE(something) since userspace cannot
figure out what pfns back its memory. In any case since the hypervisor
is going to want to copy the data into the crashkernel space a virtual
address is convenient to have.


ssize_t len; // IN
#define DATA (1<<1)
#define DATA_EOF (1<<2)
#define DATA_KERNEL (1<<3)
#define DATA_RAMDISK (1<<4)
unsigned int flags; // IN
unsigned int status; // OUT
};

which would in a loop just iterate over the payloads and
let the hypervisor stick it in the crashkernel space.

This is all hand-waving of course. There probably would be a need
to figure out how much space you have in the reserved Xen's
'crashkernel' memory region too.

This is probably a mad idea but it's Monday morning and I'm sleep
deprived so I'll throw it out there...

What about adding DOMID_KEXEC (similar DOMID_IO etc)? This would allow
dom0 to map the kexec memory space with the usual privcmd mmap
hypercalls and build things in it directly.

OK, I suspect this might not be practical for a variety of reasons (lack
of a p2m for such domains so no way to find out the list of mfns, dom0
userspace simply doesn't have sufficient context to write sensible
things here, etc) but maybe someone has a better head on today...

Ian.



Given that /sbin/kexec creates a binary blob in memory, surely the most 
simple thing is to get it to suitably mlock() the region and give a list 
of VAs to the hypervisor.


This way, Xen can properly take care of what it does with the information 
and where.  For example, at the moment, allowing dom0 to choose where 
things get overwritten in the Xen crash area is a recipe for disaster if a 
crash occurs midway through loading/reloading the crash kernel.


~Andrew



Re: [Xen-devel] [PATCH v3 00/11] xen: Initial kexec/kdump implementation

2013-01-07 Thread Ian Campbell
On Fri, 2013-01-04 at 19:11 +, Konrad Rzeszutek Wilk wrote:
> On Fri, Jan 04, 2013 at 06:07:51PM +0100, Daniel Kiper wrote:
> > On Fri, Jan 04, 2013 at 02:41:17PM +, Jan Beulich wrote:
> > > >>> On 04.01.13 at 15:22, Daniel Kiper  wrote:
> > > > On Wed, Jan 02, 2013 at 11:26:43AM +, Andrew Cooper wrote:
> > > >> /sbin/kexec can load the "Xen" crash kernel itself by issuing
> > > >> hypercalls using /dev/xen/privcmd.  This would remove the need for
> > > >> the dom0 kernel to distinguish between loading a crash kernel for
> > > >> itself and loading a kernel for Xen.
> > > >>
> > > >> Or is this just a silly idea complicating the matter?
> > > >
> > > > This is impossible with current Xen kexec/kdump interface.
> > >
> > > Why?
> > 
> > Because current KEXEC_CMD_kexec_load does not load kernel
> > image and other things into Xen memory. It means that it
> > should live somewhere in dom0 Linux kernel memory.
> 
> We could have a very simple hypercall which would have:
> 
> struct fancy_new_hypercall {
>   xen_pfn_t payload; // IN

This would have to be XEN_GUEST_HANDLE(something) since userspace cannot
figure out what pfns back its memory. In any case since the hypervisor
is going to want to copy the data into the crashkernel space a virtual
address is convenient to have.

>   ssize_t len; // IN
> #define DATA (1<<1)
> #define DATA_EOF (1<<2)
> #define DATA_KERNEL (1<<3)
> #define DATA_RAMDISK (1<<4)
>   unsigned int flags; // IN
>   unsigned int status; // OUT
> };
> 
> which would in a loop just iterate over the payloads and
> let the hypervisor stick it in the crashkernel space.
> 
> This is all hand-waving of course. There probably would be a need
> to figure out how much space you have in the reserved Xen's
> 'crashkernel' memory region too.

This is probably a mad idea but it's Monday morning and I'm sleep
deprived so I'll throw it out there...

What about adding DOMID_KEXEC (similar DOMID_IO etc)? This would allow
dom0 to map the kexec memory space with the usual privcmd mmap
hypercalls and build things in it directly.

OK, I suspect this might not be practical for a variety of reasons (lack
of a p2m for such domains so no way to find out the list of mfns, dom0
userspace simply doesn't have sufficient context to write sensible
things here, etc) but maybe someone has a better head on today...

Ian.



Re: [PATCH v3 02/11] x86/kexec: Add extra pointers to transition page table PGD, PUD, PMD and PTE

2013-01-07 Thread Jan Beulich
>>> On 04.01.13 at 18:25, Daniel Kiper  wrote:
> Right, so where is virtual mapping of control page established?
> I could not find relevant code in SLES kernel which does that.

In the hypervisor (xen/arch/x86/machine_kexec.c:machine_kexec_load()).
xen/arch/x86/machine_kexec.c:machine_kexec() then simply uses
image->page_list[1].

Jan



Re: [PATCH V2 2/2] virtio-net: reset virtqueue affinity when doing cpu hotplug

2013-01-07 Thread Wanlong Gao
On 01/07/2013 03:55 PM, Jason Wang wrote:
> On 01/07/2013 03:48 PM, Wanlong Gao wrote:
>> On 01/07/2013 03:28 PM, Jason Wang wrote:
>>> On 01/07/2013 03:15 PM, Wanlong Gao wrote:
 Add a cpu notifier to virtio-net, so that we can reset the
 virtqueue affinity if cpu hotplug happens. It improves
 performance by enabling or disabling the virtqueue
 affinity after cpu hotplug.
 Adding the notifier block to virtnet_info was suggested by
 Jason, thank you.

 Cc: Rusty Russell 
 Cc: "Michael S. Tsirkin" 
 Cc: Jason Wang 
 Cc: Eric Dumazet 
 Cc: virtualization@lists.linux-foundation.org
 Cc: net...@vger.kernel.org
 Signed-off-by: Wanlong Gao 
 ---
  drivers/net/virtio_net.c | 30 ++
  1 file changed, 30 insertions(+)

 diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
 index b483fb5..9547b4c 100644
 --- a/drivers/net/virtio_net.c
 +++ b/drivers/net/virtio_net.c
 @@ -26,6 +26,7 @@
  #include 
  #include 
  #include 
 +#include 
  
  static int napi_weight = 128;
  module_param(napi_weight, int, 0444);
 @@ -123,6 +124,9 @@ struct virtnet_info {
  
/* Does the affinity hint is set for virtqueues? */
bool affinity_hint_set;
 +
 +  /* CPU hot plug notifier */
 +  struct notifier_block nb;
  };
  
  struct skb_vnet_hdr {
 @@ -1051,6 +1055,23 @@ static void virtnet_set_affinity(struct 
 virtnet_info *vi, bool set)
}
  }
  
 +static int virtnet_cpu_callback(struct notifier_block *nfb,
 +  unsigned long action, void *hcpu)
 +{
 +  struct virtnet_info *vi = container_of(nfb, struct virtnet_info, nb);
 +  switch(action) {
 +  case CPU_ONLINE:
 +  case CPU_ONLINE_FROZEN:
 +  case CPU_DEAD:
 +  case CPU_DEAD_FROZEN:
 +  virtnet_set_affinity(vi, true);
 +  break;
 +  default:
 +  break;
 +  }
 +  return NOTIFY_OK;
 +}
 +
>>> I think you'd better fix the .ndo_select_queue() as well (as Michael
>>> said in your V1) since it currently uses smp processor id which may not
>>> work very well in this case also.
>> The bug is we can't get the right txq if the CPU IDs are not consecutive,
>> right? Do you have any good idea about fixing this? 
>>
>> Thanks,
>> Wanlong Gao
> 
> The point is to make the virtqueue private to a specific cpu when the
> number of queue pairs is equal to the number of cpus. So after you bind
> the vq affinity to a specific cpu, you'd better use the reverse mapping
> of this affinity to do .ndo_select_queue(). One possible idea, as
> Michael suggested, is a per-cpu structure to record the preferable
> virtqueue and do both .ndo_select_queue() and affinity hint setting
> based on this.

Yeah, I think I got it now, will address it in V3. thank you. ;)

Regards,
Wanlong Gao

>>
>>> Thanks
  static void virtnet_get_ringparam(struct net_device *dev,
struct ethtool_ringparam *ring)
  {
 @@ -1509,6 +1530,13 @@ static int virtnet_probe(struct virtio_device *vdev)
}
}
  
 +  vi->nb.notifier_call = &virtnet_cpu_callback;
 +  err = register_hotcpu_notifier(&vi->nb);
 +  if (err) {
 +  pr_debug("virtio_net: registering cpu notifier failed\n");
 +  goto free_recv_bufs;
 +  }
 +
/* Assume link up if device can't report link status,
   otherwise get link status from config. */
if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
 @@ -1553,6 +1581,8 @@ static void virtnet_remove(struct virtio_device 
 *vdev)
  {
struct virtnet_info *vi = vdev->priv;
  
 +  unregister_hotcpu_notifier(&vi->nb);
 +
/* Prevent config work handler from accessing the device. */
mutex_lock(&vi->config_lock);
vi->config_enable = false;
>>>
> 
> 

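
To illustrate the per-cpu idea suggested above: record, per CPU, the index of the queue pair whose affinity hint was bound to that CPU, and consult it from .ndo_select_queue() instead of using smp_processor_id() directly. A rough sketch with hypothetical names (vq_index and the helpers are illustrative, not the final patch):

/* Rough sketch of the per-cpu "preferred queue" mapping; names are illustrative. */
#include <linux/types.h>
#include <linux/percpu.h>
#include <linux/smp.h>

struct virtnet_info_stub {
	int __percpu *vq_index;		/* preferred queue pair per CPU, -1 if unset */
	int curr_queue_pairs;
};

/* Called while setting affinity hints: remember the cpu -> queue mapping. */
static void record_queue_for_cpu(struct virtnet_info_stub *vi, int cpu, int qp)
{
	*per_cpu_ptr(vi->vq_index, cpu) = qp;
}

/* .ndo_select_queue() helper: use the reverse mapping of the affinity hints. */
static u16 select_txq(struct virtnet_info_stub *vi)
{
	int cpu = smp_processor_id();
	int txq = *per_cpu_ptr(vi->vq_index, cpu);

	if (txq < 0)			/* no affinity recorded: fall back */
		txq = cpu % vi->curr_queue_pairs;
	return txq;
}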