Re: [PATCH 2/2] vhost: handle polling failure

2012-12-27 Thread Wanlong Gao
On 12/27/2012 02:39 PM, Jason Wang wrote:
 Currently, polling errors were ignored in vhost. This may lead to some issues
 (e.g. kernel crash when passing a tap fd to vhost before calling TUNSETIFF).
 Fix this by:

Can this kernel crash be reproduced by hand?

Thanks,
Wanlong Gao

 
 - extend the idea of vhost_net_poll_state to all vhost_polls
 - change the state only when polling succeeds
 - make vhost_poll_start() report errors to the caller, which could be used
   by the caller or userspace.
 
 Signed-off-by: Jason Wang jasow...@redhat.com
 ---
  drivers/vhost/net.c   |   75 
 +
  drivers/vhost/vhost.c |   16 +-
  drivers/vhost/vhost.h |   11 ++-
  3 files changed, 50 insertions(+), 52 deletions(-)
 
 diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
 index 629d6b5..56e7f5a 100644
 --- a/drivers/vhost/net.c
 +++ b/drivers/vhost/net.c
 @@ -64,20 +64,10 @@ enum {
   VHOST_NET_VQ_MAX = 2,
  };
  
 -enum vhost_net_poll_state {
 - VHOST_NET_POLL_DISABLED = 0,
 - VHOST_NET_POLL_STARTED = 1,
 - VHOST_NET_POLL_STOPPED = 2,
 -};
 -
  struct vhost_net {
   struct vhost_dev dev;
   struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
   struct vhost_poll poll[VHOST_NET_VQ_MAX];
 - /* Tells us whether we are polling a socket for TX.
 -  * We only do this when socket buffer fills up.
 -  * Protected by tx vq lock. */
 - enum vhost_net_poll_state tx_poll_state;
   /* Number of TX recently submitted.
* Protected by tx vq lock. */
   unsigned tx_packets;
 @@ -155,24 +145,6 @@ static void copy_iovec_hdr(const struct iovec *from, 
 struct iovec *to,
   }
  }
  
 -/* Caller must have TX VQ lock */
 -static void tx_poll_stop(struct vhost_net *net)
 -{
  - if (likely(net->tx_poll_state != VHOST_NET_POLL_STARTED))
  - return;
  - vhost_poll_stop(net->poll + VHOST_NET_VQ_TX);
  - net->tx_poll_state = VHOST_NET_POLL_STOPPED;
 -}
 -
 -/* Caller must have TX VQ lock */
 -static void tx_poll_start(struct vhost_net *net, struct socket *sock)
 -{
  - if (unlikely(net->tx_poll_state != VHOST_NET_POLL_STOPPED))
  - return;
  - vhost_poll_start(net->poll + VHOST_NET_VQ_TX, sock->file);
  - net->tx_poll_state = VHOST_NET_POLL_STARTED;
 -}
 -
  /* In case of DMA done not in order in lower device driver for some reason.
   * upend_idx is used to track end of used idx, done_idx is used to track head
   * of used idx. Once lower device DMA done contiguously, we will signal KVM
 @@ -252,7 +224,7 @@ static void handle_tx(struct vhost_net *net)
   wmem = atomic_read(&sock->sk->sk_wmem_alloc);
   if (wmem >= sock->sk->sk_sndbuf) {
   mutex_lock(&vq->mutex);
  - tx_poll_start(net, sock);
  + vhost_poll_start(net->poll + VHOST_NET_VQ_TX, sock->file);
   mutex_unlock(&vq->mutex);
   return;
   }
 @@ -261,7 +233,7 @@ static void handle_tx(struct vhost_net *net)
   vhost_disable_notify(&net->dev, vq);
  
   if (wmem < sock->sk->sk_sndbuf / 2)
  - tx_poll_stop(net);
  + vhost_poll_stop(net->poll + VHOST_NET_VQ_TX);
   hdr_size = vq->vhost_hlen;
   zcopy = vq->ubufs;
  
 @@ -283,7 +255,8 @@ static void handle_tx(struct vhost_net *net)
  
   wmem = atomic_read(&sock->sk->sk_wmem_alloc);
   if (wmem >= sock->sk->sk_sndbuf * 3 / 4) {
  - tx_poll_start(net, sock);
  + vhost_poll_start(net->poll + VHOST_NET_VQ_TX,
  +  sock->file);
   set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
   break;
   }
 @@ -294,7 +267,8 @@ static void handle_tx(struct vhost_net *net)
   (vq->upend_idx - vq->done_idx) :
   (vq->upend_idx + UIO_MAXIOV - vq->done_idx);
   if (unlikely(num_pends > VHOST_MAX_PEND)) {
  - tx_poll_start(net, sock);
  + vhost_poll_start(net->poll + VHOST_NET_VQ_TX,
  +  sock->file);
   set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
   break;
   }
 @@ -360,7 +334,8 @@ static void handle_tx(struct vhost_net *net)
   }
   vhost_discard_vq_desc(vq, 1);
   if (err == -EAGAIN || err == -ENOBUFS)
 - tx_poll_start(net, sock);
  + vhost_poll_start(net->poll + VHOST_NET_VQ_TX,
  +  sock->file);
   break;
   }
   if (err != len)
 @@ -623,7 +598,6 @@ static int vhost_net_open(struct inode *inode, struct 
 file *f)
  
   vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev);
   

Re: [RFC PATCH] virtio-net: reset virtqueue affinity when doing cpu hotplug

2012-12-27 Thread Michael S. Tsirkin
On Thu, Dec 27, 2012 at 11:34:16AM +0800, Jason Wang wrote:
 On 12/26/2012 06:46 PM, Michael S. Tsirkin wrote:
  On Wed, Dec 26, 2012 at 03:06:54PM +0800, Wanlong Gao wrote:
   Add a cpu notifier to virtio-net, so that we can reset the
   virtqueue affinity if cpu hotplug happens. It improves
   the performance by enabling or disabling the virtqueue
   affinity after cpu hotplug.
 
  Cc: Rusty Russell ru...@rustcorp.com.au
  Cc: Michael S. Tsirkin m...@redhat.com
  Cc: Jason Wang jasow...@redhat.com
  Cc: virtualization@lists.linux-foundation.org
  Cc: net...@vger.kernel.org
  Signed-off-by: Wanlong Gao gaowanl...@cn.fujitsu.com
  Thanks for looking into this.
  Some comments:
 
  1. Looks like the logic in
  virtnet_set_affinity (and in virtnet_select_queue)
  will not work very well when CPU IDs are not
   consecutive. This can happen with hot unplug.
  
   Maybe we should add a VQ allocator, and define
   a per-cpu variable specifying the VQ instead
  of using CPU ID.
 
 Yes, and generate the affinity hint based on the mapping. Btw, what does
 VQ allocator mean here?

Some logic to generate CPU to VQ mapping.
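
A hypothetical sketch of such a mapping (the helper name, the per-cpu slot and
the max_queue_pairs field are illustrative assumptions, not code from the patch):

/* Hypothetical sketch of a CPU-to-VQ map; names (virtnet_assign_vqs,
 * vq_index) are made up for illustration only. */
#include <linux/percpu.h>
#include <linux/cpumask.h>

static DEFINE_PER_CPU(int, vq_index) = -1;

static void virtnet_assign_vqs(struct virtnet_info *vi)
{
	int cpu, idx = 0;

	/* CPU IDs may be sparse after hot unplug, so walk the online
	 * mask and hand out queue indexes consecutively, wrapping at
	 * the number of queue pairs. */
	for_each_online_cpu(cpu) {
		per_cpu(vq_index, cpu) = idx;
		idx = (idx + 1) % vi->max_queue_pairs;
	}
}

/* virtnet_select_queue() would then read this_cpu_read(vq_index)
 * instead of using smp_processor_id() as the queue number. */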

 
 
  2. The code below seems racy, e.g. when a CPU is added
  during device init.
 
  3. Using a global cpu_hotplug seems inelegant.
  In any case we should document the
  meaning of this variable.
 
  ---
   drivers/net/virtio_net.c | 39 ++-
   1 file changed, 38 insertions(+), 1 deletion(-)
 
  diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
  index a6fcf15..9710cf4 100644
  --- a/drivers/net/virtio_net.c
  +++ b/drivers/net/virtio_net.c
  @@ -26,6 +26,7 @@
    #include <linux/scatterlist.h>
    #include <linux/if_vlan.h>
    #include <linux/slab.h>
   +#include <linux/cpu.h>
   
   static int napi_weight = 128;
   module_param(napi_weight, int, 0444);
  @@ -34,6 +35,8 @@ static bool csum = true, gso = true;
   module_param(csum, bool, 0444);
   module_param(gso, bool, 0444);
   
  +static bool cpu_hotplug = false;
  +
   /* FIXME: MTU in config. */
   #define MAX_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
   #define GOOD_COPY_LEN 128
  @@ -1041,6 +1044,26 @@ static void virtnet_set_affinity(struct 
  virtnet_info *vi, bool set)
  vi->affinity_hint_set = false;
   }
   
  +static int virtnet_cpu_callback(struct notifier_block *nfb,
  + unsigned long action, void *hcpu)
  +{
  +  switch(action) {
  +  case CPU_ONLINE:
  +  case CPU_ONLINE_FROZEN:
  +  case CPU_DEAD:
  +  case CPU_DEAD_FROZEN:
  +  cpu_hotplug = true;
  +  break;
  +  default:
  +  break;
  +  }
  +  return NOTIFY_OK;
  +}
  +
  +static struct notifier_block virtnet_cpu_notifier = {
  +  .notifier_call = virtnet_cpu_callback,
  +};
  +
   static void virtnet_get_ringparam(struct net_device *dev,
 struct ethtool_ringparam *ring)
   {
  @@ -1131,7 +1154,14 @@ static int virtnet_change_mtu(struct net_device 
  *dev, int new_mtu)
*/
   static u16 virtnet_select_queue(struct net_device *dev, struct sk_buff 
  *skb)
   {
  -  int txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) :
  +  int txq;
  +
  +  if (unlikely(cpu_hotplug == true)) {
  +  virtnet_set_affinity(netdev_priv(dev), true);
  +  cpu_hotplug = false;
  +  }
  +
  +  txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) :
   smp_processor_id();
   
  while (unlikely(txq >= dev->real_num_tx_queues))
  @@ -1248,6 +1278,8 @@ static void virtnet_del_vqs(struct virtnet_info *vi)
   {
  struct virtio_device *vdev = vi->vdev;
    
   +  unregister_hotcpu_notifier(&virtnet_cpu_notifier);
   +
  virtnet_set_affinity(vi, false);
    
  vdev->config->del_vqs(vdev);
  @@ -1372,6 +1404,11 @@ static int init_vqs(struct virtnet_info *vi)
 goto err_free;
   
 virtnet_set_affinity(vi, true);
  +
   +  ret = register_hotcpu_notifier(&virtnet_cpu_notifier);
  +  if (ret)
  +  goto err_free;
  +
 return 0;
   
   err_free:
  -- 
  1.8.0


Re: [PATCH 1/2] vhost_net: correct error handling in vhost_net_set_backend()

2012-12-27 Thread Michael S. Tsirkin
On Thu, Dec 27, 2012 at 02:39:20PM +0800, Jason Wang wrote:
 Currently, polling errors were ignored in vhost. This may lead to some issues
 (e.g. kernel crash when passing a tap fd to vhost before calling TUNSETIFF).
 Fix this by:
 
 - extend the idea of vhost_net_poll_state to all vhost_polls
 - change the state only when polling succeeds
 - make vhost_poll_start() report errors to the caller, which could be used
   by the caller or userspace.

Maybe it could but this patch just ignores these errors.
And it's not clear how userspace would handle these errors.
Also, since we have a reference on the fd, it would seem
that once poll succeeds it can't fail in the future.

So two other options would make more sense to me:
- if vhost is bound to tun without SETIFF, fail this immediately
- if vhost is bound to tun without SETIFF, defer polling
  until SETIFF

Option 1 would seem much easier to implement, I think it's
preferable.
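
A minimal sketch of what option 1 could look like, assuming vhost_poll_start()
is changed to return an int and vhost_net_enable_vq() simply propagates it;
neither signature is taken from the patch as posted:

/* Sketch only: assumes vhost_poll_start() returns an error when
 * f_op->poll reports POLLERR (e.g. a tap fd with no TUNSETIFF yet). */
static int vhost_net_enable_vq(struct vhost_net *n,
			       struct vhost_virtqueue *vq)
{
	struct vhost_poll *poll = n->poll + (vq - n->vqs);
	struct socket *sock;

	sock = rcu_dereference_protected(vq->private_data,
					 lockdep_is_held(&vq->mutex));
	if (!sock)
		return 0;

	/* Refuse to attach the backend if polling cannot be started, so
	 * VHOST_NET_SET_BACKEND fails immediately instead of leaving a
	 * half-initialized fd behind. */
	return vhost_poll_start(poll, sock->file);
}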

 
 Signed-off-by: Jason Wang jasow...@redhat.com
 ---
  drivers/vhost/net.c   |   75 
 +
  drivers/vhost/vhost.c |   16 +-
  drivers/vhost/vhost.h |   11 ++-
  3 files changed, 50 insertions(+), 52 deletions(-)
 
 diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
 index 629d6b5..56e7f5a 100644
 --- a/drivers/vhost/net.c
 +++ b/drivers/vhost/net.c
 @@ -64,20 +64,10 @@ enum {
   VHOST_NET_VQ_MAX = 2,
  };
  
 -enum vhost_net_poll_state {
 - VHOST_NET_POLL_DISABLED = 0,
 - VHOST_NET_POLL_STARTED = 1,
 - VHOST_NET_POLL_STOPPED = 2,
 -};
 -
  struct vhost_net {
   struct vhost_dev dev;
   struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
   struct vhost_poll poll[VHOST_NET_VQ_MAX];
 - /* Tells us whether we are polling a socket for TX.
 -  * We only do this when socket buffer fills up.
 -  * Protected by tx vq lock. */
 - enum vhost_net_poll_state tx_poll_state;
   /* Number of TX recently submitted.
* Protected by tx vq lock. */
   unsigned tx_packets;
 @@ -155,24 +145,6 @@ static void copy_iovec_hdr(const struct iovec *from, 
 struct iovec *to,
   }
  }
  
 -/* Caller must have TX VQ lock */
 -static void tx_poll_stop(struct vhost_net *net)
 -{
  - if (likely(net->tx_poll_state != VHOST_NET_POLL_STARTED))
  - return;
  - vhost_poll_stop(net->poll + VHOST_NET_VQ_TX);
  - net->tx_poll_state = VHOST_NET_POLL_STOPPED;
 -}
 -
 -/* Caller must have TX VQ lock */
 -static void tx_poll_start(struct vhost_net *net, struct socket *sock)
 -{
  - if (unlikely(net->tx_poll_state != VHOST_NET_POLL_STOPPED))
  - return;
  - vhost_poll_start(net->poll + VHOST_NET_VQ_TX, sock->file);
  - net->tx_poll_state = VHOST_NET_POLL_STARTED;
 -}
 -
  /* In case of DMA done not in order in lower device driver for some reason.
   * upend_idx is used to track end of used idx, done_idx is used to track head
   * of used idx. Once lower device DMA done contiguously, we will signal KVM
 @@ -252,7 +224,7 @@ static void handle_tx(struct vhost_net *net)
   wmem = atomic_read(&sock->sk->sk_wmem_alloc);
   if (wmem >= sock->sk->sk_sndbuf) {
   mutex_lock(&vq->mutex);
  - tx_poll_start(net, sock);
  + vhost_poll_start(net->poll + VHOST_NET_VQ_TX, sock->file);
   mutex_unlock(&vq->mutex);
   return;
   }
 @@ -261,7 +233,7 @@ static void handle_tx(struct vhost_net *net)
   vhost_disable_notify(&net->dev, vq);
  
   if (wmem < sock->sk->sk_sndbuf / 2)
  - tx_poll_stop(net);
  + vhost_poll_stop(net->poll + VHOST_NET_VQ_TX);
   hdr_size = vq->vhost_hlen;
   zcopy = vq->ubufs;
  
 @@ -283,7 +255,8 @@ static void handle_tx(struct vhost_net *net)
  
   wmem = atomic_read(&sock->sk->sk_wmem_alloc);
   if (wmem >= sock->sk->sk_sndbuf * 3 / 4) {
  - tx_poll_start(net, sock);
  + vhost_poll_start(net->poll + VHOST_NET_VQ_TX,
  +  sock->file);
   set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
   break;
   }
 @@ -294,7 +267,8 @@ static void handle_tx(struct vhost_net *net)
   (vq->upend_idx - vq->done_idx) :
   (vq->upend_idx + UIO_MAXIOV - vq->done_idx);
   if (unlikely(num_pends > VHOST_MAX_PEND)) {
  - tx_poll_start(net, sock);
  + vhost_poll_start(net->poll + VHOST_NET_VQ_TX,
  +  sock->file);
   set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
   break;
   }
 @@ -360,7 +334,8 @@ static void handle_tx(struct vhost_net *net)
   }
   vhost_discard_vq_desc(vq, 1);
   if (err == -EAGAIN || err == -ENOBUFS)
 -

Re: [PATCH v3 00/11] xen: Initial kexec/kdump implementation

2012-12-27 Thread Eric W. Biederman
Andrew Cooper andrew.coop...@citrix.com writes:

 On 27/12/2012 07:53, Eric W. Biederman wrote:
 The syscall ABI still has the wrong semantics.

 Aka totally unmaintainable and unmergeable.

 The concept of domU support is also strange.  What does domU support even 
 mean, when the dom0 support is loading a kernel to pick up Xen when Xen 
 falls over.

 There are two requirements pulling at this patch series, but I agree
 that we need to clarify them.

It probably makes sense to split them apart a little even.

 When dom0 loads a crash kernel, it is loading one for Xen to use.  As a
 dom0 crash causes a Xen crash, having dom0 set up a kdump kernel for
 itself is completely useless.  This ability is present in classic Xen
 dom0 kernels, but the feature is currently missing in PVOPS.

 Many cloud customers and service providers want the ability for a VM
 administrator to be able to load a kdump/kexec kernel within a
 domain[1].  This allows the VM administrator to take more proactive
 steps to isolate the cause of a crash, the state of which is most likely
 discarded while tearing down the domain.  The result being that as far
 as Xen is concerned, the domain is still alive, while the kdump
 kernel/environment can work its usual magic.  I am not aware of any
 feature like this existing in the past.

Which makes domU support semantically just the normal kexec/kdump
support.  Got it.

The point of implementing domU is for those times when the hypervisor
admin and the kernel admin are different.

For domU support modifying or adding alternate versions of
machine_kexec.c and relocate_kernel.S to add paravirtualization support
makes sense.

There is the practical argument that for implementation efficiency of
crash dumps it would be better if that support came from the hypervisor
or the hypervisor environment.  But this gets into the practical reality
that the hypervisor environment does not do that today.  Furthermore
kexec all by itself working in a paravirtualized environment under Xen
makes sense.

domU support is what Peter was worrying about for cleanliness, and
we need some x86 backend ops there, and generally to be careful.


For dom0 support we need to extend the kexec_load system call, and
get it right.

When we are done I expect both dom0 and domU support of kexec to work
in dom0.  I don't know if the normal kexec or kdump case will ever make
sense in dom0 but there is no reason for that case to be broken.

 ~Andrew

 [1] http://lists.xen.org/archives/html/xen-devel/2012-11/msg01274.html

Eric


Re: [PATCH v3 09/11] x86/xen/enlighten: Add init and crash kexec/kdump hooks

2012-12-27 Thread H. Peter Anvin

On 12/26/2012 06:18 PM, Daniel Kiper wrote:

Add init and crash kexec/kdump hooks.

Signed-off-by: Daniel Kiper daniel.ki...@oracle.com
---
  arch/x86/xen/enlighten.c |   11 +++
  1 files changed, 11 insertions(+), 0 deletions(-)


On the general issue of hooks:

Hooks need their pre- and post-semantics extremely well documented, lest
they end up being an impossible roadblock to development.


-hpa

--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.



Re: [PATCH v3 02/11] x86/kexec: Add extra pointers to transition page table PGD, PUD, PMD and PTE

2012-12-27 Thread Daniel Kiper
 Hmm... this code is being redone at the moment... this might conflict.

Is this available somewhere? May I have a look at it?

Daniel

PS I am on holiday until 02/01/2013 and may not
   have access to my email. Please be patient.
   In the worst case I will reply when I am
   back at the office.


Re: [PATCH v3 06/11] x86/xen: Add i386 kexec/kdump implementation

2012-12-27 Thread Daniel Kiper
 On 12/26/2012 06:18 PM, Daniel Kiper wrote:
  Add i386 kexec/kdump implementation.
 
  v2 - suggestions/fixes:
  - allocate transition page table pages below 4 GiB
(suggested by Jan Beulich).

 Why?

Sadly all addresses are passed via an unsigned long
variable to the kexec hypercall.

Daniel


Re: [PATCH v3 06/11] x86/xen: Add i386 kexec/kdump implementation

2012-12-27 Thread H. Peter Anvin

On 12/27/2012 03:23 PM, Daniel Kiper wrote:

On 12/26/2012 06:18 PM, Daniel Kiper wrote:

Add i386 kexec/kdump implementation.

v2 - suggestions/fixes:
 - allocate transition page table pages below 4 GiB
   (suggested by Jan Beulich).


Why?


Sadly all addresses are passed via an unsigned long
variable to the kexec hypercall.



So can you unf*ck your broken interface before imposing it on anyone else?

-hpa

--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.



Re: [PATCH v3 00/11] xen: Initial kexec/kdump implementation

2012-12-27 Thread Daniel Kiper
 On 12/26/2012 06:18 PM, Daniel Kiper wrote:
  Hi,
 
  This set of patches contains the initial kexec/kdump implementation for Xen v3.
  Currently only dom0 is supported; however, almost all infrastructure
  required for domU support is ready.
 
  Jan Beulich suggested merging the Xen x86 assembler code with the baremetal
  x86 code. This could simplify and slightly reduce the size of the kernel code.
  However, this solution requires some changes in the baremetal x86 code. First
  of all, the code which establishes the transition page table should be moved
  back from machine_kexec_$(BITS).c to relocate_kernel_$(BITS).S. Another
  important thing which should be changed in that case is the format of the
  page_list array. The Xen kexec hypercall requires alternating physical
  addresses with virtual ones. These and other required changes have not been
  done in this version because I am not sure that solution will be accepted by
  the kexec/kdump maintainers. I hope that this email sparks discussion about
  that topic.

 I want a detailed list of the constraints that this assumes and 
 therefore imposes on the native implementation as a result of this.  We 
 have had way too many patches where Xen PV hacks effectively nailgun 
 arbitrary, and sometimes poor, design decisions in place and now we 
 can't fix them.

OK but now I think that we should leave this discussion
until all details regarding the generic kexec/kdump code
are agreed. Sorry for that.

Daniel


Re: [PATCH v3 01/11] kexec: introduce kexec firmware support

2012-12-27 Thread Daniel Kiper
 Daniel Kiper daniel.ki...@oracle.com writes:

  Some kexec/kdump implementations (e.g. Xen PVOPS) cannot use the default
  Linux infrastructure and require some support from firmware and/or
  hypervisor.
  To cope with that problem the kexec firmware infrastructure was introduced.
  It allows a developer to use all kexec/kdump features of a given firmware
  or hypervisor.

 As this stands this patch is wrong.

 You need to pass an additional flag from userspace through /sbin/kexec
 that says load the kexec image in the firmware.  A global variable here
 is not ok.

 As I understand it you are loading a kexec on xen panic image.  Which
 is semantically different from a kexec on linux panic image.  It is not
 ok to have a silly global variable kexec_use_firmware.

Earlier we agreed that /sbin/kexec should call the kexec syscall with a
special flag. However, during work on the Xen kexec/kdump v3 patch
I found that this is insufficient because e.g. crash_kexec()
should also execute different code when firmware support is in use.
Sadly the syscall does not save this flag anywhere. Additionally, I found
that the kernel itself has the best knowledge of which code path should be
used (firmware or plain Linux). If this decision is left to userspace
then a simple kexec syscall could crash the system in the worst case (e.g. when
plain Linux kexec is used in a case where firmware kexec should be used).
However, if you wish I could add this flag to the syscall. Additionally, I could
add a function which enables firmware support; then the kexec_use_firmware
variable would be global only in the kexec.c module.

 Furthermore it is not ok to have a conditional
 code outside of header files.

I agree, but how should we dispatch execution e.g. in crash_kexec()
if we would like (I suppose) to compile kexec firmware
support conditionally?

Daniel


Re: [PATCH v3 00/11] xen: Initial kexec/kdump implementation

2012-12-27 Thread Daniel Kiper
 Andrew Cooper andrew.coop...@citrix.com writes:

  On 27/12/2012 07:53, Eric W. Biederman wrote:
  The syscall ABI still has the wrong semantics.
 
  Aka totally unmaintainable and umergeable.
 
  The concept of domU support is also strange.  What does domU support even 
  mean, when the dom0  support is loading a kernel to pick up Xen when Xen 
  falls over.
 
  There are two requirements pulling at this patch series, but I agree
  that we need to clarify them.

 It probably makes sense to split them apart a little even.

  When dom0 loads a crash kernel, it is loading one for Xen to use.  As a
  dom0 crash causes a Xen crash, having dom0 set up a kdump kernel for
  itself is completely useless.  This ability is present in classic Xen
  dom0 kernels, but the feature is currently missing in PVOPS.

  Many cloud customers and service providers want the ability for a VM
  administrator to be able to load a kdump/kexec kernel within a
  domain[1].  This allows the VM administrator to take more proactive
  steps to isolate the cause of a crash, the state of which is most likely
  discarded while tearing down the domain.  The result being that as far
  as Xen is concerned, the domain is still alive, while the kdump
  kernel/environment can work its usual magic.  I am not aware of any
  feature like this existing in the past.

 Which makes domU support semantically just the normal kexec/kdump
 support.  Got it.

To some extent. It is true for HVM and PVonHVM guests. However,
PV guests require a slightly different kexec/kdump implementation
than plain kexec/kdump. The proposed firmware support has almost
all required features. The few PV-guest-specific features will
be added later (after the generic firmware support, which is
sufficient at least for dom0, is agreed).

It looks like I should replace domU with PV guest in the patch description.

 The point of implementing domU is for those times when the hypervisor
 admin and the kernel admin are different.

Right.

 For domU support modifying or adding alternate versions of
 machine_kexec.c and relocate_kernel.S to add paravirtualization support
 makes sense.

It is not sufficient. Please look above.

 There is the practical argument that for implementation efficiency of
 crash dumps it would be better if that support came from the hypervisor
 or the hypervisor environment.  But this gets into the practical reality

I am thinking about that.

 that the hypervisor environment does not do that today.  Furthermore
 kexec all by itself working in a paravirtualized environment under Xen
 makes sense.

 domU support is what Peter was worrying about for cleanliness, and
 we need some x86 backend ops there, and generally to be careful.

As far as I know we do not need any additional pv_ops stuff
if we place everything needed in the kexec firmware support.

Daniel


Re: [PATCH v3 01/11] kexec: introduce kexec firmware support

2012-12-27 Thread Eric W. Biederman
Daniel Kiper daniel.ki...@oracle.com writes:

 Daniel Kiper daniel.ki...@oracle.com writes:

  Some kexec/kdump implementations (e.g. Xen PVOPS) cannot use the default
  Linux infrastructure and require some support from firmware and/or
  hypervisor.
  To cope with that problem the kexec firmware infrastructure was introduced.
  It allows a developer to use all kexec/kdump features of a given firmware
  or hypervisor.

 As this stands this patch is wrong.

 You need to pass an additional flag from userspace through /sbin/kexec
 that says load the kexec image in the firmware.  A global variable here
 is not ok.

 As I understand it you are loading a kexec on xen panic image.  Which
 is semantically different from a kexec on linux panic image.  It is not
 ok to have a silly global variable kexec_use_firmware.

 Earlier we agreed that /sbin/kexec should call the kexec syscall with a
 special flag. However, during work on the Xen kexec/kdump v3 patch
 I found that this is insufficient because e.g. crash_kexec()
 should also execute different code when firmware support is in use.

That implies you have the wrong model of userspace.

Very simply there is:
linux kexec pass through to xen kexec.

And
linux kexec (ultimately pv kexec because the pv machine is a slightly
different architecture).

 Sadly the syscall does not save this flag anywhere.

 Additionally, I found
 that the kernel itself has the best knowledge of which code path should be
 used (firmware or plain Linux). If this decision is left to userspace
 then a simple kexec syscall could crash the system in the worst case (e.g. when
 plain Linux kexec is used in a case where firmware kexec should be
 used).

And that path selection argument is nonsense.  You are advocating
hardcoding unnecessary policy in the kernel.

If for dom0 you need crash_kexec to do something different from domU
you should be able to load a small piece of code via kexec that makes
the hypervisor calls you need.

 However, if you wish I could add this flag to the syscall.

I do wish.  We need to distinguish between the kexec firmware pass
through, and normal kexec.

 Additionally, I could
 add a function which enables firmware support; then the kexec_use_firmware
 variable would be global only in the kexec.c module.

No.  kexec_use_firmware is the wrong mental model.

Do not mix the kexec pass through and the normal kexec case.

We most definitely need to call different code in the kexec firmware
pass through case.

For normal kexec we just need to use a paravirt aware version of
machine_kexec and machine_kexec_shutdown.

 Furthermore it is not ok to have a conditional
 code outside of header files.

 I agree but how to dispatch execution e.g. in crash_kexec()
 if we would like (I suppose) compile kexec firmware
 support conditionally?

The classic pattern is to have the #ifdefs in the header and have a
noop function that is inlined when the functionality is compiled out.
This allows all of the logic to always be compiled.
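
For illustration only, a sketch of that pattern; the config symbol and
function name here (CONFIG_KEXEC_FIRMWARE, crash_kexec_firmware) are
placeholders, not taken from the patch series:

/* Header-side #ifdef with an inline noop fallback, so callers never
 * need conditional code in .c files. */
#ifdef CONFIG_KEXEC_FIRMWARE
int crash_kexec_firmware(void);
#else
static inline int crash_kexec_firmware(void)
{
	return 0;	/* compiled out: caller logic still builds and runs */
}
#endif

/* crash_kexec() can then call crash_kexec_firmware() unconditionally. */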

Eric


Re: [PATCH v3 06/11] x86/xen: Add i386 kexec/kdump implementation

2012-12-27 Thread Eric W. Biederman
H. Peter Anvin h...@zytor.com writes:

 On 12/27/2012 03:23 PM, Daniel Kiper wrote:
 On 12/26/2012 06:18 PM, Daniel Kiper wrote:
 Add i386 kexec/kdump implementation.

 v2 - suggestions/fixes:
  - allocate transition page table pages below 4 GiB
(suggested by Jan Beulich).

 Why?

 Sadly all addresses are passed via an unsigned long
 variable to the kexec hypercall.


 So can you unf*ck your broken interface before imposing it on anyone
 else?

Hasn't 32bit dom0 been retired?

Until the kexec firmware pass through and the normal kexec code paths
are separated I can't make sense out of this code.

The normal kexec code path should be doable without any special
constraints on the kernel.  Just leaning on some xen paravirt calls.

The kexec pass through probably should not even touch x86 arch code.

Certainly the same patch should not have code for both the xen kexec
pass through and the paravirt arch code for the normal kexec path.

Eric


Re: [PATCH 2/2] vhost: handle polling failure

2012-12-27 Thread Jason Wang
On 12/27/2012 06:01 PM, Wanlong Gao wrote:
 On 12/27/2012 02:39 PM, Jason Wang wrote:
   Currently, polling errors were ignored in vhost. This may lead to some issues
   (e.g. kernel crash when passing a tap fd to vhost before calling TUNSETIFF).
   Fix this by:
 Can this kernel crash be reproduced by hand?

 Thanks,
 Wanlong Gao

  
Yes, it can be simply reproduced: open a tap fd but do not call
TUNSETIFF, then pass it to qemu and enable vhost.
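
For reference, a rough sketch of such a reproducer; the qemu options and all
names here are assumptions for illustration, not taken from this thread:

/* Hypothetical reproducer: open a tap fd, deliberately skip TUNSETIFF,
 * and hand the raw fd to qemu with vhost enabled. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char fdarg[64];
	int fd = open("/dev/net/tun", O_RDWR);	/* no TUNSETIFF on purpose */

	if (fd < 0) {
		perror("open /dev/net/tun");
		return 1;
	}
	/* With vhost=on, vhost tries to poll a tap that has no netdevice yet. */
	snprintf(fdarg, sizeof(fdarg), "tap,id=n0,fd=%d,vhost=on", fd);
	execlp("qemu-system-x86_64", "qemu-system-x86_64",
	       "-netdev", fdarg, "-device", "virtio-net-pci,netdev=n0",
	       (char *)NULL);
	perror("execlp qemu-system-x86_64");
	return 1;
}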


Re: [PATCH 1/2] vhost_net: correct error handling in vhost_net_set_backend()

2012-12-27 Thread Jason Wang
On 12/27/2012 09:03 PM, Michael S. Tsirkin wrote:
 On Thu, Dec 27, 2012 at 02:39:20PM +0800, Jason Wang wrote:
  Currently, polling errors were ignored in vhost. This may lead to some issues
  (e.g. kernel crash when passing a tap fd to vhost before calling TUNSETIFF).
  Fix this by:

 - extend the idea of vhost_net_poll_state to all vhost_polls
  - change the state only when polling succeeds
  - make vhost_poll_start() report errors to the caller, which could be used
    by the caller or userspace.
 Maybe it could but this patch just ignores these errors.
  And it's not clear how userspace would handle these errors.

Not all are ignored; one example is vhost_net_enable_vq(), which could
be used to let userspace know the fd was not set up correctly.
 Also, since we have a reference on the fd, it would seem
 that once poll succeeds it can't fail in the future.

Right.

 So two other options would make more sense to me:
 - if vhost is bound to tun without SETIFF, fail this immediately
 - if vhost is bound to tun without SETIFF, defer polling
   until SETIFF

 Option 1 would seem much easier to implement, I think it's
 preferable.

Option 1 seems better, since userspace may also disable a queue in the
meantime. Will add a vq_err() and break out of the loop when starting
the polling fails.
 Signed-off-by: Jason Wang jasow...@redhat.com
 ---
  drivers/vhost/net.c   |   75 
 +
  drivers/vhost/vhost.c |   16 +-
  drivers/vhost/vhost.h |   11 ++-
  3 files changed, 50 insertions(+), 52 deletions(-)

 diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
 index 629d6b5..56e7f5a 100644
 --- a/drivers/vhost/net.c
 +++ b/drivers/vhost/net.c
 @@ -64,20 +64,10 @@ enum {
  VHOST_NET_VQ_MAX = 2,
  };
  
 -enum vhost_net_poll_state {
 -VHOST_NET_POLL_DISABLED = 0,
 -VHOST_NET_POLL_STARTED = 1,
 -VHOST_NET_POLL_STOPPED = 2,
 -};
 -
  struct vhost_net {
  struct vhost_dev dev;
  struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
  struct vhost_poll poll[VHOST_NET_VQ_MAX];
 -/* Tells us whether we are polling a socket for TX.
 - * We only do this when socket buffer fills up.
 - * Protected by tx vq lock. */
 -enum vhost_net_poll_state tx_poll_state;
  /* Number of TX recently submitted.
   * Protected by tx vq lock. */
  unsigned tx_packets;
 @@ -155,24 +145,6 @@ static void copy_iovec_hdr(const struct iovec *from, 
 struct iovec *to,
  }
  }
  
 -/* Caller must have TX VQ lock */
 -static void tx_poll_stop(struct vhost_net *net)
 -{
  -if (likely(net->tx_poll_state != VHOST_NET_POLL_STARTED))
  -return;
  -vhost_poll_stop(net->poll + VHOST_NET_VQ_TX);
  -net->tx_poll_state = VHOST_NET_POLL_STOPPED;
 -}
 -
 -/* Caller must have TX VQ lock */
 -static void tx_poll_start(struct vhost_net *net, struct socket *sock)
 -{
  -if (unlikely(net->tx_poll_state != VHOST_NET_POLL_STOPPED))
  -return;
  -vhost_poll_start(net->poll + VHOST_NET_VQ_TX, sock->file);
  -net->tx_poll_state = VHOST_NET_POLL_STARTED;
 -}
 -
  /* In case of DMA done not in order in lower device driver for some reason.
   * upend_idx is used to track end of used idx, done_idx is used to track 
 head
   * of used idx. Once lower device DMA done contiguously, we will signal KVM
 @@ -252,7 +224,7 @@ static void handle_tx(struct vhost_net *net)
   wmem = atomic_read(&sock->sk->sk_wmem_alloc);
   if (wmem >= sock->sk->sk_sndbuf) {
   mutex_lock(&vq->mutex);
  -tx_poll_start(net, sock);
  +vhost_poll_start(net->poll + VHOST_NET_VQ_TX, sock->file);
   mutex_unlock(&vq->mutex);
  return;
  }
 @@ -261,7 +233,7 @@ static void handle_tx(struct vhost_net *net)
   vhost_disable_notify(&net->dev, vq);
   
   if (wmem < sock->sk->sk_sndbuf / 2)
  -tx_poll_stop(net);
  +vhost_poll_stop(net->poll + VHOST_NET_VQ_TX);
   hdr_size = vq->vhost_hlen;
   zcopy = vq->ubufs;
  
 @@ -283,7 +255,8 @@ static void handle_tx(struct vhost_net *net)
  
   wmem = atomic_read(&sock->sk->sk_wmem_alloc);
   if (wmem >= sock->sk->sk_sndbuf * 3 / 4) {
  -tx_poll_start(net, sock);
  +vhost_poll_start(net->poll + VHOST_NET_VQ_TX,
  + sock->file);
   set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
  break;
  }
 @@ -294,7 +267,8 @@ static void handle_tx(struct vhost_net *net)
   (vq->upend_idx - vq->done_idx) :
   (vq->upend_idx + UIO_MAXIOV - vq->done_idx);
   if (unlikely(num_pends > VHOST_MAX_PEND)) {
  -tx_poll_start(net, sock);
  +vhost_poll_start(net->poll + VHOST_NET_VQ_TX,
  + sock->file);
  

Re: [PATCH 1/2] vhost_net: correct error handling in vhost_net_set_backend()

2012-12-27 Thread Jason Wang
On 12/27/2012 09:14 PM, Michael S. Tsirkin wrote:
 On Thu, Dec 27, 2012 at 02:39:19PM +0800, Jason Wang wrote:
 Fix the leaking of oldubufs and the fd refcount when we fail to initialize the used ring.

 Signed-off-by: Jason Wang jasow...@redhat.com
 ---
  drivers/vhost/net.c |   14 +++---
  1 files changed, 11 insertions(+), 3 deletions(-)

 diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
 index ebd08b2..629d6b5 100644
 --- a/drivers/vhost/net.c
 +++ b/drivers/vhost/net.c
 @@ -834,8 +834,10 @@ static long vhost_net_set_backend(struct vhost_net *n, 
 unsigned index, int fd)
  vhost_net_enable_vq(n, vq);
  
  r = vhost_init_used(vq);
 -if (r)
 -goto err_vq;
 +if (r) {
 +sock = NULL;
 +goto err_used;
 +}
  
  n-tx_packets = 0;
  n-tx_zcopy_err = 0;
 @@ -859,8 +861,14 @@ static long vhost_net_set_backend(struct vhost_net *n, 
 unsigned index, int fd)
   mutex_unlock(&n->dev.mutex);
  return 0;
  
 +err_used:
 +if (oldubufs)
 +vhost_ubuf_put_and_wait(oldubufs);
 +if (oldsock)
  +fput(oldsock->file);
   err_ubufs:
  -fput(sock->file);
  +if (sock)
  +fput(sock->file);
   err_vq:
   mutex_unlock(&vq->mutex);
  err:
 I think it's a real bug, but I don't see how the fix
 makes sense.
 We are returning an error, so we ideally
 revert to the state before the faulty
 operation. So this should put sock and ubufs,
 not oldsock/oldubufs.

Agree.

 The best way is probably to change
 vhost_init_used so that it gets the private data
 pointer as a parameter.

 We can then call it before ubuf alloc.
 You can then add err_used right after err_ubufs
 with no extra logic.


Makes more sense, thanks.
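
For context, a rough sketch of the reordering being suggested; the extra
vhost_init_used() argument is an assumption, not code from this thread:

/* Illustrative only: call vhost_init_used() with the new backend before
 * allocating ubufs, so the existing error labels unwind the new state
 * (the fresh sock reference), not oldsock/oldubufs. */
	r = vhost_init_used(vq, sock);		/* hypothetical extra argument */
	if (r)
		goto err_ubufs;			/* only the new sock ref to drop */

	ubufs = vhost_ubuf_alloc(vq, sock && vhost_sock_zcopy(sock));
	if (IS_ERR(ubufs)) {
		r = PTR_ERR(ubufs);
		goto err_ubufs;
	}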


