Re: [PATCH 2/2] vhost: handle polling failure
On 12/27/2012 02:39 PM, Jason Wang wrote: Currently, polling error were ignored in vhost. This may lead some issues (e.g kenrel crash when passing a tap fd to vhost before calling TUNSETIFF). Fix this by: Can this kernel crash be reproduced by hand? Thanks, Wanlong Gao - extend the idea of vhost_net_poll_state to all vhost_polls - change the state only when polling is succeed - make vhost_poll_start() report errors to the caller, which could be used caller or userspace. Signed-off-by: Jason Wang jasow...@redhat.com --- drivers/vhost/net.c | 75 + drivers/vhost/vhost.c | 16 +- drivers/vhost/vhost.h | 11 ++- 3 files changed, 50 insertions(+), 52 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 629d6b5..56e7f5a 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -64,20 +64,10 @@ enum { VHOST_NET_VQ_MAX = 2, }; -enum vhost_net_poll_state { - VHOST_NET_POLL_DISABLED = 0, - VHOST_NET_POLL_STARTED = 1, - VHOST_NET_POLL_STOPPED = 2, -}; - struct vhost_net { struct vhost_dev dev; struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX]; struct vhost_poll poll[VHOST_NET_VQ_MAX]; - /* Tells us whether we are polling a socket for TX. - * We only do this when socket buffer fills up. - * Protected by tx vq lock. */ - enum vhost_net_poll_state tx_poll_state; /* Number of TX recently submitted. * Protected by tx vq lock. 
*/ unsigned tx_packets; @@ -155,24 +145,6 @@ static void copy_iovec_hdr(const struct iovec *from, struct iovec *to, } } -/* Caller must have TX VQ lock */ -static void tx_poll_stop(struct vhost_net *net) -{ - if (likely(net-tx_poll_state != VHOST_NET_POLL_STARTED)) - return; - vhost_poll_stop(net-poll + VHOST_NET_VQ_TX); - net-tx_poll_state = VHOST_NET_POLL_STOPPED; -} - -/* Caller must have TX VQ lock */ -static void tx_poll_start(struct vhost_net *net, struct socket *sock) -{ - if (unlikely(net-tx_poll_state != VHOST_NET_POLL_STOPPED)) - return; - vhost_poll_start(net-poll + VHOST_NET_VQ_TX, sock-file); - net-tx_poll_state = VHOST_NET_POLL_STARTED; -} - /* In case of DMA done not in order in lower device driver for some reason. * upend_idx is used to track end of used idx, done_idx is used to track head * of used idx. Once lower device DMA done contiguously, we will signal KVM @@ -252,7 +224,7 @@ static void handle_tx(struct vhost_net *net) wmem = atomic_read(sock-sk-sk_wmem_alloc); if (wmem = sock-sk-sk_sndbuf) { mutex_lock(vq-mutex); - tx_poll_start(net, sock); + vhost_poll_start(net-poll + VHOST_NET_VQ_TX, sock-file); mutex_unlock(vq-mutex); return; } @@ -261,7 +233,7 @@ static void handle_tx(struct vhost_net *net) vhost_disable_notify(net-dev, vq); if (wmem sock-sk-sk_sndbuf / 2) - tx_poll_stop(net); + vhost_poll_stop(net-poll + VHOST_NET_VQ_TX); hdr_size = vq-vhost_hlen; zcopy = vq-ubufs; @@ -283,7 +255,8 @@ static void handle_tx(struct vhost_net *net) wmem = atomic_read(sock-sk-sk_wmem_alloc); if (wmem = sock-sk-sk_sndbuf * 3 / 4) { - tx_poll_start(net, sock); + vhost_poll_start(net-poll + VHOST_NET_VQ_TX, + sock-file); set_bit(SOCK_ASYNC_NOSPACE, sock-flags); break; } @@ -294,7 +267,8 @@ static void handle_tx(struct vhost_net *net) (vq-upend_idx - vq-done_idx) : (vq-upend_idx + UIO_MAXIOV - vq-done_idx); if (unlikely(num_pends VHOST_MAX_PEND)) { - tx_poll_start(net, sock); + vhost_poll_start(net-poll + VHOST_NET_VQ_TX, + sock-file); 
set_bit(SOCK_ASYNC_NOSPACE, sock-flags); break; } @@ -360,7 +334,8 @@ static void handle_tx(struct vhost_net *net) } vhost_discard_vq_desc(vq, 1); if (err == -EAGAIN || err == -ENOBUFS) - tx_poll_start(net, sock); + vhost_poll_start(net-poll + VHOST_NET_VQ_TX, + sock-file); break; } if (err != len) @@ -623,7 +598,6 @@ static int vhost_net_open(struct inode *inode, struct file *f) vhost_poll_init(n-poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev);
Re: [RFC PATCH] virtio-net: reset virtqueue affinity when doing cpu hotplug
On Thu, Dec 27, 2012 at 11:34:16AM +0800, Jason Wang wrote: On 12/26/2012 06:46 PM, Michael S. Tsirkin wrote: On Wed, Dec 26, 2012 at 03:06:54PM +0800, Wanlong Gao wrote: Add a cpu notifier to virtio-net, so that we can reset the virtqueue affinity if the cpu hotplug happens. It improve the performance through enabling or disabling the virtqueue affinity after doing cpu hotplug. Cc: Rusty Russell ru...@rustcorp.com.au Cc: Michael S. Tsirkin m...@redhat.com Cc: Jason Wang jasow...@redhat.com Cc: virtualization@lists.linux-foundation.org Cc: net...@vger.kernel.org Signed-off-by: Wanlong Gao gaowanl...@cn.fujitsu.com Thanks for looking into this. Some comments: 1. Looks like the logic in virtnet_set_affinity (and in virtnet_select_queue) will not work very well when CPU IDs are not consequitive. This can happen with hot unplug. Maybe we should add a VQ allocator, and defining a per-cpu variable specifying the VQ instead of using CPU ID. Yes, and generate the affinity hint based on the mapping. Btw, what does VQ allocator means here? Some logic to generate CPU to VQ mapping. 2. The below code seems racy e.g. when CPU is added during device init. 3. using a global cpu_hotplug seems inelegant. In any case we should document what is the meaning of this variable. --- drivers/net/virtio_net.c | 39 ++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index a6fcf15..9710cf4 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -26,6 +26,7 @@ #include linux/scatterlist.h #include linux/if_vlan.h #include linux/slab.h +#include linux/cpu.h static int napi_weight = 128; module_param(napi_weight, int, 0444); @@ -34,6 +35,8 @@ static bool csum = true, gso = true; module_param(csum, bool, 0444); module_param(gso, bool, 0444); +static bool cpu_hotplug = false; + /* FIXME: MTU in config. 
*/ #define MAX_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) #define GOOD_COPY_LEN 128 @@ -1041,6 +1044,26 @@ static void virtnet_set_affinity(struct virtnet_info *vi, bool set) vi-affinity_hint_set = false; } +static int virtnet_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + switch(action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + case CPU_DEAD: + case CPU_DEAD_FROZEN: + cpu_hotplug = true; + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block virtnet_cpu_notifier = { + .notifier_call = virtnet_cpu_callback, +}; + static void virtnet_get_ringparam(struct net_device *dev, struct ethtool_ringparam *ring) { @@ -1131,7 +1154,14 @@ static int virtnet_change_mtu(struct net_device *dev, int new_mtu) */ static u16 virtnet_select_queue(struct net_device *dev, struct sk_buff *skb) { - int txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) : + int txq; + + if (unlikely(cpu_hotplug == true)) { + virtnet_set_affinity(netdev_priv(dev), true); + cpu_hotplug = false; + } + + txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) : smp_processor_id(); while (unlikely(txq = dev-real_num_tx_queues)) @@ -1248,6 +1278,8 @@ static void virtnet_del_vqs(struct virtnet_info *vi) { struct virtio_device *vdev = vi-vdev; + unregister_hotcpu_notifier(virtnet_cpu_notifier); + virtnet_set_affinity(vi, false); vdev-config-del_vqs(vdev); @@ -1372,6 +1404,11 @@ static int init_vqs(struct virtnet_info *vi) goto err_free; virtnet_set_affinity(vi, true); + + ret = register_hotcpu_notifier(virtnet_cpu_notifier); + if (ret) + goto err_free; + return 0; err_free: -- 1.8.0 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH 1/2] vhost_net: correct error handling in vhost_net_set_backend()
On Thu, Dec 27, 2012 at 02:39:20PM +0800, Jason Wang wrote: Currently, polling error were ignored in vhost. This may lead some issues (e.g kenrel crash when passing a tap fd to vhost before calling TUNSETIFF). Fix this by: - extend the idea of vhost_net_poll_state to all vhost_polls - change the state only when polling is succeed - make vhost_poll_start() report errors to the caller, which could be used caller or userspace. Maybe it could but this patch just ignores these errors. And it's not clear how would userspace handle these errors. Also, since we have a reference on the fd, it would seem that once poll succeeds it can't fail in the future. So two other options would make more sense to me: - if vhost is bound to tun without SETIFF, fail this immediately - if vhost is bound to tun without SETIFF, defer polling until SETIFF Option 1 would seem much easier to implement, I think it's preferable. Signed-off-by: Jason Wang jasow...@redhat.com --- drivers/vhost/net.c | 75 + drivers/vhost/vhost.c | 16 +- drivers/vhost/vhost.h | 11 ++- 3 files changed, 50 insertions(+), 52 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 629d6b5..56e7f5a 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -64,20 +64,10 @@ enum { VHOST_NET_VQ_MAX = 2, }; -enum vhost_net_poll_state { - VHOST_NET_POLL_DISABLED = 0, - VHOST_NET_POLL_STARTED = 1, - VHOST_NET_POLL_STOPPED = 2, -}; - struct vhost_net { struct vhost_dev dev; struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX]; struct vhost_poll poll[VHOST_NET_VQ_MAX]; - /* Tells us whether we are polling a socket for TX. - * We only do this when socket buffer fills up. - * Protected by tx vq lock. */ - enum vhost_net_poll_state tx_poll_state; /* Number of TX recently submitted. * Protected by tx vq lock. 
*/ unsigned tx_packets; @@ -155,24 +145,6 @@ static void copy_iovec_hdr(const struct iovec *from, struct iovec *to, } } -/* Caller must have TX VQ lock */ -static void tx_poll_stop(struct vhost_net *net) -{ - if (likely(net-tx_poll_state != VHOST_NET_POLL_STARTED)) - return; - vhost_poll_stop(net-poll + VHOST_NET_VQ_TX); - net-tx_poll_state = VHOST_NET_POLL_STOPPED; -} - -/* Caller must have TX VQ lock */ -static void tx_poll_start(struct vhost_net *net, struct socket *sock) -{ - if (unlikely(net-tx_poll_state != VHOST_NET_POLL_STOPPED)) - return; - vhost_poll_start(net-poll + VHOST_NET_VQ_TX, sock-file); - net-tx_poll_state = VHOST_NET_POLL_STARTED; -} - /* In case of DMA done not in order in lower device driver for some reason. * upend_idx is used to track end of used idx, done_idx is used to track head * of used idx. Once lower device DMA done contiguously, we will signal KVM @@ -252,7 +224,7 @@ static void handle_tx(struct vhost_net *net) wmem = atomic_read(sock-sk-sk_wmem_alloc); if (wmem = sock-sk-sk_sndbuf) { mutex_lock(vq-mutex); - tx_poll_start(net, sock); + vhost_poll_start(net-poll + VHOST_NET_VQ_TX, sock-file); mutex_unlock(vq-mutex); return; } @@ -261,7 +233,7 @@ static void handle_tx(struct vhost_net *net) vhost_disable_notify(net-dev, vq); if (wmem sock-sk-sk_sndbuf / 2) - tx_poll_stop(net); + vhost_poll_stop(net-poll + VHOST_NET_VQ_TX); hdr_size = vq-vhost_hlen; zcopy = vq-ubufs; @@ -283,7 +255,8 @@ static void handle_tx(struct vhost_net *net) wmem = atomic_read(sock-sk-sk_wmem_alloc); if (wmem = sock-sk-sk_sndbuf * 3 / 4) { - tx_poll_start(net, sock); + vhost_poll_start(net-poll + VHOST_NET_VQ_TX, + sock-file); set_bit(SOCK_ASYNC_NOSPACE, sock-flags); break; } @@ -294,7 +267,8 @@ static void handle_tx(struct vhost_net *net) (vq-upend_idx - vq-done_idx) : (vq-upend_idx + UIO_MAXIOV - vq-done_idx); if (unlikely(num_pends VHOST_MAX_PEND)) { - tx_poll_start(net, sock); + vhost_poll_start(net-poll + VHOST_NET_VQ_TX, + sock-file); 
set_bit(SOCK_ASYNC_NOSPACE, sock-flags); break; } @@ -360,7 +334,8 @@ static void handle_tx(struct vhost_net *net) } vhost_discard_vq_desc(vq, 1); if (err == -EAGAIN || err == -ENOBUFS) -
Re: [PATCH v3 00/11] xen: Initial kexec/kdump implementation
Andrew Cooper andrew.coop...@citrix.com writes: On 27/12/2012 07:53, Eric W. Biederman wrote: The syscall ABI still has the wrong semantics. Aka totally unmaintainable and umergeable. The concept of domU support is also strange. What does domU support even mean, when the dom0 support is loading a kernel to pick up Xen when Xen falls over. There are two requirements pulling at this patch series, but I agree that we need to clarify them. It probably make sense to split them apart a little even. When dom0 loads a crash kernel, it is loading one for Xen to use. As a dom0 crash causes a Xen crash, having dom0 set up a kdump kernel for itself is completely useless. This ability is present in classic Xen dom0 kernels, but the feature is currently missing in PVOPS. Many cloud customers and service providers want the ability for a VM administrator to be able to load a kdump/kexec kernel within a domain[1]. This allows the VM administrator to take more proactive steps to isolate the cause of a crash, the state of which is most likely discarded while tearing down the domain. The result being that as far as Xen is concerned, the domain is still alive, while the kdump kernel/environment can work its usual magic. I am not aware of any feature like this existing in the past. Which makes domU support semantically just the normal kexec/kdump support. Got it. The point of implementing domU is for those times when the hypervisor admin and the kernel admin are different. For domU support modifying or adding alternate versions of machine_kexec.c and relocate_kernel.S to add paravirtualization support make sense. There is the practical argument that for implementation efficiency of crash dumps it would be better if that support came from the hypervisor or the hypervisor environment. But this gets into the practical reality that the hypervisor environment does not do that today. Furthermore kexec all by itself working in a paravirtualized environment under Xen makes sense. 
domU support is what Peter was worrying about for cleanliness, and we need some x86 backend ops there, and generally to be careful. For dom0 support we need to extend the kexec_load system call, and get it right. When we are done I expect both dom0 and domU support of kexec to work in dom0. I don't know if the normal kexec or kdump case will ever make sense in dom0 but there is no reason for that case to be broken. ~Andrew [1] http://lists.xen.org/archives/html/xen-devel/2012-11/msg01274.html Eric ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v3 09/11] x86/xen/enlighten: Add init and crash kexec/kdump hooks
On 12/26/2012 06:18 PM, Daniel Kiper wrote: Add init and crash kexec/kdump hooks. Signed-off-by: Daniel Kiper daniel.ki...@oracle.com --- arch/x86/xen/enlighten.c | 11 +++ 1 files changed, 11 insertions(+), 0 deletions(-) On the general issue of hooks: Hooks need their pre- and post-semantics extremely well documented, lest they end up being an impossible roadblock to development. -hpa -- H. Peter Anvin, Intel Open Source Technology Center I work for Intel. I don't speak on their behalf. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v3 02/11] x86/kexec: Add extra pointers to transition page table PGD, PUD, PMD and PTE
Hmm... this code is being redone at the moment... this might conflict. Is this available somewhere? May I have a look at it? Daniel PS I am on holiday until 02/01/2013 and I could not have access to my email box. Please be patient. At worst case I will send reply when I will be back at office. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v3 06/11] x86/xen: Add i386 kexec/kdump implementation
On 12/26/2012 06:18 PM, Daniel Kiper wrote: Add i386 kexec/kdump implementation. v2 - suggestions/fixes: - allocate transition page table pages below 4 GiB (suggested by Jan Beulich). Why? Sadly all addresses are passed via unsigned long variable to kexec hypercall. Daniel ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v3 06/11] x86/xen: Add i386 kexec/kdump implementation
On 12/27/2012 03:23 PM, Daniel Kiper wrote: On 12/26/2012 06:18 PM, Daniel Kiper wrote: Add i386 kexec/kdump implementation. v2 - suggestions/fixes: - allocate transition page table pages below 4 GiB (suggested by Jan Beulich). Why? Sadly all addresses are passed via unsigned long variable to kexec hypercall. So can you unf*ck your broken interface before imposing it on anyone else? -hpa -- H. Peter Anvin, Intel Open Source Technology Center I work for Intel. I don't speak on their behalf. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v3 00/11] xen: Initial kexec/kdump implementation
On 12/26/2012 06:18 PM, Daniel Kiper wrote: Hi, This set of patches contains initial kexec/kdump implementation for Xen v3. Currently only dom0 is supported, however, almost all infrustructure required for domU support is ready. Jan Beulich suggested to merge Xen x86 assembler code with baremetal x86 code. This could simplify and reduce a bit size of kernel code. However, this solution requires some changes in baremetal x86 code. First of all code which establishes transition page table should be moved back from machine_kexec_$(BITS).c to relocate_kernel_$(BITS).S. Another important thing which should be changed in that case is format of page_list array. Xen kexec hypercall requires to alternate physical addresses with virtual ones. These and other required stuff have not been done in that version because I am not sure that solution will be accepted by kexec/kdump maintainers. I hope that this email spark discussion about that topic. I want a detailed list of the constraints that this assumes and therefore imposes on the native implementation as a result of this. We have had way too many patches where Xen PV hacks effectively nailgun arbitrary, and sometimes poor, design decisions in place and now we can't fix them. OK but now I think that we should leave this discussion until all details regarding kexec/kdump generic code will be agreed. Sorry for that. Daniel ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v3 01/11] kexec: introduce kexec firmware support
Daniel Kiper daniel.ki...@oracle.com writes: Some kexec/kdump implementations (e.g. Xen PVOPS) could not use default Linux infrastructure and require some support from firmware and/or hypervisor. To cope with that problem kexec firmware infrastructure was introduced. It allows a developer to use all kexec/kdump features of given firmware or hypervisor. As this stands this patch is wrong. You need to pass an additional flag from userspace through /sbin/kexec that says load the kexec image in the firmware. A global variable here is not ok. As I understand it you are loading a kexec on xen panic image. Which is semantically different from a kexec on linux panic image. It is not ok to do have a silly global variable kexec_use_firmware. Earlier we agreed that /sbin/kexec should call kexec syscall with special flag. However, during work on Xen kexec/kdump v3 patch I stated that this is insufficient because e.g. crash_kexec() should execute different code in case of use of firmware support too. Sadly syscall does not save this flag anywhere. Additionally, I stated that kernel itself has the best knowledge which code path should be used (firmware or plain Linux). If this decision will be left to userspace then simple kexec syscall could crash system at worst case (e.g. when plain Linux kexec will be used in case when firmware kaxec should be used). However, if you wish I could add this flag to syscall. Additionally, I could add function which enables firmware support and then kexec_use_firmware variable will be global only in kexec.c module. Furthermore it is not ok to have a conditional code outside of header files. I agree but how to dispatch execution e.g. in crash_kexec() if we would like (I suppose) compile kexec firmware support conditionally? Daniel ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v3 00/11] xen: Initial kexec/kdump implementation
Andrew Cooper andrew.coop...@citrix.com writes: On 27/12/2012 07:53, Eric W. Biederman wrote: The syscall ABI still has the wrong semantics. Aka totally unmaintainable and umergeable. The concept of domU support is also strange. What does domU support even mean, when the dom0 support is loading a kernel to pick up Xen when Xen falls over. There are two requirements pulling at this patch series, but I agree that we need to clarify them. It probably make sense to split them apart a little even. When dom0 loads a crash kernel, it is loading one for Xen to use. As a dom0 crash causes a Xen crash, having dom0 set up a kdump kernel for itself is completely useless. This ability is present in classic Xen dom0 kernels, but the feature is currently missing in PVOPS. Many cloud customers and service providers want the ability for a VM administrator to be able to load a kdump/kexec kernel within a domain[1]. This allows the VM administrator to take more proactive steps to isolate the cause of a crash, the state of which is most likely discarded while tearing down the domain. The result being that as far as Xen is concerned, the domain is still alive, while the kdump kernel/environment can work its usual magic. I am not aware of any feature like this existing in the past. Which makes domU support semantically just the normal kexec/kdump support. Got it. To some extent. It is true on HVM and PVonHVM guests. However, PV guests requires a bit different kexec/kdump implementation than plain kexec/kdump. Proposed firmware support has almost all required features. PV guest specific features (a few) will be added later (after agreeing generic firmware support which is sufficient at least for dom0). It looks that I should replace domU by PV guest in patch description. The point of implementing domU is for those times when the hypervisor admin and the kernel admin are different. Right. 
For domU support modifying or adding alternate versions of machine_kexec.c and relocate_kernel.S to add paravirtualization support make sense. It is not sufficient. Please look above. There is the practical argument that for implementation efficiency of crash dumps it would be better if that support came from the hypervisor or the hypervisor environment. But this gets into the practical reality I am thinking about that. that the hypervisor environment does not do that today. Furthermore kexec all by itself working in a paravirtualized environment under Xen makes sense. domU support is what Peter was worrying about for cleanliness, and we need some x86 backend ops there, and generally to be careful. As I know we do not need any additional pv_ops stuff if we place all needed things in kexec firmware support. Daniel ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v3 01/11] kexec: introduce kexec firmware support
Daniel Kiper daniel.ki...@oracle.com writes: Daniel Kiper daniel.ki...@oracle.com writes: Some kexec/kdump implementations (e.g. Xen PVOPS) could not use default Linux infrastructure and require some support from firmware and/or hypervisor. To cope with that problem kexec firmware infrastructure was introduced. It allows a developer to use all kexec/kdump features of given firmware or hypervisor. As this stands this patch is wrong. You need to pass an additional flag from userspace through /sbin/kexec that says load the kexec image in the firmware. A global variable here is not ok. As I understand it you are loading a kexec on xen panic image. Which is semantically different from a kexec on linux panic image. It is not ok to do have a silly global variable kexec_use_firmware. Earlier we agreed that /sbin/kexec should call kexec syscall with special flag. However, during work on Xen kexec/kdump v3 patch I stated that this is insufficient because e.g. crash_kexec() should execute different code in case of use of firmware support too. That implies you have the wrong model of userspace. Very simply there is: linux kexec pass through to xen kexec. And linux kexec (ultimately pv kexec because the pv machine is a slightly different architecture). Sadly syscall does not save this flag anywhere. Additionally, I stated that kernel itself has the best knowledge which code path should be used (firmware or plain Linux). If this decision will be left to userspace then simple kexec syscall could crash system at worst case (e.g. when plain Linux kexec will be used in case when firmware kaxec should be used). And that path selection bit is strongly non-sense. You are advocating hardcoding unnecessary policy in the kernel. If for dom0 you need crash_kexec to do something different from domU you should be able to load a small piece of code via kexec that makes the hypervisor calls you need. However, if you wish I could add this flag to syscall. I do wish. 
We need to distinguish between the kexec firmware pass through, and normal kexec. Additionally, I could add function which enables firmware support and then kexec_use_firmware variable will be global only in kexec.c module. No. kexec_use_firmware is the wrong mental model. Do not mix the kexec pass through and the normal kexec case. We most definitely need to call different code in the kexec firmware pass through case. For normal kexec we just need to use a paravirt aware version of machine_kexec and machine_kexec_shutdown. Furthermore it is not ok to have a conditional code outside of header files. I agree but how to dispatch execution e.g. in crash_kexec() if we would like (I suppose) compile kexec firmware support conditionally? The classic pattern is to have the #ifdefs in the header and have an noop function that is inlined when the functionality is compiled out. This allows all of the logic to always be compiled. Eric ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v3 06/11] x86/xen: Add i386 kexec/kdump implementation
H. Peter Anvin h...@zytor.com writes: On 12/27/2012 03:23 PM, Daniel Kiper wrote: On 12/26/2012 06:18 PM, Daniel Kiper wrote: Add i386 kexec/kdump implementation. v2 - suggestions/fixes: - allocate transition page table pages below 4 GiB (suggested by Jan Beulich). Why? Sadly all addresses are passed via unsigned long variable to kexec hypercall. So can you unf*ck your broken interface before imposing it on anyone else? Hasn't 32bit dom0 been retired? Untill the kexec firmware pass through and the normal kexec code paths are separated I can't make sense out of this code. The normal kexec code path should be doable without any special constraints on the kernel. Just leaning on some xen paravirt calls. The kexec pass through probably should not even touch x86 arch code. Certainly the same patch should not have code for both the xen kexec pass through and the paravirt arch code for the normal kexec path. Eric ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH 2/2] vhost: handle polling failure
On 12/27/2012 06:01 PM, Wanlong Gao wrote: On 12/27/2012 02:39 PM, Jason Wang wrote: Currently, polling error were ignored in vhost. This may lead some issues (e.g kenrel crash when passing a tap fd to vhost before calling TUNSETIFF). Fix this by: Can this kernel crash be reproduced by hand? Thanks, Wanlong Gao Yes, it could be simply reproduced by: open a tap fd but does not cal TUNSETIFF, then pass it to qemu and enable vhost. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH 1/2] vhost_net: correct error handling in vhost_net_set_backend()
On 12/27/2012 09:03 PM, Michael S. Tsirkin wrote: On Thu, Dec 27, 2012 at 02:39:20PM +0800, Jason Wang wrote: Currently, polling error were ignored in vhost. This may lead some issues (e.g kenrel crash when passing a tap fd to vhost before calling TUNSETIFF). Fix this by: - extend the idea of vhost_net_poll_state to all vhost_polls - change the state only when polling is succeed - make vhost_poll_start() report errors to the caller, which could be used caller or userspace. Maybe it could but this patch just ignores these errors. And it's not clear how would userspace handle these errors. Not all were ignored, one example is vhost_net_enable_vq(), this could be used to let userspace know the fd were not setup correctly. Also, since we have a reference on the fd, it would seem that once poll succeeds it can't fail in the future. Right. So two other options would make more sense to me: - if vhost is bound to tun without SETIFF, fail this immediately - if vhost is bound to tun without SETIFF, defer polling until SETIFF Option 1 would seem much easier to implement, I think it's preferable. Option 1 seems better, since userspace may also disable a queue in the meantime. Will add a vq_err() and break out of the loop when fails to start the polling. Signed-off-by: Jason Wang jasow...@redhat.com --- drivers/vhost/net.c | 75 + drivers/vhost/vhost.c | 16 +- drivers/vhost/vhost.h | 11 ++- 3 files changed, 50 insertions(+), 52 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 629d6b5..56e7f5a 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -64,20 +64,10 @@ enum { VHOST_NET_VQ_MAX = 2, }; -enum vhost_net_poll_state { -VHOST_NET_POLL_DISABLED = 0, -VHOST_NET_POLL_STARTED = 1, -VHOST_NET_POLL_STOPPED = 2, -}; - struct vhost_net { struct vhost_dev dev; struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX]; struct vhost_poll poll[VHOST_NET_VQ_MAX]; -/* Tells us whether we are polling a socket for TX. - * We only do this when socket buffer fills up. 
- * Protected by tx vq lock. */ -enum vhost_net_poll_state tx_poll_state; /* Number of TX recently submitted. * Protected by tx vq lock. */ unsigned tx_packets; @@ -155,24 +145,6 @@ static void copy_iovec_hdr(const struct iovec *from, struct iovec *to, } } -/* Caller must have TX VQ lock */ -static void tx_poll_stop(struct vhost_net *net) -{ -if (likely(net-tx_poll_state != VHOST_NET_POLL_STARTED)) -return; -vhost_poll_stop(net-poll + VHOST_NET_VQ_TX); -net-tx_poll_state = VHOST_NET_POLL_STOPPED; -} - -/* Caller must have TX VQ lock */ -static void tx_poll_start(struct vhost_net *net, struct socket *sock) -{ -if (unlikely(net-tx_poll_state != VHOST_NET_POLL_STOPPED)) -return; -vhost_poll_start(net-poll + VHOST_NET_VQ_TX, sock-file); -net-tx_poll_state = VHOST_NET_POLL_STARTED; -} - /* In case of DMA done not in order in lower device driver for some reason. * upend_idx is used to track end of used idx, done_idx is used to track head * of used idx. Once lower device DMA done contiguously, we will signal KVM @@ -252,7 +224,7 @@ static void handle_tx(struct vhost_net *net) wmem = atomic_read(sock-sk-sk_wmem_alloc); if (wmem = sock-sk-sk_sndbuf) { mutex_lock(vq-mutex); -tx_poll_start(net, sock); +vhost_poll_start(net-poll + VHOST_NET_VQ_TX, sock-file); mutex_unlock(vq-mutex); return; } @@ -261,7 +233,7 @@ static void handle_tx(struct vhost_net *net) vhost_disable_notify(net-dev, vq); if (wmem sock-sk-sk_sndbuf / 2) -tx_poll_stop(net); +vhost_poll_stop(net-poll + VHOST_NET_VQ_TX); hdr_size = vq-vhost_hlen; zcopy = vq-ubufs; @@ -283,7 +255,8 @@ static void handle_tx(struct vhost_net *net) wmem = atomic_read(sock-sk-sk_wmem_alloc); if (wmem = sock-sk-sk_sndbuf * 3 / 4) { -tx_poll_start(net, sock); +vhost_poll_start(net-poll + VHOST_NET_VQ_TX, + sock-file); set_bit(SOCK_ASYNC_NOSPACE, sock-flags); break; } @@ -294,7 +267,8 @@ static void handle_tx(struct vhost_net *net) (vq-upend_idx - vq-done_idx) : (vq-upend_idx + UIO_MAXIOV - vq-done_idx); if (unlikely(num_pends 
VHOST_MAX_PEND)) { -tx_poll_start(net, sock); +vhost_poll_start(net-poll + VHOST_NET_VQ_TX, + sock-file);
Re: [PATCH 1/2] vhost_net: correct error hanlding in vhost_net_set_backend()
On 12/27/2012 09:14 PM, Michael S. Tsirkin wrote: On Thu, Dec 27, 2012 at 02:39:19PM +0800, Jason Wang wrote: Fix the leaking of oldubufs and fd refcnt when fail to initialized used ring. Signed-off-by: Jason Wang jasow...@redhat.com --- drivers/vhost/net.c | 14 +++--- 1 files changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index ebd08b2..629d6b5 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -834,8 +834,10 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) vhost_net_enable_vq(n, vq); r = vhost_init_used(vq); -if (r) -goto err_vq; +if (r) { +sock = NULL; +goto err_used; +} n-tx_packets = 0; n-tx_zcopy_err = 0; @@ -859,8 +861,14 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) mutex_unlock(n-dev.mutex); return 0; +err_used: +if (oldubufs) +vhost_ubuf_put_and_wait(oldubufs); +if (oldsock) +fput(oldsock-file); err_ubufs: -fput(sock-file); +if (sock) +fput(sock-file); err_vq: mutex_unlock(vq-mutex); err: I think it's a real bug, but I don't see how the fix makes sense. We are returning an error, so we ideally revert to the state before the faulty operation. So this should put sock and ubufs, not oldsock/oldubufs. Agree. The best way is probably to change vhost_init_used so that it gets private data pointer as a parameter. We can then call it before ubuf alloc. You can then add err_used right after err_ubufs with no extra logic. Make more sense, thanks. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization