[PATCH v2 2/2] vhost: return bool from *_access_ok() functions
Currently vhost *_access_ok() functions return int. This is error-prone because there are two popular conventions: 1. 0 means failure, 1 means success 2. -errno means failure, 0 means success Although vhost mostly uses #1, it does not do so consistently. umem_access_ok() uses #2. This patch changes the return type from int to bool so that false means failure and true means success. This eliminates a potential source of errors. Suggested-by: Linus TorvaldsSigned-off-by: Stefan Hajnoczi --- drivers/vhost/vhost.h | 4 ++-- drivers/vhost/vhost.c | 66 +-- 2 files changed, 35 insertions(+), 35 deletions(-) diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index ac4b6056f19a..6e00fa57af09 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -178,8 +178,8 @@ void vhost_dev_cleanup(struct vhost_dev *); void vhost_dev_stop(struct vhost_dev *); long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, void __user *argp); long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp); -int vhost_vq_access_ok(struct vhost_virtqueue *vq); -int vhost_log_access_ok(struct vhost_dev *); +bool vhost_vq_access_ok(struct vhost_virtqueue *vq); +bool vhost_log_access_ok(struct vhost_dev *); int vhost_get_vq_desc(struct vhost_virtqueue *, struct iovec iov[], unsigned int iov_count, diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 93fd0c75b0d8..b6a082ef33dd 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -641,14 +641,14 @@ void vhost_dev_cleanup(struct vhost_dev *dev) } EXPORT_SYMBOL_GPL(vhost_dev_cleanup); -static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz) +static bool log_access_ok(void __user *log_base, u64 addr, unsigned long sz) { u64 a = addr / VHOST_PAGE_SIZE / 8; /* Make sure 64 bit math will not overflow. */ if (a > ULONG_MAX - (unsigned long)log_base || a + (unsigned long)log_base > ULONG_MAX) - return 0; + return false; return access_ok(VERIFY_WRITE, log_base + a, (sz + VHOST_PAGE_SIZE * 8 - 1) / VHOST_PAGE_SIZE / 8); @@ -661,30 +661,30 @@ static bool vhost_overflow(u64 uaddr, u64 size) } /* Caller should have vq mutex and device mutex. */ -static int vq_memory_access_ok(void __user *log_base, struct vhost_umem *umem, - int log_all) +static bool vq_memory_access_ok(void __user *log_base, struct vhost_umem *umem, + int log_all) { struct vhost_umem_node *node; if (!umem) - return 0; + return false; list_for_each_entry(node, >umem_list, link) { unsigned long a = node->userspace_addr; if (vhost_overflow(node->userspace_addr, node->size)) - return 0; + return false; if (!access_ok(VERIFY_WRITE, (void __user *)a, node->size)) - return 0; + return false; else if (log_all && !log_access_ok(log_base, node->start, node->size)) - return 0; + return false; } - return 1; + return true; } static inline void __user *vhost_vq_meta_fetch(struct vhost_virtqueue *vq, @@ -701,13 +701,13 @@ static inline void __user *vhost_vq_meta_fetch(struct vhost_virtqueue *vq, /* Can we switch to this memory table? */ /* Caller should have device mutex but not vq mutex */ -static int memory_access_ok(struct vhost_dev *d, struct vhost_umem *umem, - int log_all) +static bool memory_access_ok(struct vhost_dev *d, struct vhost_umem *umem, +int log_all) { int i; for (i = 0; i < d->nvqs; ++i) { - int ok; + bool ok; bool log; mutex_lock(>vqs[i]->mutex); @@ -717,12 +717,12 @@ static int memory_access_ok(struct vhost_dev *d, struct vhost_umem *umem, ok = vq_memory_access_ok(d->vqs[i]->log_base, umem, log); else - ok = 1; + ok = true; mutex_unlock(>vqs[i]->mutex); if (!ok) - return 0; + return false; } - return 1; + return true; } static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len, @@ -959,21 +959,21 @@ static void vhost_iotlb_notify_vq(struct vhost_dev *d, spin_unlock(>iotlb_lock); } -static int umem_access_ok(u64 uaddr, u64 size, int access) +static
[PATCH v2 1/2] vhost: fix vhost_vq_access_ok() log check
Commit d65026c6c62e7d9616c8ceb5a53b68bcdc050525 ("vhost: validate log when IOTLB is enabled") introduced a regression. The logic was originally: if (vq->iotlb) return 1; return A && B; After the patch the short-circuit logic for A was inverted: if (A || vq->iotlb) return A; return B; This patch fixes the regression by rewriting the checks in the obvious way, no longer returning A when vq->iotlb is non-NULL (which is hard to understand). Reported-by: syzbot+65a84dde0214b0387...@syzkaller.appspotmail.com Cc: Jason WangSigned-off-by: Stefan Hajnoczi --- drivers/vhost/vhost.c | 8 +--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 5320039671b7..93fd0c75b0d8 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1244,10 +1244,12 @@ static int vq_log_access_ok(struct vhost_virtqueue *vq, /* Caller should have vq mutex and device mutex */ int vhost_vq_access_ok(struct vhost_virtqueue *vq) { - int ret = vq_log_access_ok(vq, vq->log_base); + if (!vq_log_access_ok(vq, vq->log_base)) + return 0; - if (ret || vq->iotlb) - return ret; + /* Access validation occurs at prefetch time with IOTLB */ + if (vq->iotlb) + return 1; return vq_access_ok(vq, vq->num, vq->desc, vq->avail, vq->used); } -- 2.14.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH v2 0/2] vhost: fix vhost_vq_access_ok() log check
v2: * Rewrote the conditional to make the vq access check clearer [Linus] * Added Patch 2 to make the return type consistent and harder to misuse [Linus] The first patch fixes the vhost virtqueue access check which was recently broken. The second patch replaces the int return type with bool to prevent future bugs. Stefan Hajnoczi (2): vhost: fix vhost_vq_access_ok() log check vhost: return bool from *_access_ok() functions drivers/vhost/vhost.h | 4 +-- drivers/vhost/vhost.c | 70 ++- 2 files changed, 38 insertions(+), 36 deletions(-) -- 2.14.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [RFC] vhost: introduce mdev based hardware vhost backend
On Tue, Apr 10, 2018 at 10:52:52AM +0800, Jason Wang wrote: > On 2018年04月02日 23:23, Tiwei Bie wrote: > > This patch introduces a mdev (mediated device) based hardware > > vhost backend. This backend is an abstraction of the various > > hardware vhost accelerators (potentially any device that uses > > virtio ring can be used as a vhost accelerator). Some generic > > mdev parent ops are provided for accelerator drivers to support > > generating mdev instances. > > > > What's this > > === > > > > The idea is that we can setup a virtio ring compatible device > > with the messages available at the vhost-backend. Originally, > > these messages are used to implement a software vhost backend, > > but now we will use these messages to setup a virtio ring > > compatible hardware device. Then the hardware device will be > > able to work with the guest virtio driver in the VM just like > > what the software backend does. That is to say, we can implement > > a hardware based vhost backend in QEMU, and any virtio ring > > compatible devices potentially can be used with this backend. > > (We also call it vDPA -- vhost Data Path Acceleration). > > > > One problem is that, different virtio ring compatible devices > > may have different device interfaces. That is to say, we will > > need different drivers in QEMU. It could be troublesome. And > > that's what this patch trying to fix. The idea behind this > > patch is very simple: mdev is a standard way to emulate device > > in kernel. > > So you just move the abstraction layer from qemu to kernel, and you still > need different drivers in kernel for different device interfaces of > accelerators. This looks even more complex than leaving it in qemu. As you > said, another idea is to implement userspace vhost backend for accelerators > which seems easier and could co-work with other parts of qemu without > inventing new type of messages. I'm not quite sure. Do you think it's acceptable to add various vendor specific hardware drivers in QEMU? > > Need careful thought here to seek a best solution here. Yeah, definitely! :) And your opinions would be very helpful! > > > So we defined a standard device based on mdev, which > > is able to accept vhost messages. When the mdev emulation code > > (i.e. the generic mdev parent ops provided by this patch) gets > > vhost messages, it will parse and deliver them to accelerator > > drivers. Drivers can use these messages to setup accelerators. > > > > That is to say, the generic mdev parent ops (e.g. read()/write()/ > > ioctl()/...) will be provided for accelerator drivers to register > > accelerators as mdev parent devices. And each accelerator device > > will support generating standard mdev instance(s). > > > > With this standard device interface, we will be able to just > > develop one userspace driver to implement the hardware based > > vhost backend in QEMU. > > > > Difference between vDPA and PCI passthru > > > > > > The key difference between vDPA and PCI passthru is that, in > > vDPA only the data path of the device (e.g. DMA ring, notify > > region and queue interrupt) is pass-throughed to the VM, the > > device control path (e.g. PCI configuration space and MMIO > > regions) is still defined and emulated by QEMU. > > > > The benefits of keeping virtio device emulation in QEMU compared > > with virtio device PCI passthru include (but not limit to): > > > > - consistent device interface for guest OS in the VM; > > - max flexibility on the hardware design, especially the > >accelerator for each vhost backend doesn't have to be a > >full PCI device; > > - leveraging the existing virtio live-migration framework; > > > > The interface of this mdev based device > > === > > > > 1. BAR0 > > > > The MMIO region described by BAR0 is the main control > > interface. Messages will be written to or read from > > this region. > > > > The message type is determined by the `request` field > > in message header. The message size is encoded in the > > message header too. The message format looks like this: > > > > struct vhost_vfio_op { > > __u64 request; > > __u32 flags; > > /* Flag values: */ > > #define VHOST_VFIO_NEED_REPLY 0x1 /* Whether need reply */ > > __u32 size; > > union { > > __u64 u64; > > struct vhost_vring_state state; > > struct vhost_vring_addr addr; > > struct vhost_memory memory; > > } payload; > > }; > > > > The existing vhost-kernel ioctl cmds are reused as > > the message requests in above structure. > > > > Each message will be written to or read from this > > region at offset 0: > > > > int vhost_vfio_write(struct vhost_dev *dev, struct vhost_vfio_op *op) > > { > > int count = VHOST_VFIO_OP_HDR_SIZE + op->size; > > struct vhost_vfio *vfio = dev->opaque; > > int ret; > > > > ret = pwrite64(vfio->device_fd, op, count,
Re: [RFC v2] virtio: support packed ring
On Tue, Apr 10, 2018 at 10:55:25AM +0800, Jason Wang wrote: > On 2018年04月01日 22:12, Tiwei Bie wrote: > > Hello everyone, > > > > This RFC implements packed ring support for virtio driver. > > > > The code was tested with DPDK vhost (testpmd/vhost-PMD) implemented > > by Jens at http://dpdk.org/ml/archives/dev/2018-January/089417.html > > Minor changes are needed for the vhost code, e.g. to kick the guest. > > > > TODO: > > - Refinements and bug fixes; > > - Split into small patches; > > - Test indirect descriptor support; > > - Test/fix event suppression support; > > - Test devices other than net; > > > > RFC v1 -> RFC v2: > > - Add indirect descriptor support - compile test only; > > - Add event suppression supprt - compile test only; > > - Move vring_packed_init() out of uapi (Jason, MST); > > - Merge two loops into one in virtqueue_add_packed() (Jason); > > - Split vring_unmap_one() for packed ring and split ring (Jason); > > - Avoid using '%' operator (Jason); > > - Rename free_head -> next_avail_idx (Jason); > > - Add comments for virtio_wmb() in virtqueue_add_packed() (Jason); > > - Some other refinements and bug fixes; > > > > Thanks! > > Will try to review this later. > > But it would be better if you can split it (more than 1000 lines is too big > to be reviewed easily). E.g you can at least split it into three patches, > new structures, datapath, and event suppression. > No problem! It's on my TODO list. I'll get it done in the next version. Thanks! ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH net-next 1/5] virtio: Add support for SCTP checksum offloading
On 2018年04月02日 21:40, Vladislav Yasevich wrote: To support SCTP checksum offloading, we need to add a new feature to virtio_net, so we can negotiate support between the hypervisor and the guest. The signalling to the guest that an alternate checksum needs to be used is done via a new flag in the virtio_net_hdr. If the flag is set, the host will know to perform an alternate checksum calculation, which right now is only CRC32c. Signed-off-by: Vladislav Yasevich--- drivers/net/virtio_net.c| 11 --- include/linux/virtio_net.h | 6 ++ include/uapi/linux/virtio_net.h | 2 ++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 7b187ec..b601294 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2724,9 +2724,14 @@ static int virtnet_probe(struct virtio_device *vdev) /* Do we support "hardware" checksums? */ if (virtio_has_feature(vdev, VIRTIO_NET_F_CSUM)) { /* This opens up the world of extra features. */ - dev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_SG; + netdev_features_t sctp = 0; + + if (virtio_has_feature(vdev, VIRTIO_NET_F_SCTP_CSUM)) + sctp |= NETIF_F_SCTP_CRC; + + dev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_SG | sctp; if (csum) - dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG; + dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG | sctp; if (virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) { dev->hw_features |= NETIF_F_TSO @@ -2952,7 +2957,7 @@ static struct virtio_device_id id_table[] = { }; #define VIRTNET_FEATURES \ - VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, \ + VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, VIRTIO_NET_F_SCTP_CSUM, \ It looks to me _F_SCTP_CSUM implies the ability of both device and driver. Do we still need the flexibility like csum to differ guest/host ability like e.g _F_GUEST_SCTP_CSUM? Thanks VIRTIO_NET_F_MAC, \ VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6, \ VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, \ diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index f144216..2e7a64a 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -39,6 +39,9 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, if (!skb_partial_csum_set(skb, start, off)) return -EINVAL; + + if (hdr->flags & VIRTIO_NET_HDR_F_CSUM_NOT_INET) + skb->csum_not_inet = 1; } if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { @@ -96,6 +99,9 @@ static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb, hdr->flags = VIRTIO_NET_HDR_F_DATA_VALID; } /* else everything is zero */ + if (skb->csum_not_inet) + hdr->flags &= VIRTIO_NET_HDR_F_CSUM_NOT_INET; + return 0; } diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h index 5de6ed3..3f279c8 100644 --- a/include/uapi/linux/virtio_net.h +++ b/include/uapi/linux/virtio_net.h @@ -36,6 +36,7 @@ #define VIRTIO_NET_F_GUEST_CSUM 1 /* Guest handles pkts w/ partial csum */ #define VIRTIO_NET_F_CTRL_GUEST_OFFLOADS 2 /* Dynamic offload configuration. */ #define VIRTIO_NET_F_MTU 3 /* Initial MTU advice */ +#define VIRTIO_NET_F_SCTP_CSUM 4 /* SCTP checksum offload support */ #define VIRTIO_NET_F_MAC 5 /* Host has given MAC address. */ #define VIRTIO_NET_F_GUEST_TSO4 7 /* Guest can handle TSOv4 in. */ #define VIRTIO_NET_F_GUEST_TSO6 8 /* Guest can handle TSOv6 in. */ @@ -101,6 +102,7 @@ struct virtio_net_config { struct virtio_net_hdr_v1 { #define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 /* Use csum_start, csum_offset */ #define VIRTIO_NET_HDR_F_DATA_VALID 2 /* Csum is valid */ +#define VIRTIO_NET_HDR_F_CSUM_NOT_INET 4 /* Checksum is not inet */ __u8 flags; #define VIRTIO_NET_HDR_GSO_NONE 0 /* Not a GSO frame */ #define VIRTIO_NET_HDR_GSO_TCPV4 1 /* GSO frame, IPv4 TCP (TSO) */ ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [RFC v2] virtio: support packed ring
On 2018年04月01日 22:12, Tiwei Bie wrote: Hello everyone, This RFC implements packed ring support for virtio driver. The code was tested with DPDK vhost (testpmd/vhost-PMD) implemented by Jens at http://dpdk.org/ml/archives/dev/2018-January/089417.html Minor changes are needed for the vhost code, e.g. to kick the guest. TODO: - Refinements and bug fixes; - Split into small patches; - Test indirect descriptor support; - Test/fix event suppression support; - Test devices other than net; RFC v1 -> RFC v2: - Add indirect descriptor support - compile test only; - Add event suppression supprt - compile test only; - Move vring_packed_init() out of uapi (Jason, MST); - Merge two loops into one in virtqueue_add_packed() (Jason); - Split vring_unmap_one() for packed ring and split ring (Jason); - Avoid using '%' operator (Jason); - Rename free_head -> next_avail_idx (Jason); - Add comments for virtio_wmb() in virtqueue_add_packed() (Jason); - Some other refinements and bug fixes; Thanks! Will try to review this later. But it would be better if you can split it (more than 1000 lines is too big to be reviewed easily). E.g you can at least split it into three patches, new structures, datapath, and event suppression. Thanks Signed-off-by: Tiwei Bie--- drivers/virtio/virtio_ring.c | 1094 +--- include/linux/virtio_ring.h|8 +- include/uapi/linux/virtio_config.h | 12 +- include/uapi/linux/virtio_ring.h | 61 ++ 4 files changed, 980 insertions(+), 195 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 71458f493cf8..0515dca34d77 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -58,14 +58,15 @@ struct vring_desc_state { void *data; /* Data for callback. */ - struct vring_desc *indir_desc; /* Indirect descriptor, if any. */ + void *indir_desc; /* Indirect descriptor, if any. */ + int num;/* Descriptor list length. */ }; struct vring_virtqueue { struct virtqueue vq; - /* Actual memory layout for this queue */ - struct vring vring; + /* Is this a packed ring? */ + bool packed; /* Can we use weak barriers? */ bool weak_barriers; @@ -79,19 +80,45 @@ struct vring_virtqueue { /* Host publishes avail event idx */ bool event; - /* Head of free buffer list. */ - unsigned int free_head; /* Number we've added since last sync. */ unsigned int num_added; /* Last used index we've seen. */ u16 last_used_idx; - /* Last written value to avail->flags */ - u16 avail_flags_shadow; + union { + /* Available for split ring */ + struct { + /* Actual memory layout for this queue. */ + struct vring vring; - /* Last written value to avail->idx in guest byte order */ - u16 avail_idx_shadow; + /* Head of free buffer list. */ + unsigned int free_head; + + /* Last written value to avail->flags */ + u16 avail_flags_shadow; + + /* Last written value to avail->idx in +* guest byte order. */ + u16 avail_idx_shadow; + }; + + /* Available for packed ring */ + struct { + /* Actual memory layout for this queue. */ + struct vring_packed vring_packed; + + /* Driver ring wrap counter. */ + u8 wrap_counter; + + /* Index of the next avail descriptor. */ + unsigned int next_avail_idx; + + /* Last written value to driver->flags in +* guest byte order. */ + u16 event_flags_shadow; + }; + }; /* How to notify other side. FIXME: commonalize hcalls! */ bool (*notify)(struct virtqueue *vq); @@ -201,8 +228,33 @@ static dma_addr_t vring_map_single(const struct vring_virtqueue *vq, cpu_addr, size, direction); } -static void vring_unmap_one(const struct vring_virtqueue *vq, - struct vring_desc *desc) +static void vring_unmap_one_split(const struct vring_virtqueue *vq, + struct vring_desc *desc) +{ + u16 flags; + + if (!vring_use_dma_api(vq->vq.vdev)) + return; + + flags = virtio16_to_cpu(vq->vq.vdev, desc->flags); + + if (flags & VRING_DESC_F_INDIRECT) { + dma_unmap_single(vring_dma_dev(vq), +virtio64_to_cpu(vq->vq.vdev, desc->addr), +virtio32_to_cpu(vq->vq.vdev, desc->len), +
Re: [RFC] vhost: introduce mdev based hardware vhost backend
On 2018年04月02日 23:23, Tiwei Bie wrote: This patch introduces a mdev (mediated device) based hardware vhost backend. This backend is an abstraction of the various hardware vhost accelerators (potentially any device that uses virtio ring can be used as a vhost accelerator). Some generic mdev parent ops are provided for accelerator drivers to support generating mdev instances. What's this === The idea is that we can setup a virtio ring compatible device with the messages available at the vhost-backend. Originally, these messages are used to implement a software vhost backend, but now we will use these messages to setup a virtio ring compatible hardware device. Then the hardware device will be able to work with the guest virtio driver in the VM just like what the software backend does. That is to say, we can implement a hardware based vhost backend in QEMU, and any virtio ring compatible devices potentially can be used with this backend. (We also call it vDPA -- vhost Data Path Acceleration). One problem is that, different virtio ring compatible devices may have different device interfaces. That is to say, we will need different drivers in QEMU. It could be troublesome. And that's what this patch trying to fix. The idea behind this patch is very simple: mdev is a standard way to emulate device in kernel. So you just move the abstraction layer from qemu to kernel, and you still need different drivers in kernel for different device interfaces of accelerators. This looks even more complex than leaving it in qemu. As you said, another idea is to implement userspace vhost backend for accelerators which seems easier and could co-work with other parts of qemu without inventing new type of messages. Need careful thought here to seek a best solution here. So we defined a standard device based on mdev, which is able to accept vhost messages. When the mdev emulation code (i.e. the generic mdev parent ops provided by this patch) gets vhost messages, it will parse and deliver them to accelerator drivers. Drivers can use these messages to setup accelerators. That is to say, the generic mdev parent ops (e.g. read()/write()/ ioctl()/...) will be provided for accelerator drivers to register accelerators as mdev parent devices. And each accelerator device will support generating standard mdev instance(s). With this standard device interface, we will be able to just develop one userspace driver to implement the hardware based vhost backend in QEMU. Difference between vDPA and PCI passthru The key difference between vDPA and PCI passthru is that, in vDPA only the data path of the device (e.g. DMA ring, notify region and queue interrupt) is pass-throughed to the VM, the device control path (e.g. PCI configuration space and MMIO regions) is still defined and emulated by QEMU. The benefits of keeping virtio device emulation in QEMU compared with virtio device PCI passthru include (but not limit to): - consistent device interface for guest OS in the VM; - max flexibility on the hardware design, especially the accelerator for each vhost backend doesn't have to be a full PCI device; - leveraging the existing virtio live-migration framework; The interface of this mdev based device === 1. BAR0 The MMIO region described by BAR0 is the main control interface. Messages will be written to or read from this region. The message type is determined by the `request` field in message header. The message size is encoded in the message header too. The message format looks like this: struct vhost_vfio_op { __u64 request; __u32 flags; /* Flag values: */ #define VHOST_VFIO_NEED_REPLY 0x1 /* Whether need reply */ __u32 size; union { __u64 u64; struct vhost_vring_state state; struct vhost_vring_addr addr; struct vhost_memory memory; } payload; }; The existing vhost-kernel ioctl cmds are reused as the message requests in above structure. Each message will be written to or read from this region at offset 0: int vhost_vfio_write(struct vhost_dev *dev, struct vhost_vfio_op *op) { int count = VHOST_VFIO_OP_HDR_SIZE + op->size; struct vhost_vfio *vfio = dev->opaque; int ret; ret = pwrite64(vfio->device_fd, op, count, vfio->bar0_offset); if (ret != count) return -1; return 0; } int vhost_vfio_read(struct vhost_dev *dev, struct vhost_vfio_op *op) { int count = VHOST_VFIO_OP_HDR_SIZE + op->size; struct vhost_vfio *vfio = dev->opaque; uint64_t request = op->request; int ret; ret = pread64(vfio->device_fd, op, count, vfio->bar0_offset); if (ret != count || request != op->request) return -1; return 0; } It's quite straightforward to set things to the device. Just need to write the message to
[PATCH v32 1/4] mm: support reporting free page blocks
This patch adds support to walk through the free page blocks in the system and report them via a callback function. Some page blocks may leave the free list after zone->lock is released, so it is the caller's responsibility to either detect or prevent the use of such pages. One use example of this patch is to accelerate live migration by skipping the transfer of free pages reported from the guest. A popular method used by the hypervisor to track which part of memory is written during live migration is to write-protect all the guest memory. So, those pages that are reported as free pages but are written after the report function returns will be captured by the hypervisor, and they will be added to the next round of memory transfer. Signed-off-by: Wei WangSigned-off-by: Liang Li Cc: Michal Hocko Cc: Andrew Morton Cc: Michael S. Tsirkin Acked-by: Michal Hocko --- include/linux/mm.h | 6 mm/page_alloc.c| 97 ++ 2 files changed, 103 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index f945dff..4698df1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1951,6 +1951,12 @@ extern void free_area_init_node(int nid, unsigned long * zones_size, unsigned long zone_start_pfn, unsigned long *zholes_size); extern void free_initmem(void); +extern int walk_free_mem_block(void *opaque, + int min_order, + int (*report_pfn_range)(void *opaque, + unsigned long pfn, + unsigned long num)); + /* * Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK) * into the buddy system. The freed pages will be poisoned with pattern diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4ea0182..83e50fd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4912,6 +4912,103 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) show_swap_cache_info(); } +/* + * Walk through a free page list and report the found pfn range via the + * callback. + * + * Return 0 if it completes the reporting. Otherwise, return the non-zero + * value returned from the callback. + */ +static int walk_free_page_list(void *opaque, + struct zone *zone, + int order, + enum migratetype mt, + int (*report_pfn_range)(void *, + unsigned long, + unsigned long)) +{ + struct page *page; + struct list_head *list; + unsigned long pfn, flags; + int ret = 0; + + spin_lock_irqsave(>lock, flags); + list = >free_area[order].free_list[mt]; + list_for_each_entry(page, list, lru) { + pfn = page_to_pfn(page); + ret = report_pfn_range(opaque, pfn, 1 << order); + if (ret) + break; + } + spin_unlock_irqrestore(>lock, flags); + + return ret; +} + +/** + * walk_free_mem_block - Walk through the free page blocks in the system + * @opaque: the context passed from the caller + * @min_order: the minimum order of free lists to check + * @report_pfn_range: the callback to report the pfn range of the free pages + * + * If the callback returns a non-zero value, stop iterating the list of free + * page blocks. Otherwise, continue to report. + * + * Please note that there are no locking guarantees for the callback and + * that the reported pfn range might be freed or disappear after the + * callback returns so the caller has to be very careful how it is used. + * + * The callback itself must not sleep or perform any operations which would + * require any memory allocations directly (not even GFP_NOWAIT/GFP_ATOMIC) + * or via any lock dependency. It is generally advisable to implement + * the callback as simple as possible and defer any heavy lifting to a + * different context. + * + * There is no guarantee that each free range will be reported only once + * during one walk_free_mem_block invocation. + * + * pfn_to_page on the given range is strongly discouraged and if there is + * an absolute need for that make sure to contact MM people to discuss + * potential problems. + * + * The function itself might sleep so it cannot be called from atomic + * contexts. + * + * In general low orders tend to be very volatile and so it makes more + * sense to query larger ones first for various optimizations which like + * ballooning etc... This will reduce the overhead as well. + * + * Return 0 if it completes the reporting. Otherwise, return the non-zero + * value returned from the callback. + */ +int walk_free_mem_block(void
[PATCH v32 0/4] Virtio-balloon: support free page reporting
This patch series is separated from the previous "Virtio-balloon Enhancement" series. The new feature, VIRTIO_BALLOON_F_FREE_PAGE_HINT, implemented by this series enables the virtio-balloon driver to report hints of guest free pages to the host. It can be used to accelerate live migration of VMs. Here is an introduction of this usage: Live migration needs to transfer the VM's memory from the source machine to the destination round by round. For the 1st round, all the VM's memory is transferred. From the 2nd round, only the pieces of memory that were written by the guest (after the 1st round) are transferred. One method that is popularly used by the hypervisor to track which part of memory is written is to write-protect all the guest memory. This feature enables the optimization by skipping the transfer of guest free pages during VM live migration. It is not concerned that the memory pages are used after they are given to the hypervisor as a hint of the free pages, because they will be tracked by the hypervisor and transferred in the subsequent round if they are used and written. * Tests - Test Environment Host: Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz Guest: 8G RAM, 4 vCPU Migration setup: migrate_set_speed 100G, migrate_set_downtime 2 second - Test Results - Idle Guest Live Migration Time (results are averaged over 10 runs): - Optimization v.s. Legacy = 271ms vs 1769ms --> ~86% reduction - Guest with Linux Compilation Workload (make bzImage -j4): - Live Migration Time (average) Optimization v.s. Legacy = 1265ms v.s. 2634ms --> ~51% reduction - Linux Compilation Time Optimization v.s. Legacy = 4min56s v.s. 5min3s --> no obvious difference ChangeLog: v31->v32: - virtio-balloon: - rename cmd_id_use to cmd_id_active; - report_free_page_func: detach used buffers after host sends a vq interrupt, instead of busy waiting for used buffers. v30->v31: - virtio-balloon: - virtio_balloon_send_free_pages: return -EINTR rather than 1 to indicate an active stop requested by host; and add more comments to explain about access to cmd_id_received without locks; - add_one_sg: add TODO to comments about possible improvement. v29->v30: - mm/walk_free_mem_block: add cond_sched() for each order v28->v29: - mm/page_poison: only expose page_poison_enabled(), rather than more changes did in v28, as we are not 100% confident about that for now. - virtio-balloon: use a separate buffer for the stop cmd, instead of having the start and stop cmd use the same buffer. This avoids the corner case that the start cmd is overridden by the stop cmd when the host has a delay in reading the start cmd. v27->v28: - mm/page_poison: Move PAGE_POISON to page_poison.c and add a function to expose page poison val to kernel modules. v26->v27: - add a new patch to expose page_poisoning_enabled to kernel modules - virtio-balloon: set poison_val to 0x, instead of 0xaa v25->v26: virtio-balloon changes only - remove kicking free page vq since the host now polls the vq after initiating the reporting - report_free_page_func: detach all the used buffers after sending the stop cmd id. This avoids leaving the detaching burden (i.e. overhead) to the next cmd id. Detaching here isn't considered overhead since the stop cmd id has been sent, and host has already moved formard. v24->v25: - mm: change walk_free_mem_block to return 0 (instead of true) on completing the report, and return a non-zero value from the callabck, which stops the reporting. - virtio-balloon: - use enum instead of define for VIRTIO_BALLOON_VQ_INFLATE etc. - avoid __virtio_clear_bit when bailing out; - a new method to avoid reporting the some cmd id to host twice - destroy_workqueue can cancel free page work when the feature is negotiated; - fail probe when the free page vq size is less than 2. v23->v24: - change feature name VIRTIO_BALLOON_F_FREE_PAGE_VQ to VIRTIO_BALLOON_F_FREE_PAGE_HINT - kick when vq->num_free < half full, instead of "= half full" - replace BUG_ON with bailing out - check vb->balloon_wq in probe(), if null, bail out - add a new feature bit for page poisoning - solve the corner case that one cmd id being sent to host twice v22->v23: - change to kick the device when the vq is half-way full; - open-code batch_free_page_sg into add_one_sg; - change cmd_id from "uint32_t" to "__virtio32"; - reserver one entry in the vq for the driver to send cmd_id, instead of busywaiting for an available entry; - add "stop_update" check before queue_work for prudence purpose for now, will have a separate patch to discuss this flag check later; - init_vqs: change to put some variables on stack to
[PATCH v32 2/4] virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_HINT
Negotiation of the VIRTIO_BALLOON_F_FREE_PAGE_HINT feature indicates the support of reporting hints of guest free pages to host via virtio-balloon. Host requests the guest to report free page hints by sending a new cmd id to the guest via the free_page_report_cmd_id configuration register. When the guest starts to report, the first element added to the free page vq is the cmd id given by host. When the guest finishes the reporting of all the free pages, VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID is added to the vq to tell host that the reporting is done. Host polls the free page vq after sending the starting cmd id, so the guest doesn't need to kick after filling an element to the vq. Host may also requests the guest to stop the reporting in advance by sending the stop cmd id to the guest via the configuration register. Signed-off-by: Wei WangSigned-off-by: Liang Li Cc: Michael S. Tsirkin Cc: Michal Hocko --- drivers/virtio/virtio_balloon.c | 288 +++- include/uapi/linux/virtio_balloon.h | 4 + 2 files changed, 256 insertions(+), 36 deletions(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index dfe5684..197f865 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -51,9 +51,22 @@ MODULE_PARM_DESC(oom_pages, "pages to free on OOM"); static struct vfsmount *balloon_mnt; #endif +enum virtio_balloon_vq { + VIRTIO_BALLOON_VQ_INFLATE, + VIRTIO_BALLOON_VQ_DEFLATE, + VIRTIO_BALLOON_VQ_STATS, + VIRTIO_BALLOON_VQ_FREE_PAGE, + VIRTIO_BALLOON_VQ_MAX +}; + struct virtio_balloon { struct virtio_device *vdev; - struct virtqueue *inflate_vq, *deflate_vq, *stats_vq; + struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq; + + /* Balloon's own wq for cpu-intensive work items */ + struct workqueue_struct *balloon_wq; + /* The free page reporting work item submitted to the balloon wq */ + struct work_struct report_free_page_work; /* The balloon servicing is delegated to a freezable workqueue. */ struct work_struct update_balloon_stats_work; @@ -63,6 +76,13 @@ struct virtio_balloon { spinlock_t stop_update_lock; bool stop_update; + /* The new cmd id received from host */ + uint32_t cmd_id_received; + /* The cmd id that is actively in use */ + __virtio32 cmd_id_active; + /* Buffer to store the stop sign */ + __virtio32 stop_cmd_id; + /* Waiting for host to ack the pages we released. */ wait_queue_head_t acked; @@ -320,17 +340,6 @@ static void stats_handle_request(struct virtio_balloon *vb) virtqueue_kick(vq); } -static void virtballoon_changed(struct virtio_device *vdev) -{ - struct virtio_balloon *vb = vdev->priv; - unsigned long flags; - - spin_lock_irqsave(>stop_update_lock, flags); - if (!vb->stop_update) - queue_work(system_freezable_wq, >update_balloon_size_work); - spin_unlock_irqrestore(>stop_update_lock, flags); -} - static inline s64 towards_target(struct virtio_balloon *vb) { s64 target; @@ -347,6 +356,34 @@ static inline s64 towards_target(struct virtio_balloon *vb) return target - vb->num_pages; } +static void virtballoon_changed(struct virtio_device *vdev) +{ + struct virtio_balloon *vb = vdev->priv; + unsigned long flags; + s64 diff = towards_target(vb); + + if (diff) { + spin_lock_irqsave(>stop_update_lock, flags); + if (!vb->stop_update) + queue_work(system_freezable_wq, + >update_balloon_size_work); + spin_unlock_irqrestore(>stop_update_lock, flags); + } + + if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) { + virtio_cread(vdev, struct virtio_balloon_config, +free_page_report_cmd_id, >cmd_id_received); + if (vb->cmd_id_received != + VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID) { + spin_lock_irqsave(>stop_update_lock, flags); + if (!vb->stop_update) + queue_work(vb->balloon_wq, + >report_free_page_work); + spin_unlock_irqrestore(>stop_update_lock, flags); + } + } +} + static void update_balloon_size(struct virtio_balloon *vb) { u32 actual = vb->num_pages; @@ -419,44 +456,196 @@ static void update_balloon_size_func(struct work_struct *work) queue_work(system_freezable_wq, work); } +static void free_page_vq_cb(struct virtqueue *vq) +{ + unsigned int unused; + + while (virtqueue_get_buf(vq, )) + ; +} + static int init_vqs(struct virtio_balloon
[PATCH v32 4/4] virtio-balloon: VIRTIO_BALLOON_F_PAGE_POISON
The VIRTIO_BALLOON_F_PAGE_POISON feature bit is used to indicate if the guest is using page poisoning. Guest writes to the poison_val config field to tell host about the page poisoning value in use. Signed-off-by: Wei WangSuggested-by: Michael S. Tsirkin Cc: Michael S. Tsirkin Cc: Michal Hocko Cc: Andrew Morton --- drivers/virtio/virtio_balloon.c | 10 ++ include/uapi/linux/virtio_balloon.h | 3 +++ 2 files changed, 13 insertions(+) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 197f865..aea91ab 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -730,6 +730,7 @@ static struct file_system_type balloon_fs = { static int virtballoon_probe(struct virtio_device *vdev) { struct virtio_balloon *vb; + __u32 poison_val; int err; if (!vdev->config->get) { @@ -775,6 +776,11 @@ static int virtballoon_probe(struct virtio_device *vdev) vb->stop_cmd_id = cpu_to_virtio32(vb->vdev, VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID); INIT_WORK(>report_free_page_work, report_free_page_func); + if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_POISON)) { + memset(_val, PAGE_POISON, sizeof(poison_val)); + virtio_cwrite(vb->vdev, struct virtio_balloon_config, + poison_val, _val); + } } vb->nb.notifier_call = virtballoon_oom_notify; @@ -893,6 +899,9 @@ static int virtballoon_restore(struct virtio_device *vdev) static int virtballoon_validate(struct virtio_device *vdev) { + if (!page_poisoning_enabled()) + __virtio_clear_bit(vdev, VIRTIO_BALLOON_F_PAGE_POISON); + __virtio_clear_bit(vdev, VIRTIO_F_IOMMU_PLATFORM); return 0; } @@ -902,6 +911,7 @@ static unsigned int features[] = { VIRTIO_BALLOON_F_STATS_VQ, VIRTIO_BALLOON_F_DEFLATE_ON_OOM, VIRTIO_BALLOON_F_FREE_PAGE_HINT, + VIRTIO_BALLOON_F_PAGE_POISON, }; static struct virtio_driver virtio_balloon_driver = { diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h index b2d86c2..8b93581 100644 --- a/include/uapi/linux/virtio_balloon.h +++ b/include/uapi/linux/virtio_balloon.h @@ -35,6 +35,7 @@ #define VIRTIO_BALLOON_F_STATS_VQ 1 /* Memory Stats virtqueue */ #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM2 /* Deflate balloon on OOM */ #define VIRTIO_BALLOON_F_FREE_PAGE_HINT3 /* VQ to report free pages */ +#define VIRTIO_BALLOON_F_PAGE_POISON 4 /* Guest is using page poisoning */ /* Size of a PFN in the balloon interface. */ #define VIRTIO_BALLOON_PFN_SHIFT 12 @@ -47,6 +48,8 @@ struct virtio_balloon_config { __u32 actual; /* Free page report command id, readonly by guest */ __u32 free_page_report_cmd_id; + /* Stores PAGE_POISON if page poisoning is in use */ + __u32 poison_val; }; #define VIRTIO_BALLOON_S_SWAP_IN 0 /* Amount of memory swapped in */ -- 2.7.4 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH v32 3/4] mm/page_poison: expose page_poisoning_enabled to kernel modules
In some usages, e.g. virtio-balloon, a kernel module needs to know if page poisoning is in use. This patch exposes the page_poisoning_enabled function to kernel modules. Signed-off-by: Wei WangCc: Andrew Morton Cc: Michal Hocko Cc: Michael S. Tsirkin Acked-by: Andrew Morton --- mm/page_poison.c | 6 ++ 1 file changed, 6 insertions(+) diff --git a/mm/page_poison.c b/mm/page_poison.c index e83fd44..762b472 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c @@ -17,6 +17,11 @@ static int early_page_poison_param(char *buf) } early_param("page_poison", early_page_poison_param); +/** + * page_poisoning_enabled - check if page poisoning is enabled + * + * Return true if page poisoning is enabled, or false if not. + */ bool page_poisoning_enabled(void) { /* @@ -29,6 +34,7 @@ bool page_poisoning_enabled(void) (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && debug_pagealloc_enabled())); } +EXPORT_SYMBOL_GPL(page_poisoning_enabled); static void poison_page(struct page *page) { -- 2.7.4 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH RESEND net] vhost: fix vhost_vq_access_ok() log check
On Tue, Apr 10, 2018 at 3:40 AM, Michael S. Tsirkinwrote: > From: Stefan Hajnoczi > > Commit d65026c6c62e7d9616c8ceb5a53b68bcdc050525 ("vhost: validate log > when IOTLB is enabled") introduced a regression. The logic was > originally: > > if (vq->iotlb) > return 1; > return A && B; > > After the patch the short-circuit logic for A was inverted: > > if (A || vq->iotlb) > return A; > return B; > > The correct logic is: > > if (!A || vq->iotlb) > return A; > return B; > > Reported-by: syzbot+65a84dde0214b0387...@syzkaller.appspotmail.com > Cc: Jason Wang > Signed-off-by: Stefan Hajnoczi > Acked-by: Michael S. Tsirkin > > --- > drivers/vhost/vhost.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) NACK I will send a v2 with cleaner logic as suggested by Linus. Stefan ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [RFC PATCH 2/3] netdev: kernel-only IFF_HIDDEN netdevice
On Mon, Apr 9, 2018 at 4:03 PM, Stephen Hemmingerwrote: > On Mon, 9 Apr 2018 15:30:42 -0700 > Siwei Liu wrote: > >> On Mon, Apr 9, 2018 at 3:15 PM, Andrew Lunn wrote: >> >> No, implementation wise I'd avoid changing the class on the fly. What >> >> I'm looking to is a means to add a secondary class or class aliasing >> >> mechanism for netdevs that allows mapping for a kernel device >> >> namespace (/class/net-kernel) to userspace (/class/net). Imagine >> >> creating symlinks between these two namespaces as an analogy. All >> >> userspace visible netdevs today will have both a kernel name and a >> >> userspace visible name, having one (/class/net) referecing the other >> >> (/class/net-kernel) in its own namespace. The newly introduced >> >> IFF_AUTO_MANAGED device will have a kernel name only >> >> (/class/net-kernel). As a result, the existing applications using >> >> /class/net don't break, while we're adding the kernel namespace that >> >> allows IFF_AUTO_MANAGED devices which will not be exposed to userspace >> >> at all. >> > >> > My gut feeling is this whole scheme will not fly. You really should be >> > talking to GregKH. >> >> Will do. Before spreading it out loudly I'd run it within netdev to >> clarify the need for why not exposing the lower netdevs is critical >> for cloud service providers in the face of introducing a new feature, >> and we are not hiding anything but exposing it in a way that don't >> break existing userspace applications while introducing feature is >> possible with the limitation of keeping old userspace still. >> >> > >> > Anyway, please remember that IFF_AUTO_MANAGED will need to be dynamic. >> > A device can start out as a normal device, and will change to being >> > automatic later, when the user on top of it probes. >> >> Sure. In whatever form it's still a netdev, and changing the namespace >> should be more dynamic than changing the class. >> >> Thanks a lot, >> -Siwei >> >> > >> > Andrew > > Also, remember for netdev's /sys is really a third class API. > The primary API's are netlink and ioctl. Also why not use existing > network namespaces rather than inventing a new abstraction? Because we want to leave old userspace unmodified while making SR-IOV live migration transparent to users. Specifically, we'd want old udevd to skip through uevents for the lower netdevs, while also making new udevd able to name the bypass_master interface by referencing the pci slot information which is only present in the sysfs entry for IFF_AUTO_MANAGED net device. The problem of using network namespace is that, no sysfs entry will be around for IFF_AUTO_MANAGED netdev if we isolate it out to a separate netns, unless we generalize the scope for what netns is designed for (isolation I mean). For auto-managed netdevs we don't neccessarily wants strict isolation, but rather a way of sticking to 1-netdev model for strict backward compatibility requiement of the old userspace, while exposing the information in a way new userspace understands. Thanks, -Siwei ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [RFC PATCH 2/3] netdev: kernel-only IFF_HIDDEN netdevice
On Mon, 9 Apr 2018 15:30:42 -0700 Siwei Liuwrote: > On Mon, Apr 9, 2018 at 3:15 PM, Andrew Lunn wrote: > >> No, implementation wise I'd avoid changing the class on the fly. What > >> I'm looking to is a means to add a secondary class or class aliasing > >> mechanism for netdevs that allows mapping for a kernel device > >> namespace (/class/net-kernel) to userspace (/class/net). Imagine > >> creating symlinks between these two namespaces as an analogy. All > >> userspace visible netdevs today will have both a kernel name and a > >> userspace visible name, having one (/class/net) referecing the other > >> (/class/net-kernel) in its own namespace. The newly introduced > >> IFF_AUTO_MANAGED device will have a kernel name only > >> (/class/net-kernel). As a result, the existing applications using > >> /class/net don't break, while we're adding the kernel namespace that > >> allows IFF_AUTO_MANAGED devices which will not be exposed to userspace > >> at all. > > > > My gut feeling is this whole scheme will not fly. You really should be > > talking to GregKH. > > Will do. Before spreading it out loudly I'd run it within netdev to > clarify the need for why not exposing the lower netdevs is critical > for cloud service providers in the face of introducing a new feature, > and we are not hiding anything but exposing it in a way that don't > break existing userspace applications while introducing feature is > possible with the limitation of keeping old userspace still. > > > > > Anyway, please remember that IFF_AUTO_MANAGED will need to be dynamic. > > A device can start out as a normal device, and will change to being > > automatic later, when the user on top of it probes. > > Sure. In whatever form it's still a netdev, and changing the namespace > should be more dynamic than changing the class. > > Thanks a lot, > -Siwei > > > > > Andrew Also, remember for netdev's /sys is really a third class API. The primary API's are netlink and ioctl. Also why not use existing network namespaces rather than inventing a new abstraction? ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [RFC PATCH 2/3] netdev: kernel-only IFF_HIDDEN netdevice
On Mon, Apr 9, 2018 at 3:15 PM, Andrew Lunnwrote: >> No, implementation wise I'd avoid changing the class on the fly. What >> I'm looking to is a means to add a secondary class or class aliasing >> mechanism for netdevs that allows mapping for a kernel device >> namespace (/class/net-kernel) to userspace (/class/net). Imagine >> creating symlinks between these two namespaces as an analogy. All >> userspace visible netdevs today will have both a kernel name and a >> userspace visible name, having one (/class/net) referecing the other >> (/class/net-kernel) in its own namespace. The newly introduced >> IFF_AUTO_MANAGED device will have a kernel name only >> (/class/net-kernel). As a result, the existing applications using >> /class/net don't break, while we're adding the kernel namespace that >> allows IFF_AUTO_MANAGED devices which will not be exposed to userspace >> at all. > > My gut feeling is this whole scheme will not fly. You really should be > talking to GregKH. Will do. Before spreading it out loudly I'd run it within netdev to clarify the need for why not exposing the lower netdevs is critical for cloud service providers in the face of introducing a new feature, and we are not hiding anything but exposing it in a way that don't break existing userspace applications while introducing feature is possible with the limitation of keeping old userspace still. > > Anyway, please remember that IFF_AUTO_MANAGED will need to be dynamic. > A device can start out as a normal device, and will change to being > automatic later, when the user on top of it probes. Sure. In whatever form it's still a netdev, and changing the namespace should be more dynamic than changing the class. Thanks a lot, -Siwei > > Andrew ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [RFC PATCH 2/3] netdev: kernel-only IFF_HIDDEN netdevice
> No, implementation wise I'd avoid changing the class on the fly. What > I'm looking to is a means to add a secondary class or class aliasing > mechanism for netdevs that allows mapping for a kernel device > namespace (/class/net-kernel) to userspace (/class/net). Imagine > creating symlinks between these two namespaces as an analogy. All > userspace visible netdevs today will have both a kernel name and a > userspace visible name, having one (/class/net) referecing the other > (/class/net-kernel) in its own namespace. The newly introduced > IFF_AUTO_MANAGED device will have a kernel name only > (/class/net-kernel). As a result, the existing applications using > /class/net don't break, while we're adding the kernel namespace that > allows IFF_AUTO_MANAGED devices which will not be exposed to userspace > at all. My gut feeling is this whole scheme will not fly. You really should be talking to GregKH. Anyway, please remember that IFF_AUTO_MANAGED will need to be dynamic. A device can start out as a normal device, and will change to being automatic later, when the user on top of it probes. Andrew ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [RFC PATCH 2/3] netdev: kernel-only IFF_HIDDEN netdevice
On Fri, Apr 6, 2018 at 8:19 PM, Andrew Lunnwrote: > Hi Siwei > >> I think everyone seems to agree not to fiddle with the ":" prefix, but >> rather have a new class of network subsystem under /sys/class thus a >> separate device namespace e.g. /sys/class/net-kernel for those >> auto-managed lower netdevs is needed. > > How do you get a device into this new class? I don't know the Linux > driver model too well, but to get a device out of one class and into > another, i think you need to device_del(dev). modify dev->class and > then device_add(dev). However, device_add() says you are not allowed > to do this. No, implementation wise I'd avoid changing the class on the fly. What I'm looking to is a means to add a secondary class or class aliasing mechanism for netdevs that allows mapping for a kernel device namespace (/class/net-kernel) to userspace (/class/net). Imagine creating symlinks between these two namespaces as an analogy. All userspace visible netdevs today will have both a kernel name and a userspace visible name, having one (/class/net) referecing the other (/class/net-kernel) in its own namespace. The newly introduced IFF_AUTO_MANAGED device will have a kernel name only (/class/net-kernel). As a result, the existing applications using /class/net don't break, while we're adding the kernel namespace that allows IFF_AUTO_MANAGED devices which will not be exposed to userspace at all. As this requires changing the internals of driver model core it's a rather big hammer approach I'd think. If there exists a better implementation than this to allow adding a separate layer of in-kernel device namespace, I'd more than welcome to hear about. > > And i don't even see how this helps. Are you also not going to call > list_netdevice()? Are you going to add some other list for these > devices in a different class? list_netdevice() is still called. I think with the current RFC patch, I've added two lists for netdevs under the kernel namespace: dev_cmpl_list and name_cmpl_hlist. As a result of that, all userspace netdevs get registered will be added to two types of lists: the userspace list for e.g. dev_list, and also the kernelspace list e.g. dev_cmpl_list (I can rename it to something more accurate). The IFF_AUTO_MANAGED device will be only added to kernelspace list e.g. dev_cmpl_list. Hope all your questions are answered. Thanks, -Siwei > >Andrew ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [virtio-dev] Re: [PATCH v2] virtio_balloon: export hugetlb page allocation counts
On 03/22/2018 07:38 PM, Jason Wang wrote: On 2018年03月22日 11:10, Michael S. Tsirkin wrote: On Thu, Mar 22, 2018 at 09:52:18AM +0800, Jason Wang wrote: On 2018年03月20日 12:26, Jonathan Helman wrote: On Mar 19, 2018, at 7:31 PM, Jason Wangwrote: On 2018年03月20日 06:14, Jonathan Helman wrote: Export the number of successful and failed hugetlb page allocations via the virtio balloon driver. These 2 counts come directly from the vm_events HTLB_BUDDY_PGALLOC and HTLB_BUDDY_PGALLOC_FAIL. Signed-off-by: Jonathan Helman Reviewed-by: Jason Wang Thanks. --- drivers/virtio/virtio_balloon.c | 6 ++ include/uapi/linux/virtio_balloon.h | 4 +++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index dfe5684..6b237e3 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -272,6 +272,12 @@ static unsigned int update_balloon_stats(struct virtio_balloon *vb) pages_to_bytes(events[PSWPOUT])); update_stat(vb, idx++, VIRTIO_BALLOON_S_MAJFLT, events[PGMAJFAULT]); update_stat(vb, idx++, VIRTIO_BALLOON_S_MINFLT, events[PGFAULT]); +#ifdef CONFIG_HUGETLB_PAGE + update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGALLOC, + events[HTLB_BUDDY_PGALLOC]); + update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGFAIL, + events[HTLB_BUDDY_PGALLOC_FAIL]); +#endif #endif update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMFREE, pages_to_bytes(i.freeram)); diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h index 4e8b830..40297a3 100644 --- a/include/uapi/linux/virtio_balloon.h +++ b/include/uapi/linux/virtio_balloon.h @@ -53,7 +53,9 @@ struct virtio_balloon_config { #define VIRTIO_BALLOON_S_MEMTOT 5 /* Total amount of memory */ #define VIRTIO_BALLOON_S_AVAIL 6 /* Available memory as in /proc */ #define VIRTIO_BALLOON_S_CACHES 7 /* Disk caches */ -#define VIRTIO_BALLOON_S_NR 8 +#define VIRTIO_BALLOON_S_HTLB_PGALLOC 8 /* Hugetlb page allocations */ +#define VIRTIO_BALLOON_S_HTLB_PGFAIL 9 /* Hugetlb page allocation failures */ +#define VIRTIO_BALLOON_S_NR 10 /* * Memory statistics structure. Not for this patch, but it looks to me that exporting such nr through uapi is fragile. Sorry, can you explain what you mean here? Jon Spec said "Within an output buffer submitted to the statsq, the device MUST ignore entries with tag values that it does not recognize". So exporting VIRTIO_BALLOON_S_NR seems useless and device implementation can not depend on such number in uapi. Thanks Suggestions? I don't like to break build for people ... Didn't have a good idea. But maybe we should keep VIRTIO_BALLOON_S_NR unchanged, and add a comment here. Thanks I think Jason's comment is for a future patch. Didn't see this patch get applied, so wondering if it could be. Thanks, Jon ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH RESEND net] vhost: fix vhost_vq_access_ok() log check
From: Stefan HajnocziCommit d65026c6c62e7d9616c8ceb5a53b68bcdc050525 ("vhost: validate log when IOTLB is enabled") introduced a regression. The logic was originally: if (vq->iotlb) return 1; return A && B; After the patch the short-circuit logic for A was inverted: if (A || vq->iotlb) return A; return B; The correct logic is: if (!A || vq->iotlb) return A; return B; Reported-by: syzbot+65a84dde0214b0387...@syzkaller.appspotmail.com Cc: Jason Wang Signed-off-by: Stefan Hajnoczi Acked-by: Michael S. Tsirkin --- drivers/vhost/vhost.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 5320039671b7..f6af4210679a 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1246,7 +1246,7 @@ int vhost_vq_access_ok(struct vhost_virtqueue *vq) { int ret = vq_log_access_ok(vq, vq->log_base); - if (ret || vq->iotlb) + if (!ret || vq->iotlb) return ret; return vq_access_ok(vq, vq->num, vq->desc, vq->avail, vq->used); -- 2.14.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH] vhost: fix vhost_vq_access_ok() log check
On Mon, Apr 9, 2018 at 6:10 AM, Stefan Hajnocziwrote: > @@ -1246,7 +1246,7 @@ int vhost_vq_access_ok(struct vhost_virtqueue *vq) > { > int ret = vq_log_access_ok(vq, vq->log_base); > > - if (ret || vq->iotlb) > + if (!ret || vq->iotlb) > return ret; That logic is still very non-obvious. This code already had one bug because of an odd illegible test sequence. Let's not keep the crazy code. Why not just do the *obvious* thing, and get rid of "ret" entirely, and make the damn thing return a boolean, and then just write it all as /* Caller should have vq mutex and device mutex */ bool vhost_vq_access_ok(struct vhost_virtqueue *vq) { if (!vq_log_access_ok(vq, vq->log_base)) return false; if (vq->iotlb || vq_access_ok(vq, vq->num, vq->desc, vq->avail, vq->used); } which makes the logic obvious: if vq_log_access_ok() fails, then then vhost_vq_access_ok() fails unconditionally. Otherwise, we need to have an iotlb, or a successful vq_access_ok() check. Doesn't that all make more sense, and avoid the insane "ret" value use that is really quite subtle? Linus ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH RESEND v2] vhost-net: set packet weight of tx polling to 2 * vq size
From: haibinzhang(张海斌)Date: Mon, 9 Apr 2018 07:22:17 + > handle_tx will delay rx for tens or even hundreds of milliseconds when tx busy > polling udp packets with small length(e.g. 1byte udp payload), because setting > VHOST_NET_WEIGHT takes into account only sent-bytes but no single packet > length. > > Ping-Latencies shown below were tested between two Virtual Machines using > netperf (UDP_STREAM, len=1), and then another machine pinged the client: ... > Acked-by: Michael S. Tsirkin > Signed-off-by: Haibin Zhang > Signed-off-by: Yunfang Tai > Signed-off-by: Lidong Chen Applied, thank you. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH] vhost: fix vhost_vq_access_ok() log check
On Mon, Apr 09, 2018 at 09:10:21PM +0800, Stefan Hajnoczi wrote: > Commit d65026c6c62e7d9616c8ceb5a53b68bcdc050525 ("vhost: validate log > when IOTLB is enabled") introduced a regression. The logic was > originally: > > if (vq->iotlb) > return 1; > return A && B; > > After the patch the short-circuit logic for A was inverted: > > if (A || vq->iotlb) > return A; > return B; > > The correct logic is: > > if (!A || vq->iotlb) > return A; > return B; > > Reported-by: syzbot+65a84dde0214b0387...@syzkaller.appspotmail.com > Cc: Jason Wang> Signed-off-by: Stefan Hajnoczi Acked-by: Michael S. Tsirkin > --- > drivers/vhost/vhost.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c > index 5320039671b7..f6af4210679a 100644 > --- a/drivers/vhost/vhost.c > +++ b/drivers/vhost/vhost.c > @@ -1246,7 +1246,7 @@ int vhost_vq_access_ok(struct vhost_virtqueue *vq) > { > int ret = vq_log_access_ok(vq, vq->log_base); > > - if (ret || vq->iotlb) > + if (!ret || vq->iotlb) > return ret; > > return vq_access_ok(vq, vq->num, vq->desc, vq->avail, vq->used); > -- > 2.14.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: kernel BUG at drivers/vhost/vhost.c:LINE! (2)
On Mon, Apr 9, 2018 at 11:28 AM, Stefan Hajnocziwrote: > On Mon, Apr 09, 2018 at 05:44:36AM +0300, Michael S. Tsirkin wrote: >> On Mon, Apr 09, 2018 at 10:37:45AM +0800, Stefan Hajnoczi wrote: >> > On Sat, Apr 7, 2018 at 3:02 AM, syzbot >> > wrote: >> > > syzbot hit the following crash on upstream commit >> > > 38c23685b273cfb4ccf31a199feccce3bdcb5d83 (Fri Apr 6 04:29:35 2018 +) >> > > Merge tag 'armsoc-drivers' of >> > > git://git.kernel.org/pub/scm/linux/kernel/git/arm/arm-soc >> > > syzbot dashboard link: >> > > https://syzkaller.appspot.com/bug?extid=65a84dde0214b0387ccd >> > >> > To prevent duplicated work: I am working on this one. >> > >> > Stefan >> >> Do you want to try this patchset: >> https://lkml.org/lkml/2018/4/5/665 >> >> ? > > Thanks, I'll give it a shot. > > I also noticed a regression in commit > d65026c6c62e7d9616c8ceb5a53b68bcdc050525 ("vhost: validate log when > IOTLB is enabled") and am currently testing a fix. I have sent a fix: https://lkml.org/lkml/2018/4/9/390 Stefan ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH] vhost: fix vhost_vq_access_ok() log check
Commit d65026c6c62e7d9616c8ceb5a53b68bcdc050525 ("vhost: validate log when IOTLB is enabled") introduced a regression. The logic was originally: if (vq->iotlb) return 1; return A && B; After the patch the short-circuit logic for A was inverted: if (A || vq->iotlb) return A; return B; The correct logic is: if (!A || vq->iotlb) return A; return B; Reported-by: syzbot+65a84dde0214b0387...@syzkaller.appspotmail.com Cc: Jason WangSigned-off-by: Stefan Hajnoczi --- drivers/vhost/vhost.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 5320039671b7..f6af4210679a 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1246,7 +1246,7 @@ int vhost_vq_access_ok(struct vhost_virtqueue *vq) { int ret = vq_log_access_ok(vq, vq->log_base); - if (ret || vq->iotlb) + if (!ret || vq->iotlb) return ret; return vq_access_ok(vq, vq->num, vq->desc, vq->avail, vq->used); -- 2.14.3 ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v31 2/4] virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_HINT
On 04/09/2018 02:03 PM, Michael S. Tsirkin wrote: On Fri, Apr 06, 2018 at 08:17:23PM +0800, Wei Wang wrote: Negotiation of the VIRTIO_BALLOON_F_FREE_PAGE_HINT feature indicates the support of reporting hints of guest free pages to host via virtio-balloon. Host requests the guest to report free page hints by sending a new cmd id to the guest via the free_page_report_cmd_id configuration register. When the guest starts to report, the first element added to the free page vq is the cmd id given by host. When the guest finishes the reporting of all the free pages, VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID is added to the vq to tell host that the reporting is done. Host polls the free page vq after sending the starting cmd id, so the guest doesn't need to kick after filling an element to the vq. Host may also requests the guest to stop the reporting in advance by sending the stop cmd id to the guest via the configuration register. Signed-off-by: Wei WangSigned-off-by: Liang Li Cc: Michael S. Tsirkin Cc: Michal Hocko Pretty good by now, Minor comments below. Thanks for the comments. --- drivers/virtio/virtio_balloon.c | 272 +++- include/uapi/linux/virtio_balloon.h | 4 + 2 files changed, 240 insertions(+), 36 deletions(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index dfe5684..aef73ee 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -51,9 +51,22 @@ MODULE_PARM_DESC(oom_pages, "pages to free on OOM"); static struct vfsmount *balloon_mnt; #endif +enum virtio_balloon_vq { + VIRTIO_BALLOON_VQ_INFLATE, + VIRTIO_BALLOON_VQ_DEFLATE, + VIRTIO_BALLOON_VQ_STATS, + VIRTIO_BALLOON_VQ_FREE_PAGE, + VIRTIO_BALLOON_VQ_MAX +}; + struct virtio_balloon { struct virtio_device *vdev; - struct virtqueue *inflate_vq, *deflate_vq, *stats_vq; + struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq; + + /* Balloon's own wq for cpu-intensive work items */ + struct workqueue_struct *balloon_wq; + /* The free page reporting work item submitted to the balloon wq */ + struct work_struct report_free_page_work; /* The balloon servicing is delegated to a freezable workqueue. */ struct work_struct update_balloon_stats_work; @@ -63,6 +76,13 @@ struct virtio_balloon { spinlock_t stop_update_lock; bool stop_update; + /* The new cmd id received from host */ + uint32_t cmd_id_received; + /* The cmd id that is in use */ + __virtio32 cmd_id_use; I'd prefer cmd_id_active but it's not critical. OK, will change. + +static void report_free_page_func(struct work_struct *work) +{ + struct virtio_balloon *vb; + struct virtqueue *vq; + unsigned int unused; + int ret; + + vb = container_of(work, struct virtio_balloon, report_free_page_work); + vq = vb->free_page_vq; + + /* Start by sending the received cmd id to host with an outbuf. */ + ret = send_start_cmd_id(vb, vb->cmd_id_received); + if (unlikely(ret)) + goto err; + + ret = walk_free_mem_block(vb, 0, _balloon_send_free_pages); + if (unlikely(ret == -EIO)) + goto err; why is EIO special? I think you should special-case EINTR maybe. Actually EINTR isn't an error which needs to bail out. That's just the case that the vq is full, that hint isn't added. Maybe it is not necessary to treat the "vq full" case as an error. How about just returning "0" when the vq is full, instead of returning "EINTR"? (The next hint will continue to be added) + + /* End by sending a stop id to host with an outbuf. */ + ret = send_stop_cmd_id(vb); + if (likely(!ret)) { What happens on failure? Don't we need to detach anyway? Yes. Please see below, we could make some more change. + /* Ending: detach all the used buffers from the vq. */ + while (vq->num_free != virtqueue_get_vring_size(vq)) + virtqueue_get_buf(vq, ); This isn't all that happens here. It also waits for buffers to be consumed. Is this by design? And why is it good idea to busy poll while doing it? Because host and guest operations happen asynchronously. When the guest reaches here, host may have not put anything to the vq yet. How about doing this via the free page vq handler? Host will send a free page vq interrupt before exiting the optimization. I think this would be nicer. Best, Wei ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [RFC PATCH net-next v5 3/4] virtio_net: Extend virtio to use VF datapath when available
Sat, Apr 07, 2018 at 12:59:14AM CEST, sridhar.samudr...@intel.com wrote: >On 4/6/2018 5:48 AM, Jiri Pirko wrote: >> Thu, Apr 05, 2018 at 11:08:22PM CEST, sridhar.samudr...@intel.com wrote: [...] >> > +static int virtnet_bypass_join_child(struct net_device *bypass_netdev, >> > + struct net_device *child_netdev) >> > +{ >> > + struct virtnet_bypass_info *vbi; >> > + bool backup; >> > + >> > + vbi = netdev_priv(bypass_netdev); >> > + backup = (child_netdev->dev.parent == bypass_netdev->dev.parent); >> > + if (backup ? rtnl_dereference(vbi->backup_netdev) : >> > + rtnl_dereference(vbi->active_netdev)) { >> > + netdev_info(bypass_netdev, >> > + "%s attempting to join bypass dev when %s already >> > present\n", >> > + child_netdev->name, backup ? "backup" : "active"); >> Bypass module should check if there is already some other netdev >> enslaved and refuse right there. > >This will work for virtio-net with 3 netdev model, but this check has to be >done by netvsc >as its bypass_netdev is same as the backup_netdev. >Will add a flag while registering with the bypass module to indicate if the >driver is doing >a 2 netdev or 3 netdev model and based on that flag this check can be done in >bypass module >for 3 netdev scenario. Just let me undestand it clearly. What I expect the difference would be between 2netdev and3 netdev model is this: 2netdev: bypass_master / / VF_slave 3netdev: bypass_master / \ / \ VF_slave backup_slave Is that correct? If not, how does it look like? Thanks! ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v31 2/4] virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_HINT
On Fri, Apr 06, 2018 at 08:17:23PM +0800, Wei Wang wrote: > Negotiation of the VIRTIO_BALLOON_F_FREE_PAGE_HINT feature indicates the > support of reporting hints of guest free pages to host via virtio-balloon. > > Host requests the guest to report free page hints by sending a new cmd > id to the guest via the free_page_report_cmd_id configuration register. > > When the guest starts to report, the first element added to the free page > vq is the cmd id given by host. When the guest finishes the reporting > of all the free pages, VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID is added > to the vq to tell host that the reporting is done. Host polls the free > page vq after sending the starting cmd id, so the guest doesn't need to > kick after filling an element to the vq. > > Host may also requests the guest to stop the reporting in advance by > sending the stop cmd id to the guest via the configuration register. > > Signed-off-by: Wei Wang> Signed-off-by: Liang Li > Cc: Michael S. Tsirkin > Cc: Michal Hocko Pretty good by now, Minor comments below. > --- > drivers/virtio/virtio_balloon.c | 272 > +++- > include/uapi/linux/virtio_balloon.h | 4 + > 2 files changed, 240 insertions(+), 36 deletions(-) > > diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c > index dfe5684..aef73ee 100644 > --- a/drivers/virtio/virtio_balloon.c > +++ b/drivers/virtio/virtio_balloon.c > @@ -51,9 +51,22 @@ MODULE_PARM_DESC(oom_pages, "pages to free on OOM"); > static struct vfsmount *balloon_mnt; > #endif > > +enum virtio_balloon_vq { > + VIRTIO_BALLOON_VQ_INFLATE, > + VIRTIO_BALLOON_VQ_DEFLATE, > + VIRTIO_BALLOON_VQ_STATS, > + VIRTIO_BALLOON_VQ_FREE_PAGE, > + VIRTIO_BALLOON_VQ_MAX > +}; > + > struct virtio_balloon { > struct virtio_device *vdev; > - struct virtqueue *inflate_vq, *deflate_vq, *stats_vq; > + struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq; > + > + /* Balloon's own wq for cpu-intensive work items */ > + struct workqueue_struct *balloon_wq; > + /* The free page reporting work item submitted to the balloon wq */ > + struct work_struct report_free_page_work; > > /* The balloon servicing is delegated to a freezable workqueue. */ > struct work_struct update_balloon_stats_work; > @@ -63,6 +76,13 @@ struct virtio_balloon { > spinlock_t stop_update_lock; > bool stop_update; > > + /* The new cmd id received from host */ > + uint32_t cmd_id_received; > + /* The cmd id that is in use */ > + __virtio32 cmd_id_use; I'd prefer cmd_id_active but it's not critical. > + /* Buffer to store the stop sign */ > + __virtio32 stop_cmd_id; > + > /* Waiting for host to ack the pages we released. */ > wait_queue_head_t acked; > > @@ -320,17 +340,6 @@ static void stats_handle_request(struct virtio_balloon > *vb) > virtqueue_kick(vq); > } > > -static void virtballoon_changed(struct virtio_device *vdev) > -{ > - struct virtio_balloon *vb = vdev->priv; > - unsigned long flags; > - > - spin_lock_irqsave(>stop_update_lock, flags); > - if (!vb->stop_update) > - queue_work(system_freezable_wq, >update_balloon_size_work); > - spin_unlock_irqrestore(>stop_update_lock, flags); > -} > - > static inline s64 towards_target(struct virtio_balloon *vb) > { > s64 target; > @@ -347,6 +356,34 @@ static inline s64 towards_target(struct virtio_balloon > *vb) > return target - vb->num_pages; > } > > +static void virtballoon_changed(struct virtio_device *vdev) > +{ > + struct virtio_balloon *vb = vdev->priv; > + unsigned long flags; > + s64 diff = towards_target(vb); > + > + if (diff) { > + spin_lock_irqsave(>stop_update_lock, flags); > + if (!vb->stop_update) > + queue_work(system_freezable_wq, > +>update_balloon_size_work); > + spin_unlock_irqrestore(>stop_update_lock, flags); > + } > + > + if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) { > + virtio_cread(vdev, struct virtio_balloon_config, > + free_page_report_cmd_id, >cmd_id_received); > + if (vb->cmd_id_received != > + VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID) { > + spin_lock_irqsave(>stop_update_lock, flags); > + if (!vb->stop_update) > + queue_work(vb->balloon_wq, > +>report_free_page_work); > + spin_unlock_irqrestore(>stop_update_lock, flags); > + } > + } > +} > + > static void update_balloon_size(struct virtio_balloon *vb) > { > u32 actual = vb->num_pages; > @@ -421,42 +458,178 @@ static void