[PATCH v2 2/2] vhost: return bool from *_access_ok() functions

2018-04-09 Thread Stefan Hajnoczi
Currently vhost *_access_ok() functions return int.  This is error-prone
because there are two popular conventions:

1. 0 means failure, 1 means success
2. -errno means failure, 0 means success

Although vhost mostly uses #1, it does not do so consistently.
umem_access_ok() uses #2.

This patch changes the return type from int to bool so that false means
failure and true means success.  This eliminates a potential source of
errors.

Suggested-by: Linus Torvalds 
Signed-off-by: Stefan Hajnoczi 
---
 drivers/vhost/vhost.h |  4 ++--
 drivers/vhost/vhost.c | 66 +--
 2 files changed, 35 insertions(+), 35 deletions(-)

diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index ac4b6056f19a..6e00fa57af09 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -178,8 +178,8 @@ void vhost_dev_cleanup(struct vhost_dev *);
 void vhost_dev_stop(struct vhost_dev *);
 long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, void __user 
*argp);
 long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp);
-int vhost_vq_access_ok(struct vhost_virtqueue *vq);
-int vhost_log_access_ok(struct vhost_dev *);
+bool vhost_vq_access_ok(struct vhost_virtqueue *vq);
+bool vhost_log_access_ok(struct vhost_dev *);
 
 int vhost_get_vq_desc(struct vhost_virtqueue *,
  struct iovec iov[], unsigned int iov_count,
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 93fd0c75b0d8..b6a082ef33dd 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -641,14 +641,14 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
 }
 EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
 
-static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz)
+static bool log_access_ok(void __user *log_base, u64 addr, unsigned long sz)
 {
u64 a = addr / VHOST_PAGE_SIZE / 8;
 
/* Make sure 64 bit math will not overflow. */
if (a > ULONG_MAX - (unsigned long)log_base ||
a + (unsigned long)log_base > ULONG_MAX)
-   return 0;
+   return false;
 
return access_ok(VERIFY_WRITE, log_base + a,
 (sz + VHOST_PAGE_SIZE * 8 - 1) / VHOST_PAGE_SIZE / 8);
@@ -661,30 +661,30 @@ static bool vhost_overflow(u64 uaddr, u64 size)
 }
 
 /* Caller should have vq mutex and device mutex. */
-static int vq_memory_access_ok(void __user *log_base, struct vhost_umem *umem,
-  int log_all)
+static bool vq_memory_access_ok(void __user *log_base, struct vhost_umem *umem,
+   int log_all)
 {
struct vhost_umem_node *node;
 
if (!umem)
-   return 0;
+   return false;
 
	list_for_each_entry(node, &umem->umem_list, link) {
unsigned long a = node->userspace_addr;
 
if (vhost_overflow(node->userspace_addr, node->size))
-   return 0;
+   return false;
 
 
if (!access_ok(VERIFY_WRITE, (void __user *)a,
node->size))
-   return 0;
+   return false;
else if (log_all && !log_access_ok(log_base,
   node->start,
   node->size))
-   return 0;
+   return false;
}
-   return 1;
+   return true;
 }
 
 static inline void __user *vhost_vq_meta_fetch(struct vhost_virtqueue *vq,
@@ -701,13 +701,13 @@ static inline void __user *vhost_vq_meta_fetch(struct 
vhost_virtqueue *vq,
 
 /* Can we switch to this memory table? */
 /* Caller should have device mutex but not vq mutex */
-static int memory_access_ok(struct vhost_dev *d, struct vhost_umem *umem,
-   int log_all)
+static bool memory_access_ok(struct vhost_dev *d, struct vhost_umem *umem,
+int log_all)
 {
int i;
 
for (i = 0; i < d->nvqs; ++i) {
-   int ok;
+   bool ok;
bool log;
 
		mutex_lock(&d->vqs[i]->mutex);
@@ -717,12 +717,12 @@ static int memory_access_ok(struct vhost_dev *d, struct 
vhost_umem *umem,
ok = vq_memory_access_ok(d->vqs[i]->log_base,
 umem, log);
else
-   ok = 1;
+   ok = true;
		mutex_unlock(&d->vqs[i]->mutex);
if (!ok)
-   return 0;
+   return false;
}
-   return 1;
+   return true;
 }
 
 static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
@@ -959,21 +959,21 @@ static void vhost_iotlb_notify_vq(struct vhost_dev *d,
	spin_unlock(&d->iotlb_lock);
 }
 
-static int umem_access_ok(u64 uaddr, u64 size, int access)
+static bool umem_access_ok(u64 uaddr, u64 size, int access)

[PATCH v2 1/2] vhost: fix vhost_vq_access_ok() log check

2018-04-09 Thread Stefan Hajnoczi
Commit d65026c6c62e7d9616c8ceb5a53b68bcdc050525 ("vhost: validate log
when IOTLB is enabled") introduced a regression.  The logic was
originally:

  if (vq->iotlb)
  return 1;
  return A && B;

After the patch the short-circuit logic for A was inverted:

  if (A || vq->iotlb)
  return A;
  return B;

This patch fixes the regression by rewriting the checks in the obvious
way, no longer returning A when vq->iotlb is non-NULL (which is hard to
understand).

Reported-by: syzbot+65a84dde0214b0387...@syzkaller.appspotmail.com
Cc: Jason Wang 
Signed-off-by: Stefan Hajnoczi 
---
 drivers/vhost/vhost.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 5320039671b7..93fd0c75b0d8 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1244,10 +1244,12 @@ static int vq_log_access_ok(struct vhost_virtqueue *vq,
 /* Caller should have vq mutex and device mutex */
 int vhost_vq_access_ok(struct vhost_virtqueue *vq)
 {
-   int ret = vq_log_access_ok(vq, vq->log_base);
+   if (!vq_log_access_ok(vq, vq->log_base))
+   return 0;
 
-   if (ret || vq->iotlb)
-   return ret;
+   /* Access validation occurs at prefetch time with IOTLB */
+   if (vq->iotlb)
+   return 1;
 
return vq_access_ok(vq, vq->num, vq->desc, vq->avail, vq->used);
 }
-- 
2.14.3

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[PATCH v2 0/2] vhost: fix vhost_vq_access_ok() log check

2018-04-09 Thread Stefan Hajnoczi
v2:
 * Rewrote the conditional to make the vq access check clearer [Linus]
 * Added Patch 2 to make the return type consistent and harder to misuse [Linus]

The first patch fixes the vhost virtqueue access check which was recently
broken.  The second patch replaces the int return type with bool to prevent
future bugs.

Stefan Hajnoczi (2):
  vhost: fix vhost_vq_access_ok() log check
  vhost: return bool from *_access_ok() functions

 drivers/vhost/vhost.h |  4 +--
 drivers/vhost/vhost.c | 70 ++-
 2 files changed, 38 insertions(+), 36 deletions(-)

-- 
2.14.3

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [RFC] vhost: introduce mdev based hardware vhost backend

2018-04-09 Thread Tiwei Bie
On Tue, Apr 10, 2018 at 10:52:52AM +0800, Jason Wang wrote:
> On 2018年04月02日 23:23, Tiwei Bie wrote:
> > This patch introduces a mdev (mediated device) based hardware
> > vhost backend. This backend is an abstraction of the various
> > hardware vhost accelerators (potentially any device that uses
> > virtio ring can be used as a vhost accelerator). Some generic
> > mdev parent ops are provided for accelerator drivers to support
> > generating mdev instances.
> > 
> > What's this
> > ===
> > 
> > The idea is that we can setup a virtio ring compatible device
> > with the messages available at the vhost-backend. Originally,
> > these messages are used to implement a software vhost backend,
> > but now we will use these messages to setup a virtio ring
> > compatible hardware device. Then the hardware device will be
> > able to work with the guest virtio driver in the VM just like
> > what the software backend does. That is to say, we can implement
> > a hardware based vhost backend in QEMU, and any virtio ring
> > compatible devices potentially can be used with this backend.
> > (We also call it vDPA -- vhost Data Path Acceleration).
> > 
> > One problem is that, different virtio ring compatible devices
> > may have different device interfaces. That is to say, we will
> > need different drivers in QEMU. It could be troublesome. And
> > that's what this patch trying to fix. The idea behind this
> > patch is very simple: mdev is a standard way to emulate device
> > in kernel.
> 
> So you just move the abstraction layer from qemu to kernel, and you still
> need different drivers in kernel for different device interfaces of
> accelerators. This looks even more complex than leaving it in qemu. As you
> said, another idea is to implement userspace vhost backend for accelerators
> which seems easier and could co-work with other parts of qemu without
> inventing new type of messages.

I'm not quite sure. Do you think it's acceptable to
add various vendor specific hardware drivers in QEMU?

> 
> Need careful thought here to seek a best solution here.

Yeah, definitely! :)
And your opinions would be very helpful!

> 
> >   So we defined a standard device based on mdev, which
> > is able to accept vhost messages. When the mdev emulation code
> > (i.e. the generic mdev parent ops provided by this patch) gets
> > vhost messages, it will parse and deliver them to accelerator
> > drivers. Drivers can use these messages to setup accelerators.
> > 
> > That is to say, the generic mdev parent ops (e.g. read()/write()/
> > ioctl()/...) will be provided for accelerator drivers to register
> > accelerators as mdev parent devices. And each accelerator device
> > will support generating standard mdev instance(s).
> > 
> > With this standard device interface, we will be able to just
> > develop one userspace driver to implement the hardware based
> > vhost backend in QEMU.
> > 
> > Difference between vDPA and PCI passthru
> > 
> > 
> > The key difference between vDPA and PCI passthru is that, in
> > vDPA only the data path of the device (e.g. DMA ring, notify
> > region and queue interrupt) is pass-throughed to the VM, the
> > device control path (e.g. PCI configuration space and MMIO
> > regions) is still defined and emulated by QEMU.
> > 
> > The benefits of keeping virtio device emulation in QEMU compared
> > with virtio device PCI passthru include (but not limit to):
> > 
> > - consistent device interface for guest OS in the VM;
> > - max flexibility on the hardware design, especially the
> >accelerator for each vhost backend doesn't have to be a
> >full PCI device;
> > - leveraging the existing virtio live-migration framework;
> > 
> > The interface of this mdev based device
> > ===
> > 
> > 1. BAR0
> > 
> > The MMIO region described by BAR0 is the main control
> > interface. Messages will be written to or read from
> > this region.
> > 
> > The message type is determined by the `request` field
> > in message header. The message size is encoded in the
> > message header too. The message format looks like this:
> > 
> > struct vhost_vfio_op {
> > __u64 request;
> > __u32 flags;
> > /* Flag values: */
> > #define VHOST_VFIO_NEED_REPLY 0x1 /* Whether need reply */
> > __u32 size;
> > union {
> > __u64 u64;
> > struct vhost_vring_state state;
> > struct vhost_vring_addr addr;
> > struct vhost_memory memory;
> > } payload;
> > };
> > 
> > The existing vhost-kernel ioctl cmds are reused as
> > the message requests in above structure.
> > 
> > Each message will be written to or read from this
> > region at offset 0:
> > 
> > int vhost_vfio_write(struct vhost_dev *dev, struct vhost_vfio_op *op)
> > {
> > int count = VHOST_VFIO_OP_HDR_SIZE + op->size;
> > struct vhost_vfio *vfio = dev->opaque;
> > int ret;
> > 
> > ret = pwrite64(vfio->device_fd, op, count, 

Re: [RFC v2] virtio: support packed ring

2018-04-09 Thread Tiwei Bie
On Tue, Apr 10, 2018 at 10:55:25AM +0800, Jason Wang wrote:
> On 2018年04月01日 22:12, Tiwei Bie wrote:
> > Hello everyone,
> > 
> > This RFC implements packed ring support for virtio driver.
> > 
> > The code was tested with DPDK vhost (testpmd/vhost-PMD) implemented
> > by Jens at http://dpdk.org/ml/archives/dev/2018-January/089417.html
> > Minor changes are needed for the vhost code, e.g. to kick the guest.
> > 
> > TODO:
> > - Refinements and bug fixes;
> > - Split into small patches;
> > - Test indirect descriptor support;
> > - Test/fix event suppression support;
> > - Test devices other than net;
> > 
> > RFC v1 -> RFC v2:
> > - Add indirect descriptor support - compile test only;
> > - Add event suppression support - compile test only;
> > - Move vring_packed_init() out of uapi (Jason, MST);
> > - Merge two loops into one in virtqueue_add_packed() (Jason);
> > - Split vring_unmap_one() for packed ring and split ring (Jason);
> > - Avoid using '%' operator (Jason);
> > - Rename free_head -> next_avail_idx (Jason);
> > - Add comments for virtio_wmb() in virtqueue_add_packed() (Jason);
> > - Some other refinements and bug fixes;
> > 
> > Thanks!
> 
> Will try to review this later.
> 
> But it would be better if you can split it (more than 1000 lines is too big
> to be reviewed easily). E.g you can at least split it into three patches,
> new structures, datapath, and event suppression.
> 

No problem! It's on my TODO list. I'll get it done in the next version.

Thanks!
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

Re: [PATCH net-next 1/5] virtio: Add support for SCTP checksum offloading

2018-04-09 Thread Jason Wang



On 2018年04月02日 21:40, Vladislav Yasevich wrote:

To support SCTP checksum offloading, we need to add a new feature
to virtio_net, so we can negotiate support between the hypervisor
and the guest.

The signalling to the guest that an alternate checksum needs to
be used is done via a new flag in the virtio_net_hdr.  If the
flag is set, the host will know to perform an alternate checksum
calculation, which right now is only CRC32c.

Signed-off-by: Vladislav Yasevich 
---
  drivers/net/virtio_net.c| 11 ---
  include/linux/virtio_net.h  |  6 ++
  include/uapi/linux/virtio_net.h |  2 ++
  3 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 7b187ec..b601294 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -2724,9 +2724,14 @@ static int virtnet_probe(struct virtio_device *vdev)
/* Do we support "hardware" checksums? */
if (virtio_has_feature(vdev, VIRTIO_NET_F_CSUM)) {
/* This opens up the world of extra features. */
-   dev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_SG;
+   netdev_features_t sctp = 0;
+
+   if (virtio_has_feature(vdev, VIRTIO_NET_F_SCTP_CSUM))
+   sctp |= NETIF_F_SCTP_CRC;
+
+   dev->hw_features |= NETIF_F_HW_CSUM | NETIF_F_SG | sctp;
if (csum)
-   dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG;
+   dev->features |= NETIF_F_HW_CSUM | NETIF_F_SG | sctp;
  
  		if (virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) {

dev->hw_features |= NETIF_F_TSO
@@ -2952,7 +2957,7 @@ static struct virtio_device_id id_table[] = {
  };
  
  #define VIRTNET_FEATURES \

-   VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, \
+   VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM,  VIRTIO_NET_F_SCTP_CSUM, \


It looks to me _F_SCTP_CSUM implies the ability of both device and 
driver. Do we still need the flexibility like csum to differ guest/host 
ability like e.g _F_GUEST_SCTP_CSUM?


Thanks


VIRTIO_NET_F_MAC, \
VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6, \
VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, 
VIRTIO_NET_F_GUEST_TSO6, \
diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index f144216..2e7a64a 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -39,6 +39,9 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
  
  		if (!skb_partial_csum_set(skb, start, off))

return -EINVAL;
+
+   if (hdr->flags & VIRTIO_NET_HDR_F_CSUM_NOT_INET)
+   skb->csum_not_inet = 1;
}
  
  	if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {

@@ -96,6 +99,9 @@ static inline int virtio_net_hdr_from_skb(const struct 
sk_buff *skb,
hdr->flags = VIRTIO_NET_HDR_F_DATA_VALID;
} /* else everything is zero */
  
+	if (skb->csum_not_inet)

+   hdr->flags &= VIRTIO_NET_HDR_F_CSUM_NOT_INET;
+
return 0;
  }
  
diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h

index 5de6ed3..3f279c8 100644
--- a/include/uapi/linux/virtio_net.h
+++ b/include/uapi/linux/virtio_net.h
@@ -36,6 +36,7 @@
  #define VIRTIO_NET_F_GUEST_CSUM   1   /* Guest handles pkts w/ 
partial csum */
  #define VIRTIO_NET_F_CTRL_GUEST_OFFLOADS 2 /* Dynamic offload configuration. 
*/
  #define VIRTIO_NET_F_MTU  3   /* Initial MTU advice */
+#define VIRTIO_NET_F_SCTP_CSUM  4  /* SCTP checksum offload support */
  #define VIRTIO_NET_F_MAC  5   /* Host has given MAC address. */
  #define VIRTIO_NET_F_GUEST_TSO4   7   /* Guest can handle TSOv4 in. */
  #define VIRTIO_NET_F_GUEST_TSO6   8   /* Guest can handle TSOv6 in. */
@@ -101,6 +102,7 @@ struct virtio_net_config {
  struct virtio_net_hdr_v1 {
  #define VIRTIO_NET_HDR_F_NEEDS_CSUM   1   /* Use csum_start, csum_offset 
*/
  #define VIRTIO_NET_HDR_F_DATA_VALID   2   /* Csum is valid */
+#define VIRTIO_NET_HDR_F_CSUM_NOT_INET  4   /* Checksum is not inet */
__u8 flags;
  #define VIRTIO_NET_HDR_GSO_NONE   0   /* Not a GSO frame */
  #define VIRTIO_NET_HDR_GSO_TCPV4  1   /* GSO frame, IPv4 TCP (TSO) */


___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

Re: [RFC v2] virtio: support packed ring

2018-04-09 Thread Jason Wang



On 2018年04月01日 22:12, Tiwei Bie wrote:

Hello everyone,

This RFC implements packed ring support for virtio driver.

The code was tested with DPDK vhost (testpmd/vhost-PMD) implemented
by Jens at http://dpdk.org/ml/archives/dev/2018-January/089417.html
Minor changes are needed for the vhost code, e.g. to kick the guest.

TODO:
- Refinements and bug fixes;
- Split into small patches;
- Test indirect descriptor support;
- Test/fix event suppression support;
- Test devices other than net;

RFC v1 -> RFC v2:
- Add indirect descriptor support - compile test only;
- Add event suppression support - compile test only;
- Move vring_packed_init() out of uapi (Jason, MST);
- Merge two loops into one in virtqueue_add_packed() (Jason);
- Split vring_unmap_one() for packed ring and split ring (Jason);
- Avoid using '%' operator (Jason);
- Rename free_head -> next_avail_idx (Jason);
- Add comments for virtio_wmb() in virtqueue_add_packed() (Jason);
- Some other refinements and bug fixes;

Thanks!


Will try to review this later.

But it would be better if you can split it (more than 1000 lines is too 
big to be reviewed easily). E.g you can at least split it into three 
patches, new structures, datapath, and event suppression.


Thanks




Signed-off-by: Tiwei Bie 
---
  drivers/virtio/virtio_ring.c   | 1094 +---
  include/linux/virtio_ring.h|8 +-
  include/uapi/linux/virtio_config.h |   12 +-
  include/uapi/linux/virtio_ring.h   |   61 ++
  4 files changed, 980 insertions(+), 195 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 71458f493cf8..0515dca34d77 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -58,14 +58,15 @@
  
  struct vring_desc_state {

void *data; /* Data for callback. */
-   struct vring_desc *indir_desc;  /* Indirect descriptor, if any. */
+   void *indir_desc;   /* Indirect descriptor, if any. */
+   int num;/* Descriptor list length. */
  };
  
  struct vring_virtqueue {

struct virtqueue vq;
  
-	/* Actual memory layout for this queue */

-   struct vring vring;
+   /* Is this a packed ring? */
+   bool packed;
  
  	/* Can we use weak barriers? */

bool weak_barriers;
@@ -79,19 +80,45 @@ struct vring_virtqueue {
/* Host publishes avail event idx */
bool event;
  
-	/* Head of free buffer list. */

-   unsigned int free_head;
/* Number we've added since last sync. */
unsigned int num_added;
  
  	/* Last used index we've seen. */

u16 last_used_idx;
  
-	/* Last written value to avail->flags */

-   u16 avail_flags_shadow;
+   union {
+   /* Available for split ring */
+   struct {
+   /* Actual memory layout for this queue. */
+   struct vring vring;
  
-	/* Last written value to avail->idx in guest byte order */

-   u16 avail_idx_shadow;
+   /* Head of free buffer list. */
+   unsigned int free_head;
+
+   /* Last written value to avail->flags */
+   u16 avail_flags_shadow;
+
+   /* Last written value to avail->idx in
+* guest byte order. */
+   u16 avail_idx_shadow;
+   };
+
+   /* Available for packed ring */
+   struct {
+   /* Actual memory layout for this queue. */
+   struct vring_packed vring_packed;
+
+   /* Driver ring wrap counter. */
+   u8 wrap_counter;
+
+   /* Index of the next avail descriptor. */
+   unsigned int next_avail_idx;
+
+   /* Last written value to driver->flags in
+* guest byte order. */
+   u16 event_flags_shadow;
+   };
+   };
  
  	/* How to notify other side. FIXME: commonalize hcalls! */

bool (*notify)(struct virtqueue *vq);
@@ -201,8 +228,33 @@ static dma_addr_t vring_map_single(const struct 
vring_virtqueue *vq,
  cpu_addr, size, direction);
  }
  
-static void vring_unmap_one(const struct vring_virtqueue *vq,

-   struct vring_desc *desc)
+static void vring_unmap_one_split(const struct vring_virtqueue *vq,
+ struct vring_desc *desc)
+{
+   u16 flags;
+
+   if (!vring_use_dma_api(vq->vq.vdev))
+   return;
+
+   flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);
+
+   if (flags & VRING_DESC_F_INDIRECT) {
+   dma_unmap_single(vring_dma_dev(vq),
+virtio64_to_cpu(vq->vq.vdev, desc->addr),
+virtio32_to_cpu(vq->vq.vdev, desc->len),
+   

Re: [RFC] vhost: introduce mdev based hardware vhost backend

2018-04-09 Thread Jason Wang



On 2018年04月02日 23:23, Tiwei Bie wrote:

This patch introduces a mdev (mediated device) based hardware
vhost backend. This backend is an abstraction of the various
hardware vhost accelerators (potentially any device that uses
virtio ring can be used as a vhost accelerator). Some generic
mdev parent ops are provided for accelerator drivers to support
generating mdev instances.

What's this
===

The idea is that we can setup a virtio ring compatible device
with the messages available at the vhost-backend. Originally,
these messages are used to implement a software vhost backend,
but now we will use these messages to setup a virtio ring
compatible hardware device. Then the hardware device will be
able to work with the guest virtio driver in the VM just like
what the software backend does. That is to say, we can implement
a hardware based vhost backend in QEMU, and any virtio ring
compatible devices potentially can be used with this backend.
(We also call it vDPA -- vhost Data Path Acceleration).

One problem is that, different virtio ring compatible devices
may have different device interfaces. That is to say, we will
need different drivers in QEMU. It could be troublesome. And
that's what this patch trying to fix. The idea behind this
patch is very simple: mdev is a standard way to emulate device
in kernel.


So you just move the abstraction layer from qemu to kernel, and you 
still need different drivers in kernel for different device interfaces 
of accelerators. This looks even more complex than leaving it in qemu. 
As you said, another idea is to implement userspace vhost backend for 
accelerators which seems easier and could co-work with other parts of 
qemu without inventing new type of messages.


Need careful thought here to seek a best solution here.


  So we defined a standard device based on mdev, which
is able to accept vhost messages. When the mdev emulation code
(i.e. the generic mdev parent ops provided by this patch) gets
vhost messages, it will parse and deliver them to accelerator
drivers. Drivers can use these messages to setup accelerators.

That is to say, the generic mdev parent ops (e.g. read()/write()/
ioctl()/...) will be provided for accelerator drivers to register
accelerators as mdev parent devices. And each accelerator device
will support generating standard mdev instance(s).

With this standard device interface, we will be able to just
develop one userspace driver to implement the hardware based
vhost backend in QEMU.

Difference between vDPA and PCI passthru


The key difference between vDPA and PCI passthru is that, in
vDPA only the data path of the device (e.g. DMA ring, notify
region and queue interrupt) is pass-throughed to the VM, the
device control path (e.g. PCI configuration space and MMIO
regions) is still defined and emulated by QEMU.

The benefits of keeping virtio device emulation in QEMU compared
with virtio device PCI passthru include (but not limit to):

- consistent device interface for guest OS in the VM;
- max flexibility on the hardware design, especially the
   accelerator for each vhost backend doesn't have to be a
   full PCI device;
- leveraging the existing virtio live-migration framework;

The interface of this mdev based device
===

1. BAR0

The MMIO region described by BAR0 is the main control
interface. Messages will be written to or read from
this region.

The message type is determined by the `request` field
in message header. The message size is encoded in the
message header too. The message format looks like this:

struct vhost_vfio_op {
__u64 request;
__u32 flags;
/* Flag values: */
#define VHOST_VFIO_NEED_REPLY 0x1 /* Whether need reply */
__u32 size;
union {
__u64 u64;
struct vhost_vring_state state;
struct vhost_vring_addr addr;
struct vhost_memory memory;
} payload;
};

The existing vhost-kernel ioctl cmds are reused as
the message requests in above structure.

Each message will be written to or read from this
region at offset 0:

int vhost_vfio_write(struct vhost_dev *dev, struct vhost_vfio_op *op)
{
int count = VHOST_VFIO_OP_HDR_SIZE + op->size;
struct vhost_vfio *vfio = dev->opaque;
int ret;

ret = pwrite64(vfio->device_fd, op, count, vfio->bar0_offset);
if (ret != count)
return -1;

return 0;
}

int vhost_vfio_read(struct vhost_dev *dev, struct vhost_vfio_op *op)
{
int count = VHOST_VFIO_OP_HDR_SIZE + op->size;
struct vhost_vfio *vfio = dev->opaque;
uint64_t request = op->request;
int ret;

ret = pread64(vfio->device_fd, op, count, vfio->bar0_offset);
if (ret != count || request != op->request)
return -1;

return 0;
}

It's quite straightforward to set things to the device.
Just need to write the message to 

[PATCH v32 1/4] mm: support reporting free page blocks

2018-04-09 Thread Wei Wang
This patch adds support to walk through the free page blocks in the
system and report them via a callback function. Some page blocks may
leave the free list after zone->lock is released, so it is the caller's
responsibility to either detect or prevent the use of such pages.

One use example of this patch is to accelerate live migration by skipping
the transfer of free pages reported from the guest. A popular method used
by the hypervisor to track which part of memory is written during live
migration is to write-protect all the guest memory. So, those pages that
are reported as free pages but are written after the report function
returns will be captured by the hypervisor, and they will be added to the
next round of memory transfer.

Signed-off-by: Wei Wang 
Signed-off-by: Liang Li 
Cc: Michal Hocko 
Cc: Andrew Morton 
Cc: Michael S. Tsirkin 
Acked-by: Michal Hocko 
---
 include/linux/mm.h |  6 
 mm/page_alloc.c| 97 ++
 2 files changed, 103 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f945dff..4698df1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1951,6 +1951,12 @@ extern void free_area_init_node(int nid, unsigned long * 
zones_size,
unsigned long zone_start_pfn, unsigned long *zholes_size);
 extern void free_initmem(void);
 
+extern int walk_free_mem_block(void *opaque,
+  int min_order,
+  int (*report_pfn_range)(void *opaque,
+  unsigned long pfn,
+  unsigned long num));
+
 /*
  * Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK)
  * into the buddy system. The freed pages will be poisoned with pattern
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4ea0182..83e50fd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4912,6 +4912,103 @@ void show_free_areas(unsigned int filter, nodemask_t 
*nodemask)
show_swap_cache_info();
 }
 
+/*
+ * Walk through a free page list and report the found pfn range via the
+ * callback.
+ *
+ * Return 0 if it completes the reporting. Otherwise, return the non-zero
+ * value returned from the callback.
+ */
+static int walk_free_page_list(void *opaque,
+  struct zone *zone,
+  int order,
+  enum migratetype mt,
+  int (*report_pfn_range)(void *,
+  unsigned long,
+  unsigned long))
+{
+   struct page *page;
+   struct list_head *list;
+   unsigned long pfn, flags;
+   int ret = 0;
+
+   spin_lock_irqsave(&zone->lock, flags);
+   list = &zone->free_area[order].free_list[mt];
+   list_for_each_entry(page, list, lru) {
+   pfn = page_to_pfn(page);
+   ret = report_pfn_range(opaque, pfn, 1 << order);
+   if (ret)
+   break;
+   }
+   spin_unlock_irqrestore(&zone->lock, flags);
+
+   return ret;
+}
+
+/**
+ * walk_free_mem_block - Walk through the free page blocks in the system
+ * @opaque: the context passed from the caller
+ * @min_order: the minimum order of free lists to check
+ * @report_pfn_range: the callback to report the pfn range of the free pages
+ *
+ * If the callback returns a non-zero value, stop iterating the list of free
+ * page blocks. Otherwise, continue to report.
+ *
+ * Please note that there are no locking guarantees for the callback and
+ * that the reported pfn range might be freed or disappear after the
+ * callback returns so the caller has to be very careful how it is used.
+ *
+ * The callback itself must not sleep or perform any operations which would
+ * require any memory allocations directly (not even GFP_NOWAIT/GFP_ATOMIC)
+ * or via any lock dependency. It is generally advisable to implement
+ * the callback as simple as possible and defer any heavy lifting to a
+ * different context.
+ *
+ * There is no guarantee that each free range will be reported only once
+ * during one walk_free_mem_block invocation.
+ *
+ * pfn_to_page on the given range is strongly discouraged and if there is
+ * an absolute need for that make sure to contact MM people to discuss
+ * potential problems.
+ *
+ * The function itself might sleep so it cannot be called from atomic
+ * contexts.
+ *
+ * In general low orders tend to be very volatile and so it makes more
+ * sense to query larger ones first for various optimizations which like
+ * ballooning etc... This will reduce the overhead as well.
+ *
+ * Return 0 if it completes the reporting. Otherwise, return the non-zero
+ * value returned from the callback.
+ */
+int walk_free_mem_block(void 

[PATCH v32 0/4] Virtio-balloon: support free page reporting

2018-04-09 Thread Wei Wang
This patch series is separated from the previous "Virtio-balloon
Enhancement" series. The new feature, VIRTIO_BALLOON_F_FREE_PAGE_HINT,  
implemented by this series enables the virtio-balloon driver to report
hints of guest free pages to the host. It can be used to accelerate live
migration of VMs. Here is an introduction of this usage:

Live migration needs to transfer the VM's memory from the source machine
to the destination round by round. For the 1st round, all the VM's memory
is transferred. From the 2nd round, only the pieces of memory that were
written by the guest (after the 1st round) are transferred. One method
that is popularly used by the hypervisor to track which part of memory is
written is to write-protect all the guest memory.

This feature enables the optimization by skipping the transfer of guest
free pages during VM live migration. It is not concerned that the memory
pages are used after they are given to the hypervisor as a hint of the
free pages, because they will be tracked by the hypervisor and transferred
in the subsequent round if they are used and written.

* Tests
- Test Environment
Host: Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz
Guest: 8G RAM, 4 vCPU
Migration setup: migrate_set_speed 100G, migrate_set_downtime 2 second

- Test Results
- Idle Guest Live Migration Time (results are averaged over 10 runs):
- Optimization v.s. Legacy = 271ms vs 1769ms --> ~86% reduction
- Guest with Linux Compilation Workload (make bzImage -j4):
- Live Migration Time (average)
  Optimization v.s. Legacy = 1265ms v.s. 2634ms --> ~51% reduction
- Linux Compilation Time
  Optimization v.s. Legacy = 4min56s v.s. 5min3s
  --> no obvious difference

ChangeLog:
v31->v32:
- virtio-balloon:
- rename cmd_id_use to cmd_id_active;
- report_free_page_func: detach used buffers after host sends a vq
  interrupt, instead of busy waiting for used buffers.
v30->v31:
- virtio-balloon:
- virtio_balloon_send_free_pages: return -EINTR rather than 1 to
  indicate an active stop requested by host; and add more
  comments to explain about access to cmd_id_received without
  locks;
-  add_one_sg: add TODO to comments about possible improvement.
v29->v30:
- mm/walk_free_mem_block: add cond_resched() for each order
v28->v29:
- mm/page_poison: only expose page_poison_enabled(), rather than more
  changes did in v28, as we are not 100% confident about that for now.
- virtio-balloon: use a separate buffer for the stop cmd, instead of
  having the start and stop cmd use the same buffer. This avoids the
  corner case that the start cmd is overridden by the stop cmd when
  the host has a delay in reading the start cmd.
v27->v28:
- mm/page_poison: Move PAGE_POISON to page_poison.c and add a function
  to expose page poison val to kernel modules.
v26->v27:
- add a new patch to expose page_poisoning_enabled to kernel modules
- virtio-balloon: set poison_val to 0xaaaaaaaa, instead of 0xaa
v25->v26: virtio-balloon changes only
- remove kicking free page vq since the host now polls the vq after
  initiating the reporting
- report_free_page_func: detach all the used buffers after sending
  the stop cmd id. This avoids leaving the detaching burden (i.e.
  overhead) to the next cmd id. Detaching here isn't considered
  overhead since the stop cmd id has been sent, and host has already
  moved forward.
v24->v25:
- mm: change walk_free_mem_block to return 0 (instead of true) on
  completing the report, and return a non-zero value from the
  callback, which stops the reporting.
- virtio-balloon:
- use enum instead of define for VIRTIO_BALLOON_VQ_INFLATE etc.
- avoid __virtio_clear_bit when bailing out;
- a new method to avoid reporting the some cmd id to host twice
- destroy_workqueue can cancel free page work when the feature is
  negotiated;
- fail probe when the free page vq size is less than 2.
v23->v24:
- change feature name VIRTIO_BALLOON_F_FREE_PAGE_VQ to
  VIRTIO_BALLOON_F_FREE_PAGE_HINT
- kick when vq->num_free < half full, instead of "= half full"
- replace BUG_ON with bailing out
- check vb->balloon_wq in probe(), if null, bail out
- add a new feature bit for page poisoning
- solve the corner case that one cmd id being sent to host twice
v22->v23:
- change to kick the device when the vq is half-way full;
- open-code batch_free_page_sg into add_one_sg;
- change cmd_id from "uint32_t" to "__virtio32";
- reserve one entry in the vq for the driver to send cmd_id, instead
  of busywaiting for an available entry;
- add "stop_update" check before queue_work for prudence purpose for
  now, will have a separate patch to discuss this flag check later;
- init_vqs: change to put some variables on stack to 

[PATCH v32 2/4] virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_HINT

2018-04-09 Thread Wei Wang
Negotiation of the VIRTIO_BALLOON_F_FREE_PAGE_HINT feature indicates the
support of reporting hints of guest free pages to host via virtio-balloon.

Host requests the guest to report free page hints by sending a new cmd
id to the guest via the free_page_report_cmd_id configuration register.

When the guest starts to report, the first element added to the free page
vq is the cmd id given by host. When the guest finishes the reporting
of all the free pages, VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID is added
to the vq to tell host that the reporting is done. Host polls the free
page vq after sending the starting cmd id, so the guest doesn't need to
kick after filling an element to the vq.

Host may also request the guest to stop the reporting in advance by
sending the stop cmd id to the guest via the configuration register.

Signed-off-by: Wei Wang 
Signed-off-by: Liang Li 
Cc: Michael S. Tsirkin 
Cc: Michal Hocko 
---
 drivers/virtio/virtio_balloon.c | 288 +++-
 include/uapi/linux/virtio_balloon.h |   4 +
 2 files changed, 256 insertions(+), 36 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index dfe5684..197f865 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -51,9 +51,22 @@ MODULE_PARM_DESC(oom_pages, "pages to free on OOM");
 static struct vfsmount *balloon_mnt;
 #endif
 
+enum virtio_balloon_vq {
+   VIRTIO_BALLOON_VQ_INFLATE,
+   VIRTIO_BALLOON_VQ_DEFLATE,
+   VIRTIO_BALLOON_VQ_STATS,
+   VIRTIO_BALLOON_VQ_FREE_PAGE,
+   VIRTIO_BALLOON_VQ_MAX
+};
+
 struct virtio_balloon {
struct virtio_device *vdev;
-   struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
+   struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
+
+   /* Balloon's own wq for cpu-intensive work items */
+   struct workqueue_struct *balloon_wq;
+   /* The free page reporting work item submitted to the balloon wq */
+   struct work_struct report_free_page_work;
 
/* The balloon servicing is delegated to a freezable workqueue. */
struct work_struct update_balloon_stats_work;
@@ -63,6 +76,13 @@ struct virtio_balloon {
spinlock_t stop_update_lock;
bool stop_update;
 
+   /* The new cmd id received from host */
+   uint32_t cmd_id_received;
+   /* The cmd id that is actively in use */
+   __virtio32 cmd_id_active;
+   /* Buffer to store the stop sign */
+   __virtio32 stop_cmd_id;
+
/* Waiting for host to ack the pages we released. */
wait_queue_head_t acked;
 
@@ -320,17 +340,6 @@ static void stats_handle_request(struct virtio_balloon *vb)
virtqueue_kick(vq);
 }
 
-static void virtballoon_changed(struct virtio_device *vdev)
-{
-   struct virtio_balloon *vb = vdev->priv;
-   unsigned long flags;
-
-   spin_lock_irqsave(&vb->stop_update_lock, flags);
-   if (!vb->stop_update)
-   queue_work(system_freezable_wq, &vb->update_balloon_size_work);
-   spin_unlock_irqrestore(&vb->stop_update_lock, flags);
-}
-
 static inline s64 towards_target(struct virtio_balloon *vb)
 {
s64 target;
@@ -347,6 +356,34 @@ static inline s64 towards_target(struct virtio_balloon *vb)
return target - vb->num_pages;
 }
 
+static void virtballoon_changed(struct virtio_device *vdev)
+{
+   struct virtio_balloon *vb = vdev->priv;
+   unsigned long flags;
+   s64 diff = towards_target(vb);
+
+   if (diff) {
+   spin_lock_irqsave(&vb->stop_update_lock, flags);
+   if (!vb->stop_update)
+   queue_work(system_freezable_wq,
+  &vb->update_balloon_size_work);
+   spin_unlock_irqrestore(&vb->stop_update_lock, flags);
+   }
+
+   if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
+   virtio_cread(vdev, struct virtio_balloon_config,
+free_page_report_cmd_id, &vb->cmd_id_received);
+   if (vb->cmd_id_received !=
+   VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID) {
+   spin_lock_irqsave(&vb->stop_update_lock, flags);
+   if (!vb->stop_update)
+   queue_work(vb->balloon_wq,
+  &vb->report_free_page_work);
+   spin_unlock_irqrestore(&vb->stop_update_lock, flags);
+   }
+   }
+}
+
 static void update_balloon_size(struct virtio_balloon *vb)
 {
u32 actual = vb->num_pages;
@@ -419,44 +456,196 @@ static void update_balloon_size_func(struct work_struct 
*work)
queue_work(system_freezable_wq, work);
 }
 
+static void free_page_vq_cb(struct virtqueue *vq)
+{
+   unsigned int unused;
+
+   while (virtqueue_get_buf(vq, &unused))
+   ;
+}
+
 static int init_vqs(struct virtio_balloon 

[PATCH v32 4/4] virtio-balloon: VIRTIO_BALLOON_F_PAGE_POISON

2018-04-09 Thread Wei Wang
The VIRTIO_BALLOON_F_PAGE_POISON feature bit is used to indicate if the
guest is using page poisoning. Guest writes to the poison_val config
field to tell host about the page poisoning value in use.

Signed-off-by: Wei Wang 
Suggested-by: Michael S. Tsirkin 
Cc: Michael S. Tsirkin 
Cc: Michal Hocko 
Cc: Andrew Morton 
---
 drivers/virtio/virtio_balloon.c | 10 ++
 include/uapi/linux/virtio_balloon.h |  3 +++
 2 files changed, 13 insertions(+)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 197f865..aea91ab 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -730,6 +730,7 @@ static struct file_system_type balloon_fs = {
 static int virtballoon_probe(struct virtio_device *vdev)
 {
struct virtio_balloon *vb;
+   __u32 poison_val;
int err;
 
if (!vdev->config->get) {
@@ -775,6 +776,11 @@ static int virtballoon_probe(struct virtio_device *vdev)
vb->stop_cmd_id = cpu_to_virtio32(vb->vdev,
VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID);
INIT_WORK(>report_free_page_work, report_free_page_func);
+   if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_PAGE_POISON)) {
+   memset(&poison_val, PAGE_POISON, sizeof(poison_val));
+   virtio_cwrite(vb->vdev, struct virtio_balloon_config,
+ poison_val, &poison_val);
+   }
}
 
vb->nb.notifier_call = virtballoon_oom_notify;
@@ -893,6 +899,9 @@ static int virtballoon_restore(struct virtio_device *vdev)
 
 static int virtballoon_validate(struct virtio_device *vdev)
 {
+   if (!page_poisoning_enabled())
+   __virtio_clear_bit(vdev, VIRTIO_BALLOON_F_PAGE_POISON);
+
__virtio_clear_bit(vdev, VIRTIO_F_IOMMU_PLATFORM);
return 0;
 }
@@ -902,6 +911,7 @@ static unsigned int features[] = {
VIRTIO_BALLOON_F_STATS_VQ,
VIRTIO_BALLOON_F_DEFLATE_ON_OOM,
VIRTIO_BALLOON_F_FREE_PAGE_HINT,
+   VIRTIO_BALLOON_F_PAGE_POISON,
 };
 
 static struct virtio_driver virtio_balloon_driver = {
diff --git a/include/uapi/linux/virtio_balloon.h 
b/include/uapi/linux/virtio_balloon.h
index b2d86c2..8b93581 100644
--- a/include/uapi/linux/virtio_balloon.h
+++ b/include/uapi/linux/virtio_balloon.h
@@ -35,6 +35,7 @@
 #define VIRTIO_BALLOON_F_STATS_VQ  1 /* Memory Stats virtqueue */
 #define VIRTIO_BALLOON_F_DEFLATE_ON_OOM2 /* Deflate balloon on OOM */
 #define VIRTIO_BALLOON_F_FREE_PAGE_HINT3 /* VQ to report free pages */
+#define VIRTIO_BALLOON_F_PAGE_POISON   4 /* Guest is using page poisoning */
 
 /* Size of a PFN in the balloon interface. */
 #define VIRTIO_BALLOON_PFN_SHIFT 12
@@ -47,6 +48,8 @@ struct virtio_balloon_config {
__u32 actual;
/* Free page report command id, readonly by guest */
__u32 free_page_report_cmd_id;
+   /* Stores PAGE_POISON if page poisoning is in use */
+   __u32 poison_val;
 };
 
 #define VIRTIO_BALLOON_S_SWAP_IN  0   /* Amount of memory swapped in */
-- 
2.7.4

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[PATCH v32 3/4] mm/page_poison: expose page_poisoning_enabled to kernel modules

2018-04-09 Thread Wei Wang
In some usages, e.g. virtio-balloon, a kernel module needs to know if
page poisoning is in use. This patch exposes the page_poisoning_enabled
function to kernel modules.

Signed-off-by: Wei Wang 
Cc: Andrew Morton 
Cc: Michal Hocko 
Cc: Michael S. Tsirkin 
Acked-by: Andrew Morton 
---
 mm/page_poison.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/mm/page_poison.c b/mm/page_poison.c
index e83fd44..762b472 100644
--- a/mm/page_poison.c
+++ b/mm/page_poison.c
@@ -17,6 +17,11 @@ static int early_page_poison_param(char *buf)
 }
 early_param("page_poison", early_page_poison_param);
 
+/**
+ * page_poisoning_enabled - check if page poisoning is enabled
+ *
+ * Return true if page poisoning is enabled, or false if not.
+ */
 bool page_poisoning_enabled(void)
 {
/*
@@ -29,6 +34,7 @@ bool page_poisoning_enabled(void)
(!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
debug_pagealloc_enabled()));
 }
+EXPORT_SYMBOL_GPL(page_poisoning_enabled);
 
 static void poison_page(struct page *page)
 {
-- 
2.7.4

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH RESEND net] vhost: fix vhost_vq_access_ok() log check

2018-04-09 Thread Stefan Hajnoczi
On Tue, Apr 10, 2018 at 3:40 AM, Michael S. Tsirkin  wrote:
> From: Stefan Hajnoczi 
>
> Commit d65026c6c62e7d9616c8ceb5a53b68bcdc050525 ("vhost: validate log
> when IOTLB is enabled") introduced a regression.  The logic was
> originally:
>
>   if (vq->iotlb)
>   return 1;
>   return A && B;
>
> After the patch the short-circuit logic for A was inverted:
>
>   if (A || vq->iotlb)
>   return A;
>   return B;
>
> The correct logic is:
>
>   if (!A || vq->iotlb)
>   return A;
>   return B;
>
> Reported-by: syzbot+65a84dde0214b0387...@syzkaller.appspotmail.com
> Cc: Jason Wang 
> Signed-off-by: Stefan Hajnoczi 
> Acked-by: Michael S. Tsirkin 
>
> ---
>  drivers/vhost/vhost.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)

NACK

I will send a v2 with cleaner logic as suggested by Linus.

Stefan
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [RFC PATCH 2/3] netdev: kernel-only IFF_HIDDEN netdevice

2018-04-09 Thread Siwei Liu
On Mon, Apr 9, 2018 at 4:03 PM, Stephen Hemminger
 wrote:
> On Mon, 9 Apr 2018 15:30:42 -0700
> Siwei Liu  wrote:
>
>> On Mon, Apr 9, 2018 at 3:15 PM, Andrew Lunn  wrote:
>> >> No, implementation wise I'd avoid changing the class on the fly. What
>> >> I'm looking to is a means to add a secondary class or class aliasing
>> >> mechanism for netdevs that allows mapping for a kernel device
>> >> namespace (/class/net-kernel) to userspace (/class/net). Imagine
>> >> creating symlinks between these two namespaces as an analogy. All
>> >> userspace visible netdevs today will have both a kernel name and a
>> userspace visible name, having one (/class/net) referencing the other
>> >> (/class/net-kernel) in its own namespace. The newly introduced
>> >> IFF_AUTO_MANAGED device will have a kernel name only
>> >> (/class/net-kernel). As a result, the existing applications using
>> >> /class/net don't break, while we're adding the kernel namespace that
>> >> allows IFF_AUTO_MANAGED devices which will not be exposed to userspace
>> >> at all.
>> >
>> > My gut feeling is this whole scheme will not fly. You really should be
>> > talking to GregKH.
>>
>> Will do. Before spreading it out loudly I'd run it within netdev to
>> clarify the need for why not exposing the lower netdevs is critical
>> for cloud service providers in the face of introducing a new feature,
>> and we are not hiding anything but exposing it in a way that don't
>> break existing userspace applications while introducing feature is
>> possible with the limitation of keeping old userspace still.
>>
>> >
>> > Anyway, please remember that IFF_AUTO_MANAGED will need to be dynamic.
>> > A device can start out as a normal device, and will change to being
>> > automatic later, when the user on top of it probes.
>>
>> Sure. In whatever form it's still a netdev, and changing the namespace
>> should be more dynamic than changing the class.
>>
>> Thanks a lot,
>> -Siwei
>>
>> >
>> > Andrew
>
> Also, remember for netdev's /sys is really a third class API.
> The primary API's are netlink and ioctl. Also why not use existing
> network namespaces rather than inventing a new abstraction?

Because we want to leave old userspace unmodified while making SR-IOV
live migration transparent to users. Specifically, we'd want old udevd
to skip through uevents for the lower netdevs, while also making new
udevd able to name the bypass_master interface by referencing the pci
slot information which is only present in the sysfs entry for
IFF_AUTO_MANAGED net device.

The problem of using network namespace is that, no sysfs entry will be
around for IFF_AUTO_MANAGED netdev if we isolate it out to a separate
netns, unless we generalize the scope for what netns is designed for
(isolation I mean). For auto-managed netdevs we don't necessarily
want strict isolation, but rather a way of sticking to 1-netdev model
for strict backward compatibility requirement of the old userspace,
while exposing the information in a way new userspace understands.

Thanks,
-Siwei
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [RFC PATCH 2/3] netdev: kernel-only IFF_HIDDEN netdevice

2018-04-09 Thread Stephen Hemminger
On Mon, 9 Apr 2018 15:30:42 -0700
Siwei Liu  wrote:

> On Mon, Apr 9, 2018 at 3:15 PM, Andrew Lunn  wrote:
> >> No, implementation wise I'd avoid changing the class on the fly. What
> >> I'm looking to is a means to add a secondary class or class aliasing
> >> mechanism for netdevs that allows mapping for a kernel device
> >> namespace (/class/net-kernel) to userspace (/class/net). Imagine
> >> creating symlinks between these two namespaces as an analogy. All
> >> userspace visible netdevs today will have both a kernel name and a
> >> userspace visible name, having one (/class/net) referecing the other
> >> (/class/net-kernel) in its own namespace. The newly introduced
> >> IFF_AUTO_MANAGED device will have a kernel name only
> >> (/class/net-kernel). As a result, the existing applications using
> >> /class/net don't break, while we're adding the kernel namespace that
> >> allows IFF_AUTO_MANAGED devices which will not be exposed to userspace
> >> at all.  
> >
> > My gut feeling is this whole scheme will not fly. You really should be
> > talking to GregKH.  
> 
> Will do. Before spreading it out loudly I'd run it within netdev to
> clarify the need for why not exposing the lower netdevs is critical
> for cloud service providers in the face of introducing a new feature,
> and we are not hiding anything but exposing it in a way that don't
> break existing userspace applications while introducing feature is
> possible with the limitation of keeping old userspace still.
> 
> >
> > Anyway, please remember that IFF_AUTO_MANAGED will need to be dynamic.
> > A device can start out as a normal device, and will change to being
> > automatic later, when the user on top of it probes.  
> 
> Sure. In whatever form it's still a netdev, and changing the namespace
> should be more dynamic than changing the class.
> 
> Thanks a lot,
> -Siwei
> 
> >
> > Andrew  

Also, remember for netdev's /sys is really a third class API.
The primary API's are netlink and ioctl. Also why not use existing
network namespaces rather than inventing a new abstraction?
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [RFC PATCH 2/3] netdev: kernel-only IFF_HIDDEN netdevice

2018-04-09 Thread Siwei Liu
On Mon, Apr 9, 2018 at 3:15 PM, Andrew Lunn  wrote:
>> No, implementation wise I'd avoid changing the class on the fly. What
>> I'm looking to is a means to add a secondary class or class aliasing
>> mechanism for netdevs that allows mapping for a kernel device
>> namespace (/class/net-kernel) to userspace (/class/net). Imagine
>> creating symlinks between these two namespaces as an analogy. All
>> userspace visible netdevs today will have both a kernel name and a
>> userspace visible name, having one (/class/net) referecing the other
>> (/class/net-kernel) in its own namespace. The newly introduced
>> IFF_AUTO_MANAGED device will have a kernel name only
>> (/class/net-kernel). As a result, the existing applications using
>> /class/net don't break, while we're adding the kernel namespace that
>> allows IFF_AUTO_MANAGED devices which will not be exposed to userspace
>> at all.
>
> My gut feeling is this whole scheme will not fly. You really should be
> talking to GregKH.

Will do. Before spreading it out loudly I'd run it within netdev to
clarify the need for why not exposing the lower netdevs is critical
for cloud service providers in the face of introducing a new feature,
and we are not hiding anything but exposing it in a way that doesn't
break existing userspace applications while introducing feature is
possible with the limitation of keeping old userspace still.

>
> Anyway, please remember that IFF_AUTO_MANAGED will need to be dynamic.
> A device can start out as a normal device, and will change to being
> automatic later, when the user on top of it probes.

Sure. In whatever form it's still a netdev, and changing the namespace
should be more dynamic than changing the class.

Thanks a lot,
-Siwei

>
> Andrew
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [RFC PATCH 2/3] netdev: kernel-only IFF_HIDDEN netdevice

2018-04-09 Thread Andrew Lunn
> No, implementation wise I'd avoid changing the class on the fly. What
> I'm looking to is a means to add a secondary class or class aliasing
> mechanism for netdevs that allows mapping for a kernel device
> namespace (/class/net-kernel) to userspace (/class/net). Imagine
> creating symlinks between these two namespaces as an analogy. All
> userspace visible netdevs today will have both a kernel name and a
> userspace visible name, having one (/class/net) referecing the other
> (/class/net-kernel) in its own namespace. The newly introduced
> IFF_AUTO_MANAGED device will have a kernel name only
> (/class/net-kernel). As a result, the existing applications using
> /class/net don't break, while we're adding the kernel namespace that
> allows IFF_AUTO_MANAGED devices which will not be exposed to userspace
> at all.

My gut feeling is this whole scheme will not fly. You really should be
talking to GregKH.

Anyway, please remember that IFF_AUTO_MANAGED will need to be dynamic.
A device can start out as a normal device, and will change to being
automatic later, when the user on top of it probes.

Andrew
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [RFC PATCH 2/3] netdev: kernel-only IFF_HIDDEN netdevice

2018-04-09 Thread Siwei Liu
On Fri, Apr 6, 2018 at 8:19 PM, Andrew Lunn  wrote:
> Hi Siwei
>
>> I think everyone seems to agree not to fiddle with the ":" prefix, but
>> rather have a new class of network subsystem under /sys/class thus a
>> separate device namespace e.g. /sys/class/net-kernel for those
>> auto-managed lower netdevs is needed.
>
> How do you get a device into this new class? I don't know the Linux
> driver model too well, but to get a device out of one class and into
> another, i think you need to device_del(dev). modify dev->class and
> then device_add(dev). However, device_add() says you are not allowed
> to do this.

No, implementation wise I'd avoid changing the class on the fly. What
I'm looking to is a means to add a secondary class or class aliasing
mechanism for netdevs that allows mapping for a kernel device
namespace (/class/net-kernel) to userspace (/class/net). Imagine
creating symlinks between these two namespaces as an analogy. All
userspace visible netdevs today will have both a kernel name and a
userspace visible name, having one (/class/net) referencing the other
(/class/net-kernel) in its own namespace. The newly introduced
IFF_AUTO_MANAGED device will have a kernel name only
(/class/net-kernel). As a result, the existing applications using
/class/net don't break, while we're adding the kernel namespace that
allows IFF_AUTO_MANAGED devices which will not be exposed to userspace
at all.

As this requires changing the internals of driver model core it's a
rather big hammer approach I'd think. If there exists a better
implementation than this to allow adding a separate layer of in-kernel
device namespace, I'd more than welcome to hear about.

>
> And i don't even see how this helps. Are you also not going to call
> list_netdevice()? Are you going to add some other list for these
> devices in a different class?

list_netdevice() is still called. I think with the current RFC patch,
I've added two lists for netdevs under the kernel namespace:
dev_cmpl_list and name_cmpl_hlist. As a result of that, all userspace
netdevs get registered will be added to two types of lists: the
userspace list for e.g. dev_list, and also the kernelspace list e.g.
dev_cmpl_list (I can rename it to something more accurate). The
IFF_AUTO_MANAGED device will be only added to kernelspace list e.g.
dev_cmpl_list.

Hope all your questions are answered.

Thanks,
-Siwei


>
>Andrew
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [virtio-dev] Re: [PATCH v2] virtio_balloon: export hugetlb page allocation counts

2018-04-09 Thread Jonathan Helman



On 03/22/2018 07:38 PM, Jason Wang wrote:



On 2018年03月22日 11:10, Michael S. Tsirkin wrote:

On Thu, Mar 22, 2018 at 09:52:18AM +0800, Jason Wang wrote:

On 2018年03月20日 12:26, Jonathan Helman wrote:

On Mar 19, 2018, at 7:31 PM, Jason Wang  wrote:



On 2018年03月20日 06:14, Jonathan Helman wrote:

Export the number of successful and failed hugetlb page
allocations via the virtio balloon driver. These 2 counts
come directly from the vm_events HTLB_BUDDY_PGALLOC and
HTLB_BUDDY_PGALLOC_FAIL.

Signed-off-by: Jonathan Helman

Reviewed-by: Jason Wang

Thanks.


---
   drivers/virtio/virtio_balloon.c | 6 ++
   include/uapi/linux/virtio_balloon.h | 4 +++-
   2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/virtio/virtio_balloon.c 
b/drivers/virtio/virtio_balloon.c

index dfe5684..6b237e3 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -272,6 +272,12 @@ static unsigned int 
update_balloon_stats(struct virtio_balloon *vb)

   pages_to_bytes(events[PSWPOUT]));
   update_stat(vb, idx++, VIRTIO_BALLOON_S_MAJFLT, 
events[PGMAJFAULT]);
   update_stat(vb, idx++, VIRTIO_BALLOON_S_MINFLT, 
events[PGFAULT]);

+#ifdef CONFIG_HUGETLB_PAGE
+    update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGALLOC,
+    events[HTLB_BUDDY_PGALLOC]);
+    update_stat(vb, idx++, VIRTIO_BALLOON_S_HTLB_PGFAIL,
+    events[HTLB_BUDDY_PGALLOC_FAIL]);
+#endif
   #endif
   update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMFREE,
   pages_to_bytes(i.freeram));
diff --git a/include/uapi/linux/virtio_balloon.h 
b/include/uapi/linux/virtio_balloon.h

index 4e8b830..40297a3 100644
--- a/include/uapi/linux/virtio_balloon.h
+++ b/include/uapi/linux/virtio_balloon.h
@@ -53,7 +53,9 @@ struct virtio_balloon_config {
   #define VIRTIO_BALLOON_S_MEMTOT   5   /* Total amount of memory */
   #define VIRTIO_BALLOON_S_AVAIL    6   /* Available memory as in 
/proc */

   #define VIRTIO_BALLOON_S_CACHES   7   /* Disk caches */
-#define VIRTIO_BALLOON_S_NR   8
+#define VIRTIO_BALLOON_S_HTLB_PGALLOC  8  /* Hugetlb page 
allocations */
+#define VIRTIO_BALLOON_S_HTLB_PGFAIL   9  /* Hugetlb page 
allocation failures */

+#define VIRTIO_BALLOON_S_NR   10
 /*
    * Memory statistics structure.
Not for this patch, but it looks to me that exporting such nr 
through uapi is fragile.

Sorry, can you explain what you mean here?

Jon
Spec said "Within an output buffer submitted to the statsq, the 
device MUST

ignore entries with tag values that it does not recognize". So exporting
VIRTIO_BALLOON_S_NR seems useless and device implementation can not 
depend

on such number in uapi.

Thanks

Suggestions? I don't like to break build for people ...



Didn't have a good idea. But maybe we should keep VIRTIO_BALLOON_S_NR 
unchanged, and add a comment here.


Thanks


I think Jason's comment is for a future patch. Didn't see this patch get 
applied, so wondering if it could be.


Thanks,
Jon
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

[PATCH RESEND net] vhost: fix vhost_vq_access_ok() log check

2018-04-09 Thread Michael S. Tsirkin
From: Stefan Hajnoczi 

Commit d65026c6c62e7d9616c8ceb5a53b68bcdc050525 ("vhost: validate log
when IOTLB is enabled") introduced a regression.  The logic was
originally:

  if (vq->iotlb)
  return 1;
  return A && B;

After the patch the short-circuit logic for A was inverted:

  if (A || vq->iotlb)
  return A;
  return B;

The correct logic is:

  if (!A || vq->iotlb)
  return A;
  return B;

Reported-by: syzbot+65a84dde0214b0387...@syzkaller.appspotmail.com
Cc: Jason Wang 
Signed-off-by: Stefan Hajnoczi 
Acked-by: Michael S. Tsirkin 

---
 drivers/vhost/vhost.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 5320039671b7..f6af4210679a 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1246,7 +1246,7 @@ int vhost_vq_access_ok(struct vhost_virtqueue *vq)
 {
int ret = vq_log_access_ok(vq, vq->log_base);
 
-   if (ret || vq->iotlb)
+   if (!ret || vq->iotlb)
return ret;
 
return vq_access_ok(vq, vq->num, vq->desc, vq->avail, vq->used);
-- 
2.14.3
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH] vhost: fix vhost_vq_access_ok() log check

2018-04-09 Thread Linus Torvalds
On Mon, Apr 9, 2018 at 6:10 AM, Stefan Hajnoczi  wrote:
> @@ -1246,7 +1246,7 @@ int vhost_vq_access_ok(struct vhost_virtqueue *vq)
>  {
> int ret = vq_log_access_ok(vq, vq->log_base);
>
> -   if (ret || vq->iotlb)
> +   if (!ret || vq->iotlb)
> return ret;

That logic is still very non-obvious.

This code already had one bug because of an odd illegible test
sequence. Let's not keep the crazy code.

Why not just do the *obvious* thing, and get rid of "ret" entirely,
and make the damn thing return a boolean, and then just write it all
as

/* Caller should have vq mutex and device mutex */
bool vhost_vq_access_ok(struct vhost_virtqueue *vq)
{
if (!vq_log_access_ok(vq, vq->log_base))
return false;

return vq->iotlb || vq_access_ok(vq, vq->num, vq->desc,
vq->avail, vq->used);
}

which makes the logic obvious: if vq_log_access_ok() fails, then
vhost_vq_access_ok() fails unconditionally.

Otherwise, we need to have an iotlb, or a successful vq_access_ok() check.

Doesn't that all make more sense, and avoid the insane "ret" value use
that is really quite subtle?

Linus
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH RESEND v2] vhost-net: set packet weight of tx polling to 2 * vq size

2018-04-09 Thread David Miller
From: haibinzhang(张海斌) 
Date: Mon, 9 Apr 2018 07:22:17 +

> handle_tx will delay rx for tens or even hundreds of milliseconds when tx busy
> polling udp packets with small length(e.g. 1byte udp payload), because setting
> VHOST_NET_WEIGHT takes into account only sent-bytes but no single packet 
> length.
> 
> Ping-Latencies shown below were tested between two Virtual Machines using
> netperf (UDP_STREAM, len=1), and then another machine pinged the client:
 ...
> Acked-by: Michael S. Tsirkin 
> Signed-off-by: Haibin Zhang 
> Signed-off-by: Yunfang Tai 
> Signed-off-by: Lidong Chen 

Applied, thank you.
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

Re: [PATCH] vhost: fix vhost_vq_access_ok() log check

2018-04-09 Thread Michael S. Tsirkin
On Mon, Apr 09, 2018 at 09:10:21PM +0800, Stefan Hajnoczi wrote:
> Commit d65026c6c62e7d9616c8ceb5a53b68bcdc050525 ("vhost: validate log
> when IOTLB is enabled") introduced a regression.  The logic was
> originally:
> 
>   if (vq->iotlb)
>   return 1;
>   return A && B;
> 
> After the patch the short-circuit logic for A was inverted:
> 
>   if (A || vq->iotlb)
>   return A;
>   return B;
> 
> The correct logic is:
> 
>   if (!A || vq->iotlb)
>   return A;
>   return B;
> 
> Reported-by: syzbot+65a84dde0214b0387...@syzkaller.appspotmail.com
> Cc: Jason Wang 
> Signed-off-by: Stefan Hajnoczi 

Acked-by: Michael S. Tsirkin 

> ---
>  drivers/vhost/vhost.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 5320039671b7..f6af4210679a 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -1246,7 +1246,7 @@ int vhost_vq_access_ok(struct vhost_virtqueue *vq)
>  {
>   int ret = vq_log_access_ok(vq, vq->log_base);
>  
> - if (ret || vq->iotlb)
> + if (!ret || vq->iotlb)
>   return ret;
>  
>   return vq_access_ok(vq, vq->num, vq->desc, vq->avail, vq->used);
> -- 
> 2.14.3
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: kernel BUG at drivers/vhost/vhost.c:LINE! (2)

2018-04-09 Thread Stefan Hajnoczi
On Mon, Apr 9, 2018 at 11:28 AM, Stefan Hajnoczi  wrote:
> On Mon, Apr 09, 2018 at 05:44:36AM +0300, Michael S. Tsirkin wrote:
>> On Mon, Apr 09, 2018 at 10:37:45AM +0800, Stefan Hajnoczi wrote:
>> > On Sat, Apr 7, 2018 at 3:02 AM, syzbot
>> >  wrote:
>> > > syzbot hit the following crash on upstream commit
>> > > 38c23685b273cfb4ccf31a199feccce3bdcb5d83 (Fri Apr 6 04:29:35 2018 +)
>> > > Merge tag 'armsoc-drivers' of
>> > > git://git.kernel.org/pub/scm/linux/kernel/git/arm/arm-soc
>> > > syzbot dashboard link:
>> > > https://syzkaller.appspot.com/bug?extid=65a84dde0214b0387ccd
>> >
>> > To prevent duplicated work: I am working on this one.
>> >
>> > Stefan
>>
>> Do you want to try this patchset:
>> https://lkml.org/lkml/2018/4/5/665
>>
>> ?
>
> Thanks, I'll give it a shot.
>
> I also noticed a regression in commit
> d65026c6c62e7d9616c8ceb5a53b68bcdc050525 ("vhost: validate log when
> IOTLB is enabled") and am currently testing a fix.

I have sent a fix:
https://lkml.org/lkml/2018/4/9/390

Stefan
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[PATCH] vhost: fix vhost_vq_access_ok() log check

2018-04-09 Thread Stefan Hajnoczi
Commit d65026c6c62e7d9616c8ceb5a53b68bcdc050525 ("vhost: validate log
when IOTLB is enabled") introduced a regression.  The logic was
originally:

  if (vq->iotlb)
  return 1;
  return A && B;

After the patch the short-circuit logic for A was inverted:

  if (A || vq->iotlb)
  return A;
  return B;

The correct logic is:

  if (!A || vq->iotlb)
  return A;
  return B;

Reported-by: syzbot+65a84dde0214b0387...@syzkaller.appspotmail.com
Cc: Jason Wang 
Signed-off-by: Stefan Hajnoczi 
---
 drivers/vhost/vhost.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 5320039671b7..f6af4210679a 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1246,7 +1246,7 @@ int vhost_vq_access_ok(struct vhost_virtqueue *vq)
 {
int ret = vq_log_access_ok(vq, vq->log_base);
 
-   if (ret || vq->iotlb)
+   if (!ret || vq->iotlb)
return ret;
 
return vq_access_ok(vq, vq->num, vq->desc, vq->avail, vq->used);
-- 
2.14.3

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v31 2/4] virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_HINT

2018-04-09 Thread Wei Wang

On 04/09/2018 02:03 PM, Michael S. Tsirkin wrote:

On Fri, Apr 06, 2018 at 08:17:23PM +0800, Wei Wang wrote:

Negotiation of the VIRTIO_BALLOON_F_FREE_PAGE_HINT feature indicates the
support of reporting hints of guest free pages to host via virtio-balloon.

Host requests the guest to report free page hints by sending a new cmd
id to the guest via the free_page_report_cmd_id configuration register.

When the guest starts to report, the first element added to the free page
vq is the cmd id given by host. When the guest finishes the reporting
of all the free pages, VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID is added
to the vq to tell host that the reporting is done. Host polls the free
page vq after sending the starting cmd id, so the guest doesn't need to
kick after filling an element to the vq.

Host may also requests the guest to stop the reporting in advance by
sending the stop cmd id to the guest via the configuration register.

Signed-off-by: Wei Wang 
Signed-off-by: Liang Li 
Cc: Michael S. Tsirkin 
Cc: Michal Hocko 


Pretty good by now, Minor comments below.


Thanks for the comments.




---
  drivers/virtio/virtio_balloon.c | 272 +++-
  include/uapi/linux/virtio_balloon.h |   4 +
  2 files changed, 240 insertions(+), 36 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index dfe5684..aef73ee 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -51,9 +51,22 @@ MODULE_PARM_DESC(oom_pages, "pages to free on OOM");
  static struct vfsmount *balloon_mnt;
  #endif
  
+enum virtio_balloon_vq {

+   VIRTIO_BALLOON_VQ_INFLATE,
+   VIRTIO_BALLOON_VQ_DEFLATE,
+   VIRTIO_BALLOON_VQ_STATS,
+   VIRTIO_BALLOON_VQ_FREE_PAGE,
+   VIRTIO_BALLOON_VQ_MAX
+};
+
  struct virtio_balloon {
struct virtio_device *vdev;
-   struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
+   struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
+
+   /* Balloon's own wq for cpu-intensive work items */
+   struct workqueue_struct *balloon_wq;
+   /* The free page reporting work item submitted to the balloon wq */
+   struct work_struct report_free_page_work;
  
  	/* The balloon servicing is delegated to a freezable workqueue. */

struct work_struct update_balloon_stats_work;
@@ -63,6 +76,13 @@ struct virtio_balloon {
spinlock_t stop_update_lock;
bool stop_update;
  
+	/* The new cmd id received from host */

+   uint32_t cmd_id_received;
+   /* The cmd id that is in use */
+   __virtio32 cmd_id_use;

I'd prefer cmd_id_active but it's not critical.


OK, will change.



+
+static void report_free_page_func(struct work_struct *work)
+{
+   struct virtio_balloon *vb;
+   struct virtqueue *vq;
+   unsigned int unused;
+   int ret;
+
+   vb = container_of(work, struct virtio_balloon, report_free_page_work);
+   vq = vb->free_page_vq;
+
+   /* Start by sending the received cmd id to host with an outbuf. */
+   ret = send_start_cmd_id(vb, vb->cmd_id_received);
+   if (unlikely(ret))
+   goto err;
+
+   ret = walk_free_mem_block(vb, 0, &virtio_balloon_send_free_pages);
+   if (unlikely(ret == -EIO))
+   goto err;
why is EIO special? I think you should special-case EINTR maybe.


Actually EINTR isn't an error which needs to bail out. That's just the 
case that the vq is full, that hint isn't added. Maybe it is not 
necessary to treat the "vq full" case as an error.
How about just returning "0" when the vq is full, instead of returning 
"EINTR"? (The next hint will continue to be added)







+
+   /* End by sending a stop id to host with an outbuf. */
+   ret = send_stop_cmd_id(vb);
+   if (likely(!ret)) {

What happens on failure? Don't we need to detach anyway?


Yes. Please see below, we could make some more change.



+   /* Ending: detach all the used buffers from the vq. */
+   while (vq->num_free != virtqueue_get_vring_size(vq))
+   virtqueue_get_buf(vq, &unused);

This isn't all that happens here. It also waits for buffers to
be consumed. Is this by design? And why is it good idea to
busy poll while doing it?


Because host and guest operations happen asynchronously. When the guest 
reaches here, host may have not put anything to the vq yet. How about 
doing this via the free page vq handler?
Host will send a free page vq interrupt before exiting the optimization. 
I think this would be nicer.



Best,
Wei
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [RFC PATCH net-next v5 3/4] virtio_net: Extend virtio to use VF datapath when available

2018-04-09 Thread Jiri Pirko
Sat, Apr 07, 2018 at 12:59:14AM CEST, sridhar.samudr...@intel.com wrote:
>On 4/6/2018 5:48 AM, Jiri Pirko wrote:
>> Thu, Apr 05, 2018 at 11:08:22PM CEST, sridhar.samudr...@intel.com wrote:

[...]

>> > +static int virtnet_bypass_join_child(struct net_device *bypass_netdev,
>> > +   struct net_device *child_netdev)
>> > +{
>> > +  struct virtnet_bypass_info *vbi;
>> > +  bool backup;
>> > +
>> > +  vbi = netdev_priv(bypass_netdev);
>> > +  backup = (child_netdev->dev.parent == bypass_netdev->dev.parent);
>> > +  if (backup ? rtnl_dereference(vbi->backup_netdev) :
>> > +  rtnl_dereference(vbi->active_netdev)) {
>> > +  netdev_info(bypass_netdev,
>> > +  "%s attempting to join bypass dev when %s already 
>> > present\n",
>> > +  child_netdev->name, backup ? "backup" : "active");
>> Bypass module should check if there is already some other netdev
>> enslaved and refuse right there.
>
>This will work for virtio-net with 3 netdev model, but this check has to be 
>done by netvsc
>as its bypass_netdev is same as the backup_netdev.
>Will add a flag while registering with the bypass module to indicate if the 
>driver is doing
>a 2 netdev or 3 netdev model and based on that flag this check can be done in 
>bypass module
>for 3 netdev scenario.

Just let me undestand it clearly. What I expect the difference would be
between 2netdev and3 netdev model is this:
2netdev:
  bypass_master
 /
/
VF_slave

3netdev:
  bypass_master
 / \
/   \
VF_slave   backup_slave

Is that correct? If not, how does it look like?

Thanks!
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v31 2/4] virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_HINT

2018-04-09 Thread Michael S. Tsirkin
On Fri, Apr 06, 2018 at 08:17:23PM +0800, Wei Wang wrote:
> Negotiation of the VIRTIO_BALLOON_F_FREE_PAGE_HINT feature indicates the
> support of reporting hints of guest free pages to host via virtio-balloon.
> 
> Host requests the guest to report free page hints by sending a new cmd
> id to the guest via the free_page_report_cmd_id configuration register.
> 
> When the guest starts to report, the first element added to the free page
> vq is the cmd id given by host. When the guest finishes the reporting
> of all the free pages, VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID is added
> to the vq to tell host that the reporting is done. Host polls the free
> page vq after sending the starting cmd id, so the guest doesn't need to
> kick after filling an element to the vq.
> 
> Host may also requests the guest to stop the reporting in advance by
> sending the stop cmd id to the guest via the configuration register.
> 
> Signed-off-by: Wei Wang 
> Signed-off-by: Liang Li 
> Cc: Michael S. Tsirkin 
> Cc: Michal Hocko 


Pretty good by now, Minor comments below.

> ---
>  drivers/virtio/virtio_balloon.c | 272 
> +++-
>  include/uapi/linux/virtio_balloon.h |   4 +
>  2 files changed, 240 insertions(+), 36 deletions(-)
> 
> diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
> index dfe5684..aef73ee 100644
> --- a/drivers/virtio/virtio_balloon.c
> +++ b/drivers/virtio/virtio_balloon.c
> @@ -51,9 +51,22 @@ MODULE_PARM_DESC(oom_pages, "pages to free on OOM");
>  static struct vfsmount *balloon_mnt;
>  #endif
>  
> +enum virtio_balloon_vq {
> + VIRTIO_BALLOON_VQ_INFLATE,
> + VIRTIO_BALLOON_VQ_DEFLATE,
> + VIRTIO_BALLOON_VQ_STATS,
> + VIRTIO_BALLOON_VQ_FREE_PAGE,
> + VIRTIO_BALLOON_VQ_MAX
> +};
> +
>  struct virtio_balloon {
>   struct virtio_device *vdev;
> - struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
> + struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
> +
> + /* Balloon's own wq for cpu-intensive work items */
> + struct workqueue_struct *balloon_wq;
> + /* The free page reporting work item submitted to the balloon wq */
> + struct work_struct report_free_page_work;
>  
>   /* The balloon servicing is delegated to a freezable workqueue. */
>   struct work_struct update_balloon_stats_work;
> @@ -63,6 +76,13 @@ struct virtio_balloon {
>   spinlock_t stop_update_lock;
>   bool stop_update;
>  
> + /* The new cmd id received from host */
> + uint32_t cmd_id_received;
> + /* The cmd id that is in use */
> + __virtio32 cmd_id_use;

I'd prefer cmd_id_active but it's not critical.

> + /* Buffer to store the stop sign */
> + __virtio32 stop_cmd_id;
> +
>   /* Waiting for host to ack the pages we released. */
>   wait_queue_head_t acked;
>  
> @@ -320,17 +340,6 @@ static void stats_handle_request(struct virtio_balloon 
> *vb)
>   virtqueue_kick(vq);
>  }
>  
> -static void virtballoon_changed(struct virtio_device *vdev)
> -{
> - struct virtio_balloon *vb = vdev->priv;
> - unsigned long flags;
> -
> - spin_lock_irqsave(&vb->stop_update_lock, flags);
> - if (!vb->stop_update)
> - queue_work(system_freezable_wq, &vb->update_balloon_size_work);
> - spin_unlock_irqrestore(&vb->stop_update_lock, flags);
> -}
> -
>  static inline s64 towards_target(struct virtio_balloon *vb)
>  {
>   s64 target;
> @@ -347,6 +356,34 @@ static inline s64 towards_target(struct virtio_balloon 
> *vb)
>   return target - vb->num_pages;
>  }
>  
> +static void virtballoon_changed(struct virtio_device *vdev)
> +{
> + struct virtio_balloon *vb = vdev->priv;
> + unsigned long flags;
> + s64 diff = towards_target(vb);
> +
> + if (diff) {
> + spin_lock_irqsave(&vb->stop_update_lock, flags);
> + if (!vb->stop_update)
> + queue_work(system_freezable_wq,
> +&vb->update_balloon_size_work);
> + spin_unlock_irqrestore(&vb->stop_update_lock, flags);
> + }
> +
> + if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
> + virtio_cread(vdev, struct virtio_balloon_config,
> +  free_page_report_cmd_id, &vb->cmd_id_received);
> + if (vb->cmd_id_received !=
> + VIRTIO_BALLOON_FREE_PAGE_REPORT_STOP_ID) {
> + spin_lock_irqsave(&vb->stop_update_lock, flags);
> + if (!vb->stop_update)
> + queue_work(vb->balloon_wq,
> +&vb->report_free_page_work);
> + spin_unlock_irqrestore(&vb->stop_update_lock, flags);
> + }
> +}
> +
>  static void update_balloon_size(struct virtio_balloon *vb)
>  {
>   u32 actual = vb->num_pages;
> @@ -421,42 +458,178 @@ static void