[RFC PATCH V3 4/5] vhost: introduce helpers to get the size of metadata area

2018-12-29 Thread Jason Wang
Signed-off-by: Jason Wang 
---
 drivers/vhost/vhost.c | 46 ++-
 1 file changed, 28 insertions(+), 18 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 27b5c03feaac..54b43feef8d9 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -489,6 +489,27 @@ bool vhost_dev_has_owner(struct vhost_dev *dev)
 }
 EXPORT_SYMBOL_GPL(vhost_dev_has_owner);
 
+static size_t vhost_get_avail_size(struct vhost_virtqueue *vq, int num)
+{
+   size_t event = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
+
+   return sizeof(*vq->avail) +
+  sizeof(*vq->avail->ring) * num + event;
+}
+
+static size_t vhost_get_used_size(struct vhost_virtqueue *vq, int num)
+{
+   size_t event = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
+
+   return sizeof(*vq->used) +
+  sizeof(*vq->used->ring) * num + event;
+}
+
+static size_t vhost_get_desc_size(struct vhost_virtqueue *vq, int num)
+{
+   return sizeof(*vq->desc) * num;
+}
+
 /* Caller should have device mutex */
 long vhost_dev_set_owner(struct vhost_dev *dev)
 {
@@ -1248,13 +1269,9 @@ static bool vq_access_ok(struct vhost_virtqueue *vq, 
unsigned int num,
 struct vring_used __user *used)
 
 {
-   size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
-
-   return access_ok(VERIFY_READ, desc, num * sizeof *desc) &&
-  access_ok(VERIFY_READ, avail,
-sizeof *avail + num * sizeof *avail->ring + s) &&
-  access_ok(VERIFY_WRITE, used,
-   sizeof *used + num * sizeof *used->ring + s);
+   return access_ok(VERIFY_READ, desc, vhost_get_desc_size(vq, num)) &&
+  access_ok(VERIFY_READ, avail, vhost_get_avail_size(vq, num)) &&
+  access_ok(VERIFY_WRITE, used, vhost_get_used_size(vq, num));
 }
 
 static void vhost_vq_meta_update(struct vhost_virtqueue *vq,
@@ -1306,22 +1323,18 @@ static bool iotlb_access_ok(struct vhost_virtqueue *vq,
 
 int vq_meta_prefetch(struct vhost_virtqueue *vq)
 {
-   size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
unsigned int num = vq->num;
 
if (!vq->iotlb)
return 1;
 
return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc,
-  num * sizeof(*vq->desc), VHOST_ADDR_DESC) &&
+  vhost_get_desc_size(vq, num), VHOST_ADDR_DESC) &&
   iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->avail,
-  sizeof *vq->avail +
-  num * sizeof(*vq->avail->ring) + s,
+  vhost_get_avail_size(vq, num),
   VHOST_ADDR_AVAIL) &&
   iotlb_access_ok(vq, VHOST_ACCESS_WO, (u64)(uintptr_t)vq->used,
-  sizeof *vq->used +
-  num * sizeof(*vq->used->ring) + s,
-  VHOST_ADDR_USED);
+  vhost_get_used_size(vq, num), VHOST_ADDR_USED);
 }
 EXPORT_SYMBOL_GPL(vq_meta_prefetch);
 
@@ -1338,13 +1351,10 @@ EXPORT_SYMBOL_GPL(vhost_log_access_ok);
 static bool vq_log_access_ok(struct vhost_virtqueue *vq,
 void __user *log_base)
 {
-   size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
-
return vq_memory_access_ok(log_base, vq->umem,
   vhost_has_feature(vq, VHOST_F_LOG_ALL)) &&
(!vq->log_used || log_access_ok(log_base, vq->log_addr,
-   sizeof *vq->used +
-   vq->num * sizeof *vq->used->ring + s));
+ vhost_get_used_size(vq, vq->num)));
 }
 
 /* Can we start vq? */
-- 
2.17.1

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[RFC PATCH V3 3/5] vhost: rename vq_iotlb_prefetch() to vq_meta_prefetch()

2018-12-29 Thread Jason Wang
Rename the function to be more accurate, since it actually tries to
prefetch the vq metadata addresses in the IOTLB. This will be used by the
following patch to prefetch metadata virtual addresses.

Signed-off-by: Jason Wang 
---
 drivers/vhost/net.c   | 4 ++--
 drivers/vhost/vhost.c | 4 ++--
 drivers/vhost/vhost.h | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 36f3d0f49e60..0b4b3deab5aa 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -971,7 +971,7 @@ static void handle_tx(struct vhost_net *net)
if (!sock)
goto out;
 
-   if (!vq_iotlb_prefetch(vq))
+   if (!vq_meta_prefetch(vq))
goto out;
 
vhost_disable_notify(&net->dev, vq);
@@ -1140,7 +1140,7 @@ static void handle_rx(struct vhost_net *net)
if (!sock)
goto out;
 
-   if (!vq_iotlb_prefetch(vq))
+   if (!vq_meta_prefetch(vq))
goto out;
 
vhost_disable_notify(&net->dev, vq);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 337ce6f5a098..27b5c03feaac 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1304,7 +1304,7 @@ static bool iotlb_access_ok(struct vhost_virtqueue *vq,
return true;
 }
 
-int vq_iotlb_prefetch(struct vhost_virtqueue *vq)
+int vq_meta_prefetch(struct vhost_virtqueue *vq)
 {
size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
unsigned int num = vq->num;
@@ -1323,7 +1323,7 @@ int vq_iotlb_prefetch(struct vhost_virtqueue *vq)
   num * sizeof(*vq->used->ring) + s,
   VHOST_ADDR_USED);
 }
-EXPORT_SYMBOL_GPL(vq_iotlb_prefetch);
+EXPORT_SYMBOL_GPL(vq_meta_prefetch);
 
 /* Can we log writes? */
 /* Caller should have device mutex but not vq mutex */
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 466ef7542291..0d1ff977a43e 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -206,7 +206,7 @@ bool vhost_enable_notify(struct vhost_dev *, struct 
vhost_virtqueue *);
 
 int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
unsigned int log_num, u64 len);
-int vq_iotlb_prefetch(struct vhost_virtqueue *vq);
+int vq_meta_prefetch(struct vhost_virtqueue *vq);
 
 struct vhost_msg_node *vhost_new_msg(struct vhost_virtqueue *vq, int type);
 void vhost_enqueue_msg(struct vhost_dev *dev,
-- 
2.17.1

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[RFC PATCH V3 5/5] vhost: access vq metadata through kernel virtual address

2018-12-29 Thread Jason Wang
It was noticed that the copy_user() friends that are used to access
virtqueue metadata tend to be very expensive for dataplane
implementations like vhost, since they involve lots of software checks,
speculation barriers and hardware feature toggling (e.g. SMAP). The
extra cost is more obvious when transferring small packets, since
the time spent on metadata access becomes significant.

This patch tries to eliminate that overhead by accessing the metadata
through kernel virtual addresses set up by vmap(). To allow the pages to
be migrated, instead of pinning them through GUP, we use MMU notifiers to
invalidate vmaps and re-establish them during each round of metadata
prefetching if necessary. For devices that don't use metadata
prefetching, the memory accessors fall back to the normal copy_user()
implementation gracefully. The invalidation is synchronized with the
datapath through the vq mutex, and in order to avoid holding the vq mutex
during range checking, the MMU notifier is torn down when trying to modify
vq metadata.

Note that this was only done when device IOTLB is not enabled. We
could use similar method to optimize it in the future.

Tests show about a 24% improvement in TX PPS when using virtio-user +
vhost_net + xdp1 on TAP:

Before: ~5.0Mpps
After:  ~6.1Mpps

Signed-off-by: Jason Wang 
---
 drivers/vhost/vhost.c | 263 +-
 drivers/vhost/vhost.h |  13 +++
 2 files changed, 274 insertions(+), 2 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 54b43feef8d9..e1ecb8acf8a3 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -440,6 +440,9 @@ void vhost_dev_init(struct vhost_dev *dev,
vq->indirect = NULL;
vq->heads = NULL;
vq->dev = dev;
+   memset(&vq->avail_ring, 0, sizeof(vq->avail_ring));
+   memset(&vq->used_ring, 0, sizeof(vq->used_ring));
+   memset(&vq->desc_ring, 0, sizeof(vq->desc_ring));
mutex_init(&vq->mutex);
vhost_vq_reset(dev, vq);
if (vq->handle_kick)
@@ -510,6 +513,73 @@ static size_t vhost_get_desc_size(struct vhost_virtqueue 
*vq, int num)
return sizeof(*vq->desc) * num;
 }
 
+static void vhost_uninit_vmap(struct vhost_vmap *map)
+{
+   if (map->addr)
+   vunmap(map->unmap_addr);
+
+   map->addr = NULL;
+   map->unmap_addr = NULL;
+}
+
+static int vhost_invalidate_vmap(struct vhost_virtqueue *vq,
+struct vhost_vmap *map,
+unsigned long ustart,
+size_t size,
+unsigned long start,
+unsigned long end,
+bool blockable)
+{
+   if (end < ustart || start > ustart - 1 + size)
+   return 0;
+
+   if (!blockable)
+   return -EAGAIN;
+
+   mutex_lock(&vq->mutex);
+   vhost_uninit_vmap(map);
+   mutex_unlock(&vq->mutex);
+
+   return 0;
+}
+
+static int vhost_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
+struct mm_struct *mm,
+unsigned long start,
+unsigned long end,
+bool blockable)
+{
+   struct vhost_dev *dev = container_of(mn, struct vhost_dev,
+mmu_notifier);
+   int i;
+
+   for (i = 0; i < dev->nvqs; i++) {
+   struct vhost_virtqueue *vq = dev->vqs[i];
+
+   if (vhost_invalidate_vmap(vq, &vq->avail_ring,
+ (unsigned long)vq->avail,
+ vhost_get_avail_size(vq, vq->num),
+ start, end, blockable))
+   return -EAGAIN;
+   if (vhost_invalidate_vmap(vq, &vq->desc_ring,
+ (unsigned long)vq->desc,
+ vhost_get_desc_size(vq, vq->num),
+ start, end, blockable))
+   return -EAGAIN;
+   if (vhost_invalidate_vmap(vq, &vq->used_ring,
+ (unsigned long)vq->used,
+ vhost_get_used_size(vq, vq->num),
+ start, end, blockable))
+   return -EAGAIN;
+   }
+
+   return 0;
+}
+
+static const struct mmu_notifier_ops vhost_mmu_notifier_ops = {
+   .invalidate_range_start = vhost_mmu_notifier_invalidate_range_start,
+};
+
 /* Caller should have device mutex */
 long vhost_dev_set_owner(struct vhost_dev *dev)
 {
@@ -541,7 +611,14 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
if (err)
goto err_cgroup;
 
+   dev->mmu_notifier.ops = 

[RFC PATCH V3 2/5] vhost: fine grain userspace memory accessors

2018-12-29 Thread Jason Wang
This is used to hide the metadata address from virtqueue helpers. This
will allow implementing vmap-based fast access to metadata.

Signed-off-by: Jason Wang 
---
 drivers/vhost/vhost.c | 94 +++
 1 file changed, 77 insertions(+), 17 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index f179b5ee14c4..337ce6f5a098 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -868,6 +868,34 @@ static inline void __user *__vhost_get_user(struct 
vhost_virtqueue *vq,
ret; \
 })
 
+static inline int vhost_put_avail_event(struct vhost_virtqueue *vq)
+{
+   return vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
+ vhost_avail_event(vq));
+}
+
+static inline int vhost_put_used(struct vhost_virtqueue *vq,
+struct vring_used_elem *head, int idx,
+int count)
+{
+   return vhost_copy_to_user(vq, vq->used->ring + idx, head,
+ count * sizeof(*head));
+}
+
+static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
+
+{
+   return vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
+ &vq->used->flags);
+}
+
+static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
+
+{
+   return vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
+ &vq->used->idx);
+}
+
 #define vhost_get_user(vq, x, ptr, type)   \
 ({ \
int ret; \
@@ -906,6 +934,43 @@ static void vhost_dev_unlock_vqs(struct vhost_dev *d)
mutex_unlock(&d->vqs[i]->mutex);
 }
 
+static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
+ __virtio16 *idx)
+{
+   return vhost_get_avail(vq, *idx, &vq->avail->idx);
+}
+
+static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
+  __virtio16 *head, int idx)
+{
+   return vhost_get_avail(vq, *head,
+  &vq->avail->ring[idx & (vq->num - 1)]);
+}
+
+static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq,
+   __virtio16 *flags)
+{
+   return vhost_get_avail(vq, *flags, &vq->avail->flags);
+}
+
+static inline int vhost_get_used_event(struct vhost_virtqueue *vq,
+  __virtio16 *event)
+{
+   return vhost_get_avail(vq, *event, vhost_used_event(vq));
+}
+
+static inline int vhost_get_used_idx(struct vhost_virtqueue *vq,
+__virtio16 *idx)
+{
+   return vhost_get_used(vq, *idx, &vq->used->idx);
+}
+
+static inline int vhost_get_desc(struct vhost_virtqueue *vq,
+struct vring_desc *desc, int idx)
+{
+   return vhost_copy_from_user(vq, desc, vq->desc + idx, sizeof(*desc));
+}
+
 static int vhost_new_umem_range(struct vhost_umem *umem,
u64 start, u64 size, u64 end,
u64 userspace_addr, int perm)
@@ -1761,8 +1826,7 @@ EXPORT_SYMBOL_GPL(vhost_log_write);
 static int vhost_update_used_flags(struct vhost_virtqueue *vq)
 {
void __user *used;
-   if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
-  &vq->used->flags) < 0)
+   if (vhost_put_used_flags(vq))
return -EFAULT;
if (unlikely(vq->log_used)) {
/* Make sure the flag is seen before log. */
@@ -1780,8 +1844,7 @@ static int vhost_update_used_flags(struct vhost_virtqueue 
*vq)
 
 static int vhost_update_avail_event(struct vhost_virtqueue *vq, u16 
avail_event)
 {
-   if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
-  vhost_avail_event(vq)))
+   if (vhost_put_avail_event(vq))
return -EFAULT;
if (unlikely(vq->log_used)) {
void __user *used;
@@ -1818,7 +1881,7 @@ int vhost_vq_init_access(struct vhost_virtqueue *vq)
r = -EFAULT;
goto err;
}
-   r = vhost_get_used(vq, last_used_idx, &vq->used->idx);
+   r = vhost_get_used_idx(vq, &last_used_idx);
if (r) {
vq_err(vq, "Can't access used idx at %p\n",
   &vq->used->idx);
@@ -2017,7 +2080,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
last_avail_idx = vq->last_avail_idx;
 
if (vq->avail_idx == vq->last_avail_idx) {
-   if (unlikely(vhost_get_avail(vq, avail_idx, &vq->avail->idx))) {
+   if (unlikely(vhost_get_avail_idx(vq, &avail_idx))) {
vq_err(vq, "Failed to access avail idx at %p\n",
&vq->avail->idx);
return -EFAULT;
@@ -2044,8 +2107,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 
/* Grab the next descriptor number they're advertising, and increment
 * the index we've seen. */
-   if 

[RFC PATCH V3 1/5] vhost: generalize adding used elem

2018-12-29 Thread Jason Wang
Use one generic vhost_copy_to_user() instead of two dedicated
accessors. This will simplify the conversion to fine-grained
accessors. About a 2% improvement in PPS was seen during the virtio-user
txonly test.

Signed-off-by: Jason Wang 
---
 drivers/vhost/vhost.c | 11 +--
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 55e5aa662ad5..f179b5ee14c4 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2174,16 +2174,7 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq,
 
start = vq->last_used_idx & (vq->num - 1);
used = vq->used->ring + start;
-   if (count == 1) {
-   if (vhost_put_user(vq, heads[0].id, &used->id)) {
-   vq_err(vq, "Failed to write used id");
-   return -EFAULT;
-   }
-   if (vhost_put_user(vq, heads[0].len, &used->len)) {
-   vq_err(vq, "Failed to write used len");
-   return -EFAULT;
-   }
-   } else if (vhost_copy_to_user(vq, used, heads, count * sizeof *used)) {
+   if (vhost_copy_to_user(vq, used, heads, count * sizeof *used)) {
vq_err(vq, "Failed to write used");
return -EFAULT;
}
-- 
2.17.1

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[RFC PATCH V3 0/5] Hi:

2018-12-29 Thread Jason Wang
This series tries to access virtqueue metadata through kernel virtual
addresses instead of the copy_user() friends, since they have too much
overhead such as checks, speculation barriers or even hardware feature
toggling.

Test shows about 24% improvement on TX PPS. It should benefit other
cases as well.

Changes from V2:
- fix buggy range overlapping check
- tear down MMU notifier during vhost ioctl to make sure invalidation
  request can read metadata userspace address and vq size without
  holding vq mutex.
Changes from V1:
- instead of pinning pages, use MMU notifier to invalidate vmaps and
  remap during metadata prefetch
- fix build warning on MIPS

Jason Wang (5):
  vhost: generalize adding used elem
  vhost: fine grain userspace memory accessors
  vhost: rename vq_iotlb_prefetch() to vq_meta_prefetch()
  vhost: introduce helpers to get the size of metadata area
  vhost: access vq metadata through kernel virtual address

 drivers/vhost/net.c   |   4 +-
 drivers/vhost/vhost.c | 416 +-
 drivers/vhost/vhost.h |  15 +-
 3 files changed, 384 insertions(+), 51 deletions(-)

-- 
2.17.1

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [RFC PATCH V2 3/3] vhost: access vq metadata through kernel virtual address

2018-12-29 Thread Jason Wang


On 2018/12/29 上午3:34, David Miller wrote:

From: Jason Wang 
Date: Fri, 28 Dec 2018 15:55:37 +0800


+static int vhost_invalidate_vmap(struct vhost_virtqueue *vq,
+struct vhost_vmap *map,
+unsigned long uaddr,
+unsigned long start,
+unsigned long end,
+bool blockable)
+{
+   if (start < uaddr && end >= uaddr) {
+   if (!blockable)
+   return -EAGAIN;
+   mutex_lock(&vq->mutex);
+   if (map->addr)
+   vunmap(map->unmap_addr);
+   map->addr = NULL;
+   map->unmap_addr = NULL;
+   mutex_unlock(&vq->mutex);
+   }
+
+   return 0;
+}

What are the rules for these invalidate operations?

Can there be partial overlaps?  If so, wouldn't you need some way of
keeping track of the partially overlapping unmaps so that once all of
the invalidates covering the range occur you properly cleanup and do
the vunmap()?



Yes, there can be partial overlap, so the check is buggy. We will remap
the whole range in vq_meta_prefetch() before the datapath tries to use
it, so there's no need to track partial mappings here.


I spotted another bug: the caller will access vq->avail without being
synchronized with the vhost ioctl. Since we don't want to hold the vq mutex
for each invalidation, I will tear down the MMU notifier during vhost ioctl
to make sure invalidation requests can access them without holding the vq
mutex.


Thanks

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization