Re: [RFC PATCH V3 5/5] vhost: access vq metadata through kernel virtual address

2019-01-07 Thread Jason Wang


On 2019/1/5 上午5:34, Michael S. Tsirkin wrote:

On Sat, Dec 29, 2018 at 08:46:56PM +0800, Jason Wang wrote:

It was noticed that the copy_user() friends that were used to access
virtqueue metadata tend to be very expensive for dataplane
implementations like vhost since they involve lots of software checks,
speculation barriers, and hardware feature toggling (e.g. SMAP). The
extra cost will be more obvious when transferring small packets since
the time spent on metadata accesses becomes significant.

This patch tries to eliminate that overhead by accessing the metadata
through kernel virtual addresses set up by vmap(). To allow the pages
to be migrated, instead of pinning them through GUP, we use MMU
notifiers to invalidate the vmaps and re-establish them during each
round of metadata prefetching if necessary. For devices that don't use
metadata prefetching, the memory accessors fall back to the normal
copy_user() implementation gracefully. The invalidation is synchronized
with the datapath through the vq mutex, and in order to avoid holding
the vq mutex during range checking, the MMU notifier is torn down when
trying to modify vq metadata.

Note that this was only done when device IOTLB is not enabled. We
could use similar method to optimize it in the future.

Tests show a ~24% improvement in TX PPS when using virtio-user +
vhost_net + xdp1 on TAP:

Before: ~5.0Mpps
After:  ~6.1Mpps

Signed-off-by: Jason Wang 
---
  drivers/vhost/vhost.c | 263 +-
  drivers/vhost/vhost.h |  13 +++
  2 files changed, 274 insertions(+), 2 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 54b43feef8d9..e1ecb8acf8a3 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -440,6 +440,9 @@ void vhost_dev_init(struct vhost_dev *dev,
vq->indirect = NULL;
vq->heads = NULL;
vq->dev = dev;
+   memset(&vq->avail_ring, 0, sizeof(vq->avail_ring));
+   memset(&vq->used_ring, 0, sizeof(vq->used_ring));
+   memset(&vq->desc_ring, 0, sizeof(vq->desc_ring));
mutex_init(&vq->mutex);
vhost_vq_reset(dev, vq);
if (vq->handle_kick)
@@ -510,6 +513,73 @@ static size_t vhost_get_desc_size(struct vhost_virtqueue 
*vq, int num)
return sizeof(*vq->desc) * num;
  }
  
+static void vhost_uninit_vmap(struct vhost_vmap *map)

+{
+   if (map->addr)
+   vunmap(map->unmap_addr);
+
+   map->addr = NULL;
+   map->unmap_addr = NULL;
+}
+
+static int vhost_invalidate_vmap(struct vhost_virtqueue *vq,
+struct vhost_vmap *map,
+unsigned long ustart,
+size_t size,
+unsigned long start,
+unsigned long end,
+bool blockable)
+{
+   if (end < ustart || start > ustart - 1 + size)
+   return 0;
+
+   if (!blockable)
+   return -EAGAIN;
+
+   mutex_lock(&vq->mutex);
+   vhost_uninit_vmap(map);
+   mutex_unlock(&vq->mutex);
+
+   return 0;
+}
+
+static int vhost_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
+struct mm_struct *mm,
+unsigned long start,
+unsigned long end,
+bool blockable)
+{
+   struct vhost_dev *dev = container_of(mn, struct vhost_dev,
+mmu_notifier);
+   int i;
+
+   for (i = 0; i < dev->nvqs; i++) {
+   struct vhost_virtqueue *vq = dev->vqs[i];
+
+   if (vhost_invalidate_vmap(vq, &vq->avail_ring,
+ (unsigned long)vq->avail,
+ vhost_get_avail_size(vq, vq->num),
+ start, end, blockable))
+   return -EAGAIN;
+   if (vhost_invalidate_vmap(vq, &vq->desc_ring,
+ (unsigned long)vq->desc,
+ vhost_get_desc_size(vq, vq->num),
+ start, end, blockable))
+   return -EAGAIN;
+   if (vhost_invalidate_vmap(vq, &vq->used_ring,
+ (unsigned long)vq->used,
+ vhost_get_used_size(vq, vq->num),
+ start, end, blockable))
+   return -EAGAIN;
+   }
+
+   return 0;
+}
+
+static const struct mmu_notifier_ops vhost_mmu_notifier_ops = {
+   .invalidate_range_start = vhost_mmu_notifier_invalidate_range_start,
+};
+
  /* Caller should have device mutex */
  long vhost_dev_set_owner(struct vhost_dev *dev)
  {
@@ -541,7 +611,14 @@ long 

Re: [RFC PATCH V3 5/5] vhost: access vq metadata through kernel virtual address

2019-01-04 Thread Michael S. Tsirkin
On Sat, Dec 29, 2018 at 08:46:56PM +0800, Jason Wang wrote:
> It was noticed that the copy_user() friends that were used to access
> virtqueue metadata tend to be very expensive for dataplane
> implementations like vhost since they involve lots of software checks,
> speculation barriers, and hardware feature toggling (e.g. SMAP). The
> extra cost will be more obvious when transferring small packets since
> the time spent on metadata accesses becomes significant.
> 
> This patch tries to eliminate that overhead by accessing the metadata
> through kernel virtual addresses set up by vmap(). To allow the pages
> to be migrated, instead of pinning them through GUP, we use MMU
> notifiers to invalidate the vmaps and re-establish them during each
> round of metadata prefetching if necessary. For devices that don't use
> metadata prefetching, the memory accessors fall back to the normal
> copy_user() implementation gracefully. The invalidation is synchronized
> with the datapath through the vq mutex, and in order to avoid holding
> the vq mutex during range checking, the MMU notifier is torn down when
> trying to modify vq metadata.
> 
> Note that this was only done when device IOTLB is not enabled. We
> could use similar method to optimize it in the future.
> 
> Tests show a ~24% improvement in TX PPS when using virtio-user +
> vhost_net + xdp1 on TAP:
> 
> Before: ~5.0Mpps
> After:  ~6.1Mpps
> 
> Signed-off-by: Jason Wang 
> ---
>  drivers/vhost/vhost.c | 263 +-
>  drivers/vhost/vhost.h |  13 +++
>  2 files changed, 274 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 54b43feef8d9..e1ecb8acf8a3 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -440,6 +440,9 @@ void vhost_dev_init(struct vhost_dev *dev,
>   vq->indirect = NULL;
>   vq->heads = NULL;
>   vq->dev = dev;
> + memset(&vq->avail_ring, 0, sizeof(vq->avail_ring));
> + memset(&vq->used_ring, 0, sizeof(vq->used_ring));
> + memset(&vq->desc_ring, 0, sizeof(vq->desc_ring));
>   mutex_init(&vq->mutex);
>   vhost_vq_reset(dev, vq);
>   if (vq->handle_kick)
> @@ -510,6 +513,73 @@ static size_t vhost_get_desc_size(struct vhost_virtqueue 
> *vq, int num)
>   return sizeof(*vq->desc) * num;
>  }
>  
> +static void vhost_uninit_vmap(struct vhost_vmap *map)
> +{
> + if (map->addr)
> + vunmap(map->unmap_addr);
> +
> + map->addr = NULL;
> + map->unmap_addr = NULL;
> +}
> +
> +static int vhost_invalidate_vmap(struct vhost_virtqueue *vq,
> +  struct vhost_vmap *map,
> +  unsigned long ustart,
> +  size_t size,
> +  unsigned long start,
> +  unsigned long end,
> +  bool blockable)
> +{
> + if (end < ustart || start > ustart - 1 + size)
> + return 0;
> +
> + if (!blockable)
> + return -EAGAIN;
> +
> + mutex_lock(&vq->mutex);
> + vhost_uninit_vmap(map);
> + mutex_unlock(&vq->mutex);
> +
> + return 0;
> +}
> +
> +static int vhost_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
> +  struct mm_struct *mm,
> +  unsigned long start,
> +  unsigned long end,
> +  bool blockable)
> +{
> + struct vhost_dev *dev = container_of(mn, struct vhost_dev,
> +  mmu_notifier);
> + int i;
> +
> + for (i = 0; i < dev->nvqs; i++) {
> + struct vhost_virtqueue *vq = dev->vqs[i];
> +
> + if (vhost_invalidate_vmap(vq, &vq->avail_ring,
> +   (unsigned long)vq->avail,
> +   vhost_get_avail_size(vq, vq->num),
> +   start, end, blockable))
> + return -EAGAIN;
> + if (vhost_invalidate_vmap(vq, &vq->desc_ring,
> +   (unsigned long)vq->desc,
> +   vhost_get_desc_size(vq, vq->num),
> +   start, end, blockable))
> + return -EAGAIN;
> + if (vhost_invalidate_vmap(vq, &vq->used_ring,
> +   (unsigned long)vq->used,
> +   vhost_get_used_size(vq, vq->num),
> +   start, end, blockable))
> + return -EAGAIN;
> + }
> +
> + return 0;
> +}
> +
> +static const struct mmu_notifier_ops vhost_mmu_notifier_ops = {
> + .invalidate_range_start = vhost_mmu_notifier_invalidate_range_start,
> +};
> +
>  /* Caller should have device mutex */
>  long 

[RFC PATCH V3 5/5] vhost: access vq metadata through kernel virtual address

2018-12-29 Thread Jason Wang
It was noticed that the copy_user() friends that were used to access
virtqueue metadata tend to be very expensive for dataplane
implementations like vhost since they involve lots of software checks,
speculation barriers, and hardware feature toggling (e.g. SMAP). The
extra cost will be more obvious when transferring small packets since
the time spent on metadata accesses becomes significant.

This patch tries to eliminate that overhead by accessing the metadata
through kernel virtual addresses set up by vmap(). To allow the pages
to be migrated, instead of pinning them through GUP, we use MMU
notifiers to invalidate the vmaps and re-establish them during each
round of metadata prefetching if necessary. For devices that don't use
metadata prefetching, the memory accessors fall back to the normal
copy_user() implementation gracefully. The invalidation is synchronized
with the datapath through the vq mutex, and in order to avoid holding
the vq mutex during range checking, the MMU notifier is torn down when
trying to modify vq metadata.

Note that this was only done when device IOTLB is not enabled. We
could use similar method to optimize it in the future.

Tests show a ~24% improvement in TX PPS when using virtio-user +
vhost_net + xdp1 on TAP:

Before: ~5.0Mpps
After:  ~6.1Mpps

Signed-off-by: Jason Wang 
---
 drivers/vhost/vhost.c | 263 +-
 drivers/vhost/vhost.h |  13 +++
 2 files changed, 274 insertions(+), 2 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 54b43feef8d9..e1ecb8acf8a3 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -440,6 +440,9 @@ void vhost_dev_init(struct vhost_dev *dev,
vq->indirect = NULL;
vq->heads = NULL;
vq->dev = dev;
+   memset(&vq->avail_ring, 0, sizeof(vq->avail_ring));
+   memset(&vq->used_ring, 0, sizeof(vq->used_ring));
+   memset(&vq->desc_ring, 0, sizeof(vq->desc_ring));
mutex_init(&vq->mutex);
vhost_vq_reset(dev, vq);
if (vq->handle_kick)
@@ -510,6 +513,73 @@ static size_t vhost_get_desc_size(struct vhost_virtqueue 
*vq, int num)
return sizeof(*vq->desc) * num;
 }
 
+static void vhost_uninit_vmap(struct vhost_vmap *map)
+{
+   if (map->addr)
+   vunmap(map->unmap_addr);
+
+   map->addr = NULL;
+   map->unmap_addr = NULL;
+}
+
+static int vhost_invalidate_vmap(struct vhost_virtqueue *vq,
+struct vhost_vmap *map,
+unsigned long ustart,
+size_t size,
+unsigned long start,
+unsigned long end,
+bool blockable)
+{
+   if (end < ustart || start > ustart - 1 + size)
+   return 0;
+
+   if (!blockable)
+   return -EAGAIN;
+
+   mutex_lock(&vq->mutex);
+   vhost_uninit_vmap(map);
+   mutex_unlock(&vq->mutex);
+
+   return 0;
+}
+
+static int vhost_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
+struct mm_struct *mm,
+unsigned long start,
+unsigned long end,
+bool blockable)
+{
+   struct vhost_dev *dev = container_of(mn, struct vhost_dev,
+mmu_notifier);
+   int i;
+
+   for (i = 0; i < dev->nvqs; i++) {
+   struct vhost_virtqueue *vq = dev->vqs[i];
+
+   if (vhost_invalidate_vmap(vq, &vq->avail_ring,
+ (unsigned long)vq->avail,
+ vhost_get_avail_size(vq, vq->num),
+ start, end, blockable))
+   return -EAGAIN;
+   if (vhost_invalidate_vmap(vq, &vq->desc_ring,
+ (unsigned long)vq->desc,
+ vhost_get_desc_size(vq, vq->num),
+ start, end, blockable))
+   return -EAGAIN;
+   if (vhost_invalidate_vmap(vq, &vq->used_ring,
+ (unsigned long)vq->used,
+ vhost_get_used_size(vq, vq->num),
+ start, end, blockable))
+   return -EAGAIN;
+   }
+
+   return 0;
+}
+
+static const struct mmu_notifier_ops vhost_mmu_notifier_ops = {
+   .invalidate_range_start = vhost_mmu_notifier_invalidate_range_start,
+};
+
 /* Caller should have device mutex */
 long vhost_dev_set_owner(struct vhost_dev *dev)
 {
@@ -541,7 +611,14 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
if (err)
goto err_cgroup;
 
+   dev->mmu_notifier.ops =