[PATCH 7/7] virtio-ring: store DMA metadata in desc_extra for split virtqueue

2021-06-03 Thread Jason Wang
For split virtqueue, we used to depend on the address, length and
flags stored in the descriptor ring for DMA unmapping. This is unsafe
since the device can manipulate the behavior of the virtio driver,
IOMMU drivers and swiotlb through those fields.

For safety, maintain the DMA address, DMA length, descriptor flags and
next field of the non-indirect descriptors in vring_desc_extra when the
DMA API is used for virtio, as we already do for the packed virtqueue,
and use that metadata when performing DMA operations. Indirect
descriptors should be safe since they use streaming mappings.

With this, the descriptor ring is write-only from the view of the
driver.

This slightly increases the footprint of the driver, but the difference
is not noticeable in pktgen (64B) and netperf tests with virtio-net.

Signed-off-by: Jason Wang 
---
 drivers/virtio/virtio_ring.c | 112 +++
 1 file changed, 87 insertions(+), 25 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 9800f1c9ce4c..5f0076eeb39c 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -130,6 +130,7 @@ struct vring_virtqueue {
 
/* Per-descriptor state. */
struct vring_desc_state_split *desc_state;
+   struct vring_desc_extra *desc_extra;
 
/* DMA address and size information */
dma_addr_t queue_dma_addr;
@@ -364,8 +365,8 @@ static int vring_mapping_error(const struct vring_virtqueue 
*vq,
  * Split ring specific functions - *_split().
  */
 
-static void vring_unmap_one_split(const struct vring_virtqueue *vq,
- struct vring_desc *desc)
+static void vring_unmap_one_split_indirect(const struct vring_virtqueue *vq,
+  struct vring_desc *desc)
 {
u16 flags;
 
@@ -389,6 +390,35 @@ static void vring_unmap_one_split(const struct 
vring_virtqueue *vq,
}
 }
 
+static unsigned int vring_unmap_one_split(const struct vring_virtqueue *vq,
+ unsigned int i)
+{
+   struct vring_desc_extra *extra = vq->split.desc_extra;
+   u16 flags;
+
+   if (!vq->use_dma_api)
+   goto out;
+
+   flags = extra[i].flags;
+
+   if (flags & VRING_DESC_F_INDIRECT) {
+   dma_unmap_single(vring_dma_dev(vq),
+extra[i].addr,
+extra[i].len,
+(flags & VRING_DESC_F_WRITE) ?
+DMA_FROM_DEVICE : DMA_TO_DEVICE);
+   } else {
+   dma_unmap_page(vring_dma_dev(vq),
+  extra[i].addr,
+  extra[i].len,
+  (flags & VRING_DESC_F_WRITE) ?
+  DMA_FROM_DEVICE : DMA_TO_DEVICE);
+   }
+
+out:
+   return extra[i].next;
+}
+
 static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq,
   unsigned int total_sg,
   gfp_t gfp)
@@ -417,13 +447,28 @@ static inline unsigned int 
virtqueue_add_desc_split(struct virtqueue *vq,
unsigned int i,
dma_addr_t addr,
unsigned int len,
-   u16 flags)
+   u16 flags,
+   bool indirect)
 {
+   struct vring_virtqueue *vring = to_vvq(vq);
+   struct vring_desc_extra *extra = vring->split.desc_extra;
+   u16 next;
+
desc[i].flags = cpu_to_virtio16(vq->vdev, flags);
desc[i].addr = cpu_to_virtio64(vq->vdev, addr);
desc[i].len = cpu_to_virtio32(vq->vdev, len);
 
-   return virtio16_to_cpu(vq->vdev, desc[i].next);
+   if (!indirect) {
+   next = extra[i].next;
+   desc[i].next = cpu_to_virtio16(vq->vdev, next);
+
+   extra[i].addr = addr;
+   extra[i].len = len;
+   extra[i].flags = flags;
+   } else
+   next = virtio16_to_cpu(vq->vdev, desc[i].next);
+
+   return next;
 }
 
 static inline int virtqueue_add_split(struct virtqueue *_vq,
@@ -499,8 +544,12 @@ static inline int virtqueue_add_split(struct virtqueue 
*_vq,
goto unmap_release;
 
prev = i;
+   /* Note that we trust indirect descriptor
+* table since it uses stream DMA mapping.
+*/
i = virtqueue_add_desc_split(_vq, desc, i, addr, 
sg->length,
-VRING_DESC_F_NEXT);
+ 

[PATCH 6/7] virtio: use err label in __vring_new_virtqueue()

2021-06-03 Thread Jason Wang
Use an error label for unwinding in __vring_new_virtqueue(). This is useful
for future refactoring.

Signed-off-by: Jason Wang 
---
 drivers/virtio/virtio_ring.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 11dfa0dc8ec1..9800f1c9ce4c 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -2137,10 +2137,8 @@ struct virtqueue *__vring_new_virtqueue(unsigned int 
index,
 
vq->split.desc_state = kmalloc_array(vring.num,
sizeof(struct vring_desc_state_split), GFP_KERNEL);
-   if (!vq->split.desc_state) {
-   kfree(vq);
-   return NULL;
-   }
+   if (!vq->split.desc_state)
+   goto err_state;
 
/* Put everything in free lists. */
vq->free_head = 0;
@@ -2151,6 +2149,10 @@ struct virtqueue *__vring_new_virtqueue(unsigned int 
index,
 
	list_add_tail(&vq->vq.list, &vdev->vqs);
	return &vq->vq;
+
+err_state:
+   kfree(vq);
+   return NULL;
 }
 EXPORT_SYMBOL_GPL(__vring_new_virtqueue);
 
-- 
2.25.1



[PATCH 5/7] virtio_ring: introduce virtqueue_desc_add_split()

2021-06-03 Thread Jason Wang
This patch introduces a helper for storing a descriptor into the
descriptor table of the split virtqueue.

Signed-off-by: Jason Wang 
---
 drivers/virtio/virtio_ring.c | 39 ++--
 1 file changed, 24 insertions(+), 15 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 5509c2643fb1..11dfa0dc8ec1 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -412,6 +412,20 @@ static struct vring_desc *alloc_indirect_split(struct 
virtqueue *_vq,
return desc;
 }
 
+static inline unsigned int virtqueue_add_desc_split(struct virtqueue *vq,
+   struct vring_desc *desc,
+   unsigned int i,
+   dma_addr_t addr,
+   unsigned int len,
+   u16 flags)
+{
+   desc[i].flags = cpu_to_virtio16(vq->vdev, flags);
+   desc[i].addr = cpu_to_virtio64(vq->vdev, addr);
+   desc[i].len = cpu_to_virtio32(vq->vdev, len);
+
+   return virtio16_to_cpu(vq->vdev, desc[i].next);
+}
+
 static inline int virtqueue_add_split(struct virtqueue *_vq,
  struct scatterlist *sgs[],
  unsigned int total_sg,
@@ -484,11 +498,9 @@ static inline int virtqueue_add_split(struct virtqueue 
*_vq,
if (vring_mapping_error(vq, addr))
goto unmap_release;
 
-   desc[i].flags = cpu_to_virtio16(_vq->vdev, 
VRING_DESC_F_NEXT);
-   desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
-   desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
prev = i;
-   i = virtio16_to_cpu(_vq->vdev, desc[i].next);
+   i = virtqueue_add_desc_split(_vq, desc, i, addr, 
sg->length,
+VRING_DESC_F_NEXT);
}
}
for (; n < (out_sgs + in_sgs); n++) {
@@ -497,11 +509,11 @@ static inline int virtqueue_add_split(struct virtqueue 
*_vq,
if (vring_mapping_error(vq, addr))
goto unmap_release;
 
-   desc[i].flags = cpu_to_virtio16(_vq->vdev, 
VRING_DESC_F_NEXT | VRING_DESC_F_WRITE);
-   desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
-   desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
prev = i;
-   i = virtio16_to_cpu(_vq->vdev, desc[i].next);
+   i = virtqueue_add_desc_split(_vq, desc, i, addr,
+sg->length,
+VRING_DESC_F_NEXT |
+VRING_DESC_F_WRITE);
}
}
/* Last one doesn't continue. */
@@ -515,13 +527,10 @@ static inline int virtqueue_add_split(struct virtqueue 
*_vq,
if (vring_mapping_error(vq, addr))
goto unmap_release;
 
-   vq->split.vring.desc[head].flags = cpu_to_virtio16(_vq->vdev,
-   VRING_DESC_F_INDIRECT);
-   vq->split.vring.desc[head].addr = cpu_to_virtio64(_vq->vdev,
-   addr);
-
-   vq->split.vring.desc[head].len = cpu_to_virtio32(_vq->vdev,
-   total_sg * sizeof(struct vring_desc));
+   virtqueue_add_desc_split(_vq, vq->split.vring.desc,
+head, addr,
+total_sg * sizeof(struct vring_desc),
+VRING_DESC_F_INDIRECT);
}
 
/* We're using some buffers from the free list. */
-- 
2.25.1



[PATCH 4/7] virtio_ring: secure handling of mapping errors

2021-06-03 Thread Jason Wang
We should not depend on the DMA address, length and flags in the descriptor
table since they could be overwritten with arbitrary values by the device. So
this patch switches to using the copies stored in desc_extra.

Note that the indirect descriptors are fine since they are read-only
streaming mappings.

Signed-off-by: Jason Wang 
---
 drivers/virtio/virtio_ring.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 0cdd965dba58..5509c2643fb1 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -1213,13 +1213,16 @@ static inline int virtqueue_add_packed(struct virtqueue 
*_vq,
 unmap_release:
err_idx = i;
i = head;
+   curr = vq->free_head;
 
vq->packed.avail_used_flags = avail_used_flags;
 
for (n = 0; n < total_sg; n++) {
if (i == err_idx)
break;
-   vring_unmap_desc_packed(vq, &desc[i]);
+   vring_unmap_state_packed(vq,
+&vq->packed.desc_extra[curr]);
+   curr = vq->packed.desc_extra[curr].next;
i++;
if (i >= vq->packed.vring.num)
i = 0;
-- 
2.25.1



[PATCH 3/7] virtio-ring: factor out desc_extra allocation

2021-06-03 Thread Jason Wang
A helper is introduced for the logic of allocating the descriptor
extra data. This will be reused by the split virtqueue.

Signed-off-by: Jason Wang 
---
 drivers/virtio/virtio_ring.c | 30 --
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index c25ea5776687..0cdd965dba58 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -1550,6 +1550,25 @@ static void *virtqueue_detach_unused_buf_packed(struct 
virtqueue *_vq)
return NULL;
 }
 
+static struct vring_desc_extra *vring_alloc_desc_extra(struct vring_virtqueue 
*vq,
+  unsigned int num)
+{
+   struct vring_desc_extra *desc_extra;
+   unsigned int i;
+
+   desc_extra = kmalloc_array(num, sizeof(struct vring_desc_extra),
+  GFP_KERNEL);
+   if (!desc_extra)
+   return NULL;
+
+   memset(desc_extra, 0, num * sizeof(struct vring_desc_extra));
+
+   for (i = 0; i < num - 1; i++)
+   desc_extra[i].next = i + 1;
+
+   return desc_extra;
+}
+
 static struct virtqueue *vring_create_virtqueue_packed(
unsigned int index,
unsigned int num,
@@ -1567,7 +1586,6 @@ static struct virtqueue *vring_create_virtqueue_packed(
struct vring_packed_desc_event *driver, *device;
dma_addr_t ring_dma_addr, driver_event_dma_addr, device_event_dma_addr;
size_t ring_size_in_bytes, event_size_in_bytes;
-   unsigned int i;
 
ring_size_in_bytes = num * sizeof(struct vring_packed_desc);
 
@@ -1650,18 +1668,10 @@ static struct virtqueue *vring_create_virtqueue_packed(
/* Put everything in free lists. */
vq->free_head = 0;
 
-   vq->packed.desc_extra = kmalloc_array(num,
-   sizeof(struct vring_desc_extra),
-   GFP_KERNEL);
+   vq->packed.desc_extra = vring_alloc_desc_extra(vq, num);
if (!vq->packed.desc_extra)
goto err_desc_extra;
 
-   memset(vq->packed.desc_extra, 0,
-   num * sizeof(struct vring_desc_extra));
-
-   for (i = 0; i < num - 1; i++)
-   vq->packed.desc_extra[i].next = i + 1;
-
/* No callback?  Tell other side not to bother us. */
if (!callback) {
vq->packed.event_flags_shadow = VRING_PACKED_EVENT_FLAG_DISABLE;
-- 
2.25.1



[PATCH 2/7] virtio_ring: rename vring_desc_extra_packed

2021-06-03 Thread Jason Wang
Rename vring_desc_extra_packed to vring_desc_extra since the structure
is pretty generic and could be reused by the split virtqueue as well.

Signed-off-by: Jason Wang 
---
 drivers/virtio/virtio_ring.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index e1e9ed42e637..c25ea5776687 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -77,7 +77,7 @@ struct vring_desc_state_packed {
u16 last;   /* The last desc state in a list. */
 };
 
-struct vring_desc_extra_packed {
+struct vring_desc_extra {
dma_addr_t addr;/* Buffer DMA addr. */
u32 len;/* Buffer length. */
u16 flags;  /* Descriptor flags. */
@@ -166,7 +166,7 @@ struct vring_virtqueue {
 
/* Per-descriptor state. */
struct vring_desc_state_packed *desc_state;
-   struct vring_desc_extra_packed *desc_extra;
+   struct vring_desc_extra *desc_extra;
 
/* DMA address and size information */
dma_addr_t ring_dma_addr;
@@ -912,7 +912,7 @@ static struct virtqueue *vring_create_virtqueue_split(
  */
 
 static void vring_unmap_state_packed(const struct vring_virtqueue *vq,
-struct vring_desc_extra_packed *state)
+struct vring_desc_extra *state)
 {
u16 flags;
 
@@ -1651,13 +1651,13 @@ static struct virtqueue *vring_create_virtqueue_packed(
vq->free_head = 0;
 
vq->packed.desc_extra = kmalloc_array(num,
-   sizeof(struct vring_desc_extra_packed),
+   sizeof(struct vring_desc_extra),
GFP_KERNEL);
if (!vq->packed.desc_extra)
goto err_desc_extra;
 
memset(vq->packed.desc_extra, 0,
-   num * sizeof(struct vring_desc_extra_packed));
+   num * sizeof(struct vring_desc_extra));
 
for (i = 0; i < num - 1; i++)
vq->packed.desc_extra[i].next = i + 1;
-- 
2.25.1



[PATCH 1/7] virtio-ring: maintain next in extra state for packed virtqueue

2021-06-03 Thread Jason Wang
This patch moves next from vring_desc_state_packed to
vring_desc_extra_packed. This makes it simpler for the extra state
to be reused by the split virtqueue.

Signed-off-by: Jason Wang 
---
 drivers/virtio/virtio_ring.c | 15 ---
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 71e16b53e9c1..e1e9ed42e637 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -74,7 +74,6 @@ struct vring_desc_state_packed {
void *data; /* Data for callback. */
struct vring_packed_desc *indir_desc; /* Indirect descriptor, if any. */
u16 num;/* Descriptor list length. */
-   u16 next;   /* The next desc state in a list. */
u16 last;   /* The last desc state in a list. */
 };
 
@@ -82,6 +81,7 @@ struct vring_desc_extra_packed {
dma_addr_t addr;/* Buffer DMA addr. */
u32 len;/* Buffer length. */
u16 flags;  /* Descriptor flags. */
+   u16 next;   /* The next desc state in a list. */
 };
 
 struct vring_virtqueue {
@@ -1061,7 +1061,7 @@ static int virtqueue_add_indirect_packed(struct 
vring_virtqueue *vq,
1 << VRING_PACKED_DESC_F_USED;
}
vq->packed.next_avail_idx = n;
-   vq->free_head = vq->packed.desc_state[id].next;
+   vq->free_head = vq->packed.desc_extra[id].next;
 
/* Store token and indirect buffer state. */
vq->packed.desc_state[id].num = 1;
@@ -1169,7 +1169,7 @@ static inline int virtqueue_add_packed(struct virtqueue 
*_vq,
le16_to_cpu(flags);
}
prev = curr;
-   curr = vq->packed.desc_state[curr].next;
+   curr = vq->packed.desc_extra[curr].next;
 
if ((unlikely(++i >= vq->packed.vring.num))) {
i = 0;
@@ -1290,7 +1290,7 @@ static void detach_buf_packed(struct vring_virtqueue *vq,
/* Clear data ptr. */
state->data = NULL;
 
-   vq->packed.desc_state[state->last].next = vq->free_head;
+   vq->packed.desc_extra[state->last].next = vq->free_head;
vq->free_head = id;
vq->vq.num_free += state->num;
 
@@ -1299,7 +1299,7 @@ static void detach_buf_packed(struct vring_virtqueue *vq,
for (i = 0; i < state->num; i++) {
vring_unmap_state_packed(vq,
&vq->packed.desc_extra[curr]);
-   curr = vq->packed.desc_state[curr].next;
+   curr = vq->packed.desc_extra[curr].next;
}
}
 
@@ -1649,8 +1649,6 @@ static struct virtqueue *vring_create_virtqueue_packed(
 
/* Put everything in free lists. */
vq->free_head = 0;
-   for (i = 0; i < num-1; i++)
-   vq->packed.desc_state[i].next = i + 1;
 
vq->packed.desc_extra = kmalloc_array(num,
sizeof(struct vring_desc_extra_packed),
@@ -1661,6 +1659,9 @@ static struct virtqueue *vring_create_virtqueue_packed(
memset(vq->packed.desc_extra, 0,
num * sizeof(struct vring_desc_extra_packed));
 
+   for (i = 0; i < num - 1; i++)
+   vq->packed.desc_extra[i].next = i + 1;
+
/* No callback?  Tell other side not to bother us. */
if (!callback) {
vq->packed.event_flags_shadow = VRING_PACKED_EVENT_FLAG_DISABLE;
-- 
2.25.1



[PATCH 0/7] Do not read from descriptor ring

2021-06-03 Thread Jason Wang
Hi:

The virtio driver should not trust the device. This became more urgent
for the case of encrypted VMs or VDUSE[1]. In both cases, technology
like swiotlb/IOMMU is used to prevent the poking/mangling of memory
from the device. But this is not sufficient since the current virtio
driver may trust what is stored in the descriptor table (coherent
mapping) when performing DMA operations like unmap and bounce, so
the device may choose to utilize the behaviour of swiotlb to perform
attacks[2].

To protect against a malicious device, this series stores and uses the
descriptor metadata in an auxiliary structure which cannot be accessed
via swiotlb/device, instead of the copies in the descriptor table. This
means the descriptor table is write-only from the view of the driver.
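
For reference, the per-descriptor auxiliary structure this series relies on
(introduced for the packed virtqueue and generalized by patches 1 and 2)
looks like this:

	struct vring_desc_extra {
		dma_addr_t addr;	/* Buffer DMA addr. */
		u32 len;		/* Buffer length. */
		u16 flags;		/* Descriptor flags. */
		u16 next;		/* The next desc state in a list. */
	};

The unmap and error-handling paths read addr/len/flags/next from this
driver-private array instead of from the descriptor ring that the device
can write to.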

Actually, we've almost achieved that through the packed virtqueue and we
just need to fix a corner case in the handling of mapping errors. For the
split virtqueue we just follow what's done for the packed one.

Note that we don't duplicate descriptor metadata for indirect
descriptors since they use streaming mappings which are read-only, so
it's safe as long as the metadata of the non-indirect descriptors is correct.

For the split virtqueue, the change increases the footprint due to the
auxiliary metadata, but it's almost negligible in simple tests like
pktgen and netperf TCP stream (slightly noticeable in a 40GbE environment
with more CPU usage).

Lightly tested with packed on/off, iommu on/off, swiotlb force/off in
the guest.

Note that this series tries to fix attacks via the descriptor
ring. The other cases (used ring and config space) will be fixed by
other series or patches.

Please review.

Changes from RFC V2:
- no code change
- tweak the commit logs a little bit

Changes from RFC V1:
- Always use auxiliary metadata for split virtqueue
- Don't read from the descriptor ring when detaching indirect descriptors

Jason Wang (7):
  virtio-ring: maintain next in extra state for packed virtqueue
  virtio_ring: rename vring_desc_extra_packed
  virtio-ring: factor out desc_extra allocation
  virtio_ring: secure handling of mapping errors
  virtio_ring: introduce virtqueue_desc_add_split()
  virtio: use err label in __vring_new_virtqueue()
  virtio-ring: store DMA metadata in desc_extra for split virtqueue

 drivers/virtio/virtio_ring.c | 201 +--
 1 file changed, 144 insertions(+), 57 deletions(-)

-- 
2.25.1



Re: [RFC PATCH V2 0/7] Do not read from descriptor ring

2021-06-03 Thread Jason Wang


在 2021/5/14 下午7:13, Michael S. Tsirkin 写道:

On Thu, May 06, 2021 at 01:38:29PM +0100, Christoph Hellwig wrote:

On Thu, May 06, 2021 at 04:12:17AM -0400, Michael S. Tsirkin wrote:

Let's try for just a bit, won't make this window anyway:

I have an old idea. Add a way to find out that unmap is a nop
(or more exactly does not use the address/length).
Then in that case even with DMA API we do not need
the extra data. Hmm?

So we actually do have a check for that from the early days of the DMA
API, but it only works at compile time: CONFIG_NEED_DMA_MAP_STATE.

But given how rare configs without an iommu or swiotlb are these days
it has stopped to be very useful.  Unfortunately a runtime-version is
not entirely trivial, but maybe if we allow for false positives we
could do something like this

bool dma_direct_need_state(struct device *dev)
{
/* some areas could not be covered by any map at all */
if (dev->dma_range_map)
return false;
if (force_dma_unencrypted(dev))
return false;
if (dma_direct_need_sync(dev))
return false;
return *dev->dma_mask == DMA_BIT_MASK(64);
}

bool dma_need_state(struct device *dev)
{
const struct dma_map_ops *ops = get_dma_ops(dev);

if (dma_map_direct(dev, ops))
return dma_direct_need_state(dev);
return ops->unmap_page ||
ops->sync_single_for_cpu || ops->sync_single_for_device;
}

Yea that sounds like a good idea. We will need to document that.


Something like:

/*
  * dma_need_state - report whether unmap calls use the address and length
  * @dev: device to query
  *
  * This is a runtime version of CONFIG_NEED_DMA_MAP_STATE.
  *
  * Return the value indicating whether dma_unmap_* and dma_sync_* calls for 
the device
  * use the DMA state parameters passed to them.
  * The DMA state parameters are: scatter/gather list/table, address and
  * length.
  *
  * If dma_need_state returns false then DMA state parameters are
  * ignored by all dma_unmap_* and dma_sync_* calls, so it is safe to pass 0 for
  * address and length, and DMA_UNMAP_SG_TABLE_INVALID and
  * DMA_UNMAP_SG_LIST_INVALID for s/g table and length respectively.
  * If dma_need_state returns true then DMA state might
  * be used and so the actual values are required.
  */

And we will need DMA_UNMAP_SG_TABLE_INVALID and
DMA_UNMAP_SG_LIST_INVALID as pointers to an empty global table and list
for calls such as dma_unmap_sgtable that dereference pointers before checking
they are used.


Does this look good?

The table/length variants are for consistency, virtio specifically does
not use s/g at the moment, but it seems nicer than leaving
users wondering what to do about these.

Thoughts? Jason want to try implementing?
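
As an illustration only (dma_need_state() is just the proposal sketched
above, not an existing API, and vring_needs_unmap_data() is a made-up
name), a virtio-side consumer could look roughly like this:

	/* Hypothetical helper: only keep unmap metadata when the DMA
	 * state is actually consumed by dma_unmap_*()/dma_sync_*().
	 */
	static bool vring_needs_unmap_data(const struct vring_virtqueue *vq)
	{
		if (!vq->use_dma_api)
			return false;
		return dma_need_state(vring_dma_dev(vq));
	}

The point is only that the metadata copies could be skipped whenever the
helper returns false.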



I can add it to my todo list. If other people are interested in
this, please let us know.


But this is just about saving the effort of unmap; it doesn't
eliminate the necessity of using private memory (addr, length) as the
metadata for validating the device inputs.


And just to clarify, the slight regression we see is when testing without
VIRTIO_F_ACCESS_PLATFORM, which means the DMA API is not used.


So I will post a formal version of this series and we can start
from there.


Thanks







[PATCH] vdpa: mandate 1.0 device

2021-06-03 Thread Jason Wang
This patch mandates VIRTIO 1.0 for vDPA devices. The plan is to never
support transitional devices for vDPA in the future.

The reasons are:

- To keep the semantics of the normative statements in the virtio spec
  and eliminate the burden of transitional devices for both the vDPA
  bus and vDPA parents
- Eliminate the effort of dealing with endianness conversion in the
  management tools
- Mandate that vDPA vendors ship modern devices instead of transitional
  devices, which are easily broken and unsafe
- Transitional devices have never worked since the first day of vDPA

Signed-off-by: Jason Wang 
---
 include/linux/vdpa.h | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h
index f311d227aa1b..11dd05b805a7 100644
--- a/include/linux/vdpa.h
+++ b/include/linux/vdpa.h
@@ -6,6 +6,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /**
  * struct vdpa_calllback - vDPA callback definition.
@@ -328,6 +329,11 @@ static inline int vdpa_set_features(struct vdpa_device 
*vdev, u64 features)
 {
 const struct vdpa_config_ops *ops = vdev->config;
 
+/* Mandating 1.0 to have semantics of normative statements in
+ * the spec. */
+if (!(features & BIT_ULL(VIRTIO_F_VERSION_1)))
+   return -EINVAL;
+
vdev->features_valid = true;
 return ops->set_features(vdev, features);
 }
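
For a parent driver, the practical consequence is that the feature set it
offers must contain VIRTIO_F_VERSION_1, otherwise the check above makes
feature negotiation fail. A minimal sketch (example_get_features() is a
hypothetical stand-in for a parent's get_features callback):

	static u64 example_get_features(struct vdpa_device *vdev)
	{
		/* Must offer VERSION_1; vdpa_set_features() now rejects
		 * feature sets without it.
		 */
		return BIT_ULL(VIRTIO_F_VERSION_1) |
		       BIT_ULL(VIRTIO_F_ACCESS_PLATFORM);
	}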
-- 
2.25.1



Re: [PATCH net] virtio-net: fix for skb_over_panic inside big mode

2021-06-03 Thread Xuan Zhuo
On Fri, 4 Jun 2021 11:00:25 +0800, Jason Wang  wrote:
>
> 在 2021/6/4 上午10:30, Xuan Zhuo 写道:
> > On Fri, 4 Jun 2021 10:28:41 +0800, Jason Wang  wrote:
> >> 在 2021/6/4 上午1:09, Xuan Zhuo 写道:
> >>> In virtio-net's large packet mode, there is a hole in the space behind
> >>> buf.
> >>
> >> before the buf actually or behind the vnet header?
> >>
> >>
> >>>   hdr_padded_len - hdr_len
> >>>
> >>> We must take this into account when calculating tailroom.
> >>>
> >>> [   44.544385] skb_put.cold (net/core/skbuff.c:5254 (discriminator 1) 
> >>> net/core/skbuff.c:5252 (discriminator 1))
> >>> [   44.544864] page_to_skb (drivers/net/virtio_net.c:485) [   44.545361] 
> >>> receive_buf (drivers/net/virtio_net.c:849 drivers/net/virtio_net.c:1131)
> >>> [   44.545870] ? netif_receive_skb_list_internal (net/core/dev.c:5714)
> >>> [   44.546628] ? dev_gro_receive (net/core/dev.c:6103)
> >>> [   44.547135] ? napi_complete_done (./include/linux/list.h:35 
> >>> net/core/dev.c:5867 net/core/dev.c:5862 net/core/dev.c:6565)
> >>> [   44.547672] virtnet_poll (drivers/net/virtio_net.c:1427 
> >>> drivers/net/virtio_net.c:1525)
> >>> [   44.548251] __napi_poll (net/core/dev.c:6985)
> >>> [   44.548744] net_rx_action (net/core/dev.c:7054 net/core/dev.c:7139)
> >>> [   44.549264] __do_softirq (./arch/x86/include/asm/jump_label.h:19 
> >>> ./include/linux/jump_label.h:200 ./include/trace/events/irq.h:142 
> >>> kernel/softirq.c:560)
> >>> [   44.549762] irq_exit_rcu (kernel/softirq.c:433 kernel/softirq.c:637 
> >>> kernel/softirq.c:649)
> >>> [   44.551384] common_interrupt (arch/x86/kernel/irq.c:240 (discriminator 
> >>> 13))
> >>> [   44.551991] ? asm_common_interrupt 
> >>> (./arch/x86/include/asm/idtentry.h:638)
> >>> [   44.552654] asm_common_interrupt 
> >>> (./arch/x86/include/asm/idtentry.h:638)
> >>>
> >>> Fixes: fb32856b16ad ("virtio-net: page_to_skb() use build_skb when 
> >>> there's sufficient tailroom")
> >>> Signed-off-by: Xuan Zhuo 
> >>> Reported-by: Corentin Noël 
> >>> Tested-by: Corentin Noël 
> >>> ---
> >>>drivers/net/virtio_net.c | 2 +-
> >>>1 file changed, 1 insertion(+), 1 deletion(-)
> >>>
> >>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> >>> index fa407eb8b457..78a01c71a17c 100644
> >>> --- a/drivers/net/virtio_net.c
> >>> +++ b/drivers/net/virtio_net.c
> >>> @@ -406,7 +406,7 @@ static struct sk_buff *page_to_skb(struct 
> >>> virtnet_info *vi,
> >>>* add_recvbuf_mergeable() + get_mergeable_buf_len()
> >>>*/
> >>>   truesize = headroom ? PAGE_SIZE : truesize;
> >>> - tailroom = truesize - len - headroom;
> >>> + tailroom = truesize - len - headroom - (hdr_padded_len - hdr_len);
> >>
> >> The patch looks correct and I saw it has been merged.
> >>
> >> But I prefer to do that in receive_big() instead of here.
> >>
> >> Thanks
> > How?
> >
> > change truesize or headroom?
> >
> > I didn't find a good way. Do you have a good way?
>
>
> Something like the following? The API is designed to let the caller
> pass a correct headroom instead of figuring it out by itself.
>
>      struct sk_buff *skb =
>      page_to_skb(vi, rq, page, 0, len, PAGE_SIZE, true, 0,
> hdr_padded_len - hdr_len);
>
> Thanks


This line may be affected.

buf = p - headroom;

In my opinion, this changes the semantics of the original headroom. The meaning
of headroom in big mode and mergeable mode would then differ. The more confusing
problem is that the parameters of page_to_skb() are getting more and more
chaotic, so I wrote the previous patch. Of course, I understand your concern;
this patch may bring more questions, although I did a lot of tests.

"virtio-net: Refactor the code related to page_to_skb"

But I hope that our code development moves in the direction this patch
realizes. I hope that the meaning of the parameters can be made clearer.

Do you think this is ok?

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 78a01c71a17c..6d62bb45a188 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -380,34 +380,20 @@ static struct sk_buff *page_to_skb(struct virtnet_info 
*vi,
   struct page *page, unsigned int offset,
   unsigned int len, unsigned int truesize,
   bool hdr_valid, unsigned int metasize,
-  unsigned int headroom)
+  int tailroom, char *buf,
+  unsigned int hdr_padded_len)
 {
struct sk_buff *skb;
struct virtio_net_hdr_mrg_rxbuf *hdr;
-   unsigned int copy, hdr_len, hdr_padded_len;
+   unsigned int copy, hdr_len;
struct page *page_to_free = NULL;
-   int tailroom, shinfo_size;
-   char *p, *hdr_p, *buf;
+   int shinfo_size;
+   char *p, *hdr_p;

p = page_address(page) + offset;
hdr_p = p;

hdr_len = vi->hdr_len;
-  

Re: [PATCH net] virtio-net: fix for skb_over_panic inside big mode

2021-06-03 Thread Jason Wang


在 2021/6/4 上午10:30, Xuan Zhuo 写道:

On Fri, 4 Jun 2021 10:28:41 +0800, Jason Wang  wrote:

在 2021/6/4 上午1:09, Xuan Zhuo 写道:

In virtio-net's large packet mode, there is a hole in the space behind
buf.


before the buf actually or behind the vnet header?



  hdr_padded_len - hdr_len

We must take this into account when calculating tailroom.

[   44.544385] skb_put.cold (net/core/skbuff.c:5254 (discriminator 1) 
net/core/skbuff.c:5252 (discriminator 1))
[   44.544864] page_to_skb (drivers/net/virtio_net.c:485) [   44.545361] 
receive_buf (drivers/net/virtio_net.c:849 drivers/net/virtio_net.c:1131)
[   44.545870] ? netif_receive_skb_list_internal (net/core/dev.c:5714)
[   44.546628] ? dev_gro_receive (net/core/dev.c:6103)
[   44.547135] ? napi_complete_done (./include/linux/list.h:35 
net/core/dev.c:5867 net/core/dev.c:5862 net/core/dev.c:6565)
[   44.547672] virtnet_poll (drivers/net/virtio_net.c:1427 
drivers/net/virtio_net.c:1525)
[   44.548251] __napi_poll (net/core/dev.c:6985)
[   44.548744] net_rx_action (net/core/dev.c:7054 net/core/dev.c:7139)
[   44.549264] __do_softirq (./arch/x86/include/asm/jump_label.h:19 
./include/linux/jump_label.h:200 ./include/trace/events/irq.h:142 
kernel/softirq.c:560)
[   44.549762] irq_exit_rcu (kernel/softirq.c:433 kernel/softirq.c:637 
kernel/softirq.c:649)
[   44.551384] common_interrupt (arch/x86/kernel/irq.c:240 (discriminator 13))
[   44.551991] ? asm_common_interrupt (./arch/x86/include/asm/idtentry.h:638)
[   44.552654] asm_common_interrupt (./arch/x86/include/asm/idtentry.h:638)

Fixes: fb32856b16ad ("virtio-net: page_to_skb() use build_skb when there's 
sufficient tailroom")
Signed-off-by: Xuan Zhuo 
Reported-by: Corentin Noël 
Tested-by: Corentin Noël 
---
   drivers/net/virtio_net.c | 2 +-
   1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index fa407eb8b457..78a01c71a17c 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -406,7 +406,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 * add_recvbuf_mergeable() + get_mergeable_buf_len()
 */
truesize = headroom ? PAGE_SIZE : truesize;
-   tailroom = truesize - len - headroom;
+   tailroom = truesize - len - headroom - (hdr_padded_len - hdr_len);


The patch looks correct and I saw it has been merged.

But I prefer to do that in receive_big() instead of here.

Thanks

How?

change truesize or headroom?

I didn't find a good way. Do you have a good way?



Something like the following? The API is designed to let the caller
pass a correct headroom instead of figuring it out by itself.


    struct sk_buff *skb =
    page_to_skb(vi, rq, page, 0, len, PAGE_SIZE, true, 0, 
hdr_padded_len - hdr_len);


Thanks




Thanks.





buf = p - headroom;

len -= hdr_len;



Re: [PATCH net] virtio-net: fix for skb_over_panic inside big mode

2021-06-03 Thread Xuan Zhuo
On Fri, 4 Jun 2021 10:28:41 +0800, Jason Wang  wrote:
>
> 在 2021/6/4 上午1:09, Xuan Zhuo 写道:
> > In virtio-net's large packet mode, there is a hole in the space behind
> > buf.
>
>
> before the buf actually or behind the vnet header?
>
>
> >
> >  hdr_padded_len - hdr_len
> >
> > We must take this into account when calculating tailroom.
> >
> > [   44.544385] skb_put.cold (net/core/skbuff.c:5254 (discriminator 1) 
> > net/core/skbuff.c:5252 (discriminator 1))
> > [   44.544864] page_to_skb (drivers/net/virtio_net.c:485) [   44.545361] 
> > receive_buf (drivers/net/virtio_net.c:849 drivers/net/virtio_net.c:1131)
> > [   44.545870] ? netif_receive_skb_list_internal (net/core/dev.c:5714)
> > [   44.546628] ? dev_gro_receive (net/core/dev.c:6103)
> > [   44.547135] ? napi_complete_done (./include/linux/list.h:35 
> > net/core/dev.c:5867 net/core/dev.c:5862 net/core/dev.c:6565)
> > [   44.547672] virtnet_poll (drivers/net/virtio_net.c:1427 
> > drivers/net/virtio_net.c:1525)
> > [   44.548251] __napi_poll (net/core/dev.c:6985)
> > [   44.548744] net_rx_action (net/core/dev.c:7054 net/core/dev.c:7139)
> > [   44.549264] __do_softirq (./arch/x86/include/asm/jump_label.h:19 
> > ./include/linux/jump_label.h:200 ./include/trace/events/irq.h:142 
> > kernel/softirq.c:560)
> > [   44.549762] irq_exit_rcu (kernel/softirq.c:433 kernel/softirq.c:637 
> > kernel/softirq.c:649)
> > [   44.551384] common_interrupt (arch/x86/kernel/irq.c:240 (discriminator 
> > 13))
> > [   44.551991] ? asm_common_interrupt 
> > (./arch/x86/include/asm/idtentry.h:638)
> > [   44.552654] asm_common_interrupt (./arch/x86/include/asm/idtentry.h:638)
> >
> > Fixes: fb32856b16ad ("virtio-net: page_to_skb() use build_skb when there's 
> > sufficient tailroom")
> > Signed-off-by: Xuan Zhuo 
> > Reported-by: Corentin Noël 
> > Tested-by: Corentin Noël 
> > ---
> >   drivers/net/virtio_net.c | 2 +-
> >   1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index fa407eb8b457..78a01c71a17c 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -406,7 +406,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info 
> > *vi,
> >  * add_recvbuf_mergeable() + get_mergeable_buf_len()
> >  */
> > truesize = headroom ? PAGE_SIZE : truesize;
> > -   tailroom = truesize - len - headroom;
> > +   tailroom = truesize - len - headroom - (hdr_padded_len - hdr_len);
>
>
> The patch looks correct and I saw it has been merged.
>
> But I prefer to do that in receive_big() instead of here.
>
> Thanks

How?

change truesize or headroom?

I didn't find a good way. Do you have a good way?

Thanks.

>
>
>
> > buf = p - headroom;
> >
> > len -= hdr_len;
>

Re: [PATCH net-next] virtio_net: set link state down when virtqueue is broken

2021-06-03 Thread Jason Wang


在 2021/6/3 下午7:34, wangyunjian 写道:

-Original Message-
From: Jason Wang [mailto:jasow...@redhat.com]
Sent: Monday, May 31, 2021 11:29 AM
To: wangyunjian ; net...@vger.kernel.org
Cc: k...@kernel.org; da...@davemloft.net; m...@redhat.com;
virtualization@lists.linux-foundation.org; dingxiaoxiong

Subject: Re: [PATCH net-next] virtio_net: set link state down when virtqueue is
broken


在 2021/5/28 下午6:58, wangyunjian 写道:

-Original Message-

From: Yunjian Wang 

The NIC can't receive/send packets if a rx/tx virtqueue is broken.
However, the link state of the NIC is still normal. As a result, the
user cannot detect the NIC exception.

Don't we have:

	/* This should not happen! */
	if (unlikely(err)) {
		dev->stats.tx_fifo_errors++;
		if (net_ratelimit())
			dev_warn(&dev->dev,
				 "Unexpected TXQ (%d) queue failure: %d\n",
				 qnum, err);
		dev->stats.tx_dropped++;
		dev_kfree_skb_any(skb);
		return NETDEV_TX_OK;
	}

Which should be sufficient?

There may be other reasons for this error, e.g. -ENOSPC (no free desc).


This should not happen unless the device or driver is buggy. We always reserved
sufficient slots:

      if (sq->vq->num_free < 2+MAX_SKB_FRAGS) {
      netif_stop_subqueue(dev, qnum); ...



And if rx virtqueue is broken, there is no error statistics.


Feel free to add one if it's necessary.

In the current receive scenario, it is impossible to distinguish whether the
reason for not receiving packets is a broken virtqueue or simply no packets.



Can we introduce rx_fifo_errors for that?
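
A minimal sketch of such a counter bump, assuming it is hooked somewhere in
the receive path (hypothetical placement, just to illustrate the idea):

	/* Hypothetical: account a broken rx virtqueue so userspace can at
	 * least see an error counter rising instead of pure silence.
	 */
	if (unlikely(virtqueue_is_broken(rq->vq))) {
		dev->stats.rx_fifo_errors++;
		return 0; /* nothing will ever be received here */
	}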





Let's leave the policy decision (link down) to userspace.



The driver can set the link state down when the virtqueue is broken.
If the state is down, the user can switch over to another NIC.

Note that, we probably need the watchdog for virtio-net in order to
be a complete solution.

Yes, what I can think of is that the virtqueue-broken exception is detected
by the watchdog.

Is there anything else that needs to be done?


Basically, it's all about the TX stall which the watchdog tries to catch. A
broken vq is only one of the possible reasons.

Are there any plans for the watchdog?



Somebody posted a prototype 3 or 4 years ago; you can search for it and
maybe we can start from there.
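
For context, the hook in question is the regular netdev TX watchdog; a
minimal hypothetical wiring for virtio-net could look like the sketch below
(virtnet_tx_timeout() is an illustrative name, not existing code):

	static void virtnet_tx_timeout(struct net_device *dev, unsigned int txqueue)
	{
		struct virtnet_info *vi = netdev_priv(dev);

		/* Report the stall; a broken vq is only one possible cause. */
		netdev_err(dev, "TX timeout on queue %u (vq broken: %d)\n",
			   txqueue, virtqueue_is_broken(vi->sq[txqueue].vq));
	}

	/* wired up via .ndo_tx_timeout plus dev->watchdog_timeo in probe */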


Thanks




Thanks


Thanks



Thanks


Thanks





Re: [PATCH] vdpa/mlx5: Fix setting the correct dma_device

2021-06-03 Thread Jason Wang


在 2021/6/3 下午7:22, Eli Cohen 写道:

Before SF support was introduced, the DMA device was equal to
mdev->device, which was in essence equal to pdev->dev.
With SF introduction this is no longer true. It has already been
handled for vhost_vdpa since the reference to the DMA device comes from
within mlx5_vdpa. With virtio_vdpa this broke. To fix this, we set the
real DMA device when initializing the device.

Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices")



Not sure this is correct; according to the commit log, it should be the
patch that introduces the SF or aux bus support for vDPA.




Signed-off-by: Eli Cohen 



Patch looks correct.

Thanks



---
  drivers/vdpa/mlx5/net/mlx5_vnet.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index bc33f2c523d3..a4ff158181e0 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -2046,7 +2046,7 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev 
*v_mdev, const char *name)
if (err)
goto err_mtu;
  
-	mvdev->vdev.dma_dev = mdev->device;

+   mvdev->vdev.dma_dev = &mdev->pdev->dev;
err = mlx5_vdpa_alloc_resources(>mvdev);
if (err)
goto err_mtu;



Re: [PATCH v1] vdpa/mlx5: Clear vq ready indication upon device reset

2021-06-03 Thread Jason Wang


在 2021/6/3 下午4:10, Eli Cohen 写道:

After device reset, the virtqueues are not ready, so clear the ready
field.

Failing to do so can result in virtio_vdpa failing to load if the device
was previously used by vhost_vdpa and the old ready values are still set.
virtio_vdpa expects to find VQs in the "not ready" state.

Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices")
Signed-off-by: Eli Cohen 
---
v0 --> v1:
   Make sure clearing of ready is done only in the reset flow

  drivers/vdpa/mlx5/net/mlx5_vnet.c | 9 +
  1 file changed, 9 insertions(+)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 02a05492204c..eaeae67e0ff0 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1766,6 +1766,14 @@ static void teardown_driver(struct mlx5_vdpa_net *ndev)
	mutex_unlock(&ndev->reslock);
  }
  
+static void clear_vqs_ready(struct mlx5_vdpa_net *ndev)

+{
+   int i;
+
+   for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--)
+   ndev->vqs[i].ready = false;



The patch looks correct, but I wonder about the reason for the decreasing
index used in the loop.


Usually, it means there is some reason it must be done that way.

Thanks



+}
+
  static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status)
  {
struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
@@ -1776,6 +1784,7 @@ static void mlx5_vdpa_set_status(struct vdpa_device 
*vdev, u8 status)
if (!status) {
mlx5_vdpa_info(mvdev, "performing device reset\n");
teardown_driver(ndev);
+   clear_vqs_ready(ndev);
	mlx5_vdpa_destroy_mr(&ndev->mvdev);
ndev->mvdev.status = 0;
ndev->mvdev.mlx_features = 0;



Re: [PATCH v2] vdpa/mlx5: Add support for doorbell bypassing

2021-06-03 Thread Jason Wang


在 2021/6/3 下午4:11, Eli Cohen 写道:

Implement mlx5_get_vq_notification() to return the doorbell address.
Since the notification area is mapped to userspace, make sure that the
BAR size is at least PAGE_SIZE large.

Signed-off-by: Eli Cohen 
---
v0 --> v1:
   Make sure SF bar size is not smaller than PAGE_SIZE
v1 --> v2:
   Remove test on addr alignment since it's already done by the caller.



Acked-by: Jason Wang 




  drivers/vdpa/mlx5/core/mlx5_vdpa.h |  1 +
  drivers/vdpa/mlx5/core/resources.c |  1 +
  drivers/vdpa/mlx5/net/mlx5_vnet.c  | 14 ++
  3 files changed, 16 insertions(+)

diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h 
b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
index 09a16a3d1b2a..0002b2136b48 100644
--- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h
+++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
@@ -42,6 +42,7 @@ struct mlx5_vdpa_resources {
u32 pdn;
struct mlx5_uars_page *uar;
void __iomem *kick_addr;
+   u64 phys_kick_addr;
u16 uid;
u32 null_mkey;
bool valid;
diff --git a/drivers/vdpa/mlx5/core/resources.c 
b/drivers/vdpa/mlx5/core/resources.c
index 836ab9ef0fa6..d4606213f88a 100644
--- a/drivers/vdpa/mlx5/core/resources.c
+++ b/drivers/vdpa/mlx5/core/resources.c
@@ -253,6 +253,7 @@ int mlx5_vdpa_alloc_resources(struct mlx5_vdpa_dev *mvdev)
goto err_key;
  
  	kick_addr = mdev->bar_addr + offset;

+   res->phys_kick_addr = kick_addr;
  
  	res->kick_addr = ioremap(kick_addr, PAGE_SIZE);

if (!res->kick_addr) {
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 689d3fa61e08..bc33f2c523d3 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1879,8 +1879,22 @@ static void mlx5_vdpa_free(struct vdpa_device *vdev)
  
  static struct vdpa_notification_area mlx5_get_vq_notification(struct vdpa_device *vdev, u16 idx)

  {
+   struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
struct vdpa_notification_area ret = {};
+   struct mlx5_vdpa_net *ndev;
+   phys_addr_t addr;
  
+	/* If SF BAR size is smaller than PAGE_SIZE, do not use direct
+	 * notification to avoid the risk of mapping pages that contain BAR of
+	 * more than one SF
+	 */
+   if (MLX5_CAP_GEN(mvdev->mdev, log_min_sf_size) + 12 < PAGE_SHIFT)
+   return ret;
+
+   ndev = to_mlx5_vdpa_ndev(mvdev);
+   addr = (phys_addr_t)ndev->mvdev.res.phys_kick_addr;
+   ret.addr = addr;
+   ret.size = PAGE_SIZE;
return ret;
  }
  



Re: [PATCH v1 1/8] virtio: Force only split mode with protected guest

2021-06-03 Thread Jason Wang


在 2021/6/3 下午9:55, Andi Kleen 写道:


Ok, but what I meant is this: if we don't read from the descriptor
ring, and we validate all the other metadata supplied by the device
(used id and len), then there should be no way for the device to
suppress the dma flags to write to the indirect descriptor table.


Or do you have an example how it can do that?


I don't. If you can validate everything it's probably ok

The only drawback is even more code to audit and test.

-Andi




Ok, then I'm going to post a formal series, please have a look and we 
can start from there.


Thanks


Re: [PATCH net] virtio-net: fix for skb_over_panic inside big mode

2021-06-03 Thread Jason Wang


在 2021/6/4 上午1:09, Xuan Zhuo 写道:

In virtio-net's large packet mode, there is a hole in the space behind
buf.



before the buf actually or behind the vnet header?




 hdr_padded_len - hdr_len

We must take this into account when calculating tailroom.

[   44.544385] skb_put.cold (net/core/skbuff.c:5254 (discriminator 1) 
net/core/skbuff.c:5252 (discriminator 1))
[   44.544864] page_to_skb (drivers/net/virtio_net.c:485) [   44.545361] 
receive_buf (drivers/net/virtio_net.c:849 drivers/net/virtio_net.c:1131)
[   44.545870] ? netif_receive_skb_list_internal (net/core/dev.c:5714)
[   44.546628] ? dev_gro_receive (net/core/dev.c:6103)
[   44.547135] ? napi_complete_done (./include/linux/list.h:35 
net/core/dev.c:5867 net/core/dev.c:5862 net/core/dev.c:6565)
[   44.547672] virtnet_poll (drivers/net/virtio_net.c:1427 
drivers/net/virtio_net.c:1525)
[   44.548251] __napi_poll (net/core/dev.c:6985)
[   44.548744] net_rx_action (net/core/dev.c:7054 net/core/dev.c:7139)
[   44.549264] __do_softirq (./arch/x86/include/asm/jump_label.h:19 
./include/linux/jump_label.h:200 ./include/trace/events/irq.h:142 
kernel/softirq.c:560)
[   44.549762] irq_exit_rcu (kernel/softirq.c:433 kernel/softirq.c:637 
kernel/softirq.c:649)
[   44.551384] common_interrupt (arch/x86/kernel/irq.c:240 (discriminator 13))
[   44.551991] ? asm_common_interrupt (./arch/x86/include/asm/idtentry.h:638)
[   44.552654] asm_common_interrupt (./arch/x86/include/asm/idtentry.h:638)

Fixes: fb32856b16ad ("virtio-net: page_to_skb() use build_skb when there's 
sufficient tailroom")
Signed-off-by: Xuan Zhuo 
Reported-by: Corentin Noël 
Tested-by: Corentin Noël 
---
  drivers/net/virtio_net.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index fa407eb8b457..78a01c71a17c 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -406,7 +406,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 * add_recvbuf_mergeable() + get_mergeable_buf_len()
 */
truesize = headroom ? PAGE_SIZE : truesize;
-   tailroom = truesize - len - headroom;
+   tailroom = truesize - len - headroom - (hdr_padded_len - hdr_len);



The patch looks correct and I saw it has been merged.

But I prefer to do that in receive_big() instead of here.

Thanks




buf = p - headroom;
  
  	len -= hdr_len;



Re: [PATCH v1 1/8] virtio: Force only split mode with protected guest

2021-06-03 Thread Jason Wang


在 2021/6/4 上午1:33, Andy Lutomirski 写道:

On 6/2/21 5:41 PM, Andi Kleen wrote:

Only allow split mode when in a protected guest. Follow-on
patches harden the split mode code paths, and we don't want
a malicious host to force anything else. Also disallow
indirect mode for similar reasons.

I read this as "the virtio driver is buggy.  Let's disable most of the
buggy code in one special case in which we need a driver without bugs.
In all the other cases (e.g. hardware virtio device connected over
USB-C), driver bugs are still allowed."

Can we just fix the driver without special cases?



I think we can, this is what this series tries to do:

https://www.spinics.net/lists/kvm/msg241825.html

It tries to fix this without special-casing any specific features.

Thanks





--Andy




Re: [PATCH v1 1/8] virtio: Force only split mode with protected guest

2021-06-03 Thread Andi Kleen




For most Linux drivers, a report that a misbehaving device can corrupt
host memory is a bug, not a feature.  If a USB device can corrupt kernel
memory, that's a serious bug.  If a USB-C device can corrupt kernel
memory, that's also a serious bug, although, sadly, we probably have
lots of these bugs.  If a Firewire device can corrupt kernel memory,
news at 11.  If a Bluetooth or WiFi peer can corrupt kernel memory,
people write sonnets about it and give it clever names.  Why is virtio
special?


Well for most cases it's pointless because they don't have any memory 
protection anyways.


Why break compatibility if it does not buy you anything?

Anyways if you want to enable the restricted mode for something else, 
it's easy to do. The cases where it matters seem to already work on it, 
like the user space virtio ring.


My changes for boundary checking are enabled unconditionally anyways, as 
well as the other patchkits.





This one:

int arch_has_restricted_virtio_memory_access(void)
+{
+   return is_tdx_guest();
+}

I'm looking at a fairly recent kernel, and I don't see anything for s390
wired up in vring_use_dma_api.


It's not using vring_use_dma_api, but enforces the DMA API at virtio 
ring setup time, same as SEV/TDX.
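
Roughly, the existing hook gates feature negotiation; a simplified sketch of
the kind of check involved (not the exact upstream code):

	if (arch_has_restricted_virtio_memory_access() &&
	    !virtio_has_feature(dev, VIRTIO_F_ACCESS_PLATFORM)) {
		dev_warn(&dev->dev,
			 "device must provide VIRTIO_F_ACCESS_PLATFORM\n");
		return -ENODEV;
	}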


-Andi



Re: [PATCH v1 1/8] virtio: Force only split mode with protected guest

2021-06-03 Thread Andy Lutomirski
On 6/3/21 4:32 PM, Andi Kleen wrote:
> 
>> We do not need an increasing pile of kludges
> 
> Do you mean disabling features is a kludge?
> 
> If yes I disagree with that characterization.
> 
> 
>> to make TDX and SEV “secure”.  We need the actual loaded driver to be
>> secure.  The virtio architecture is full of legacy nonsense,
>> and there is no good reason for SEV and TDX to be a giant special case.
> 
> I don't know where you see a "giant special case". Except for the
> limited feature negotiation all the changes are common, and the
> disabling of features (which is not new BTW, but already done e.g. with
> forcing DMA API in some cases) can be of course used by all these other
> technologies too. But it just cannot be done by default for everything
> because it would break compatibility. So every technology with such
> requirements has to explicitly opt-in.
> 
> 
>>
>> As I said before, real PCIe (Thunderbolt/USB-C or anything else) has
>> the exact same problem.  The fact that TDX has encrypted memory is, at
>> best, a poor proxy for the actual condition.  The actual condition is
>> that the host does not trust the device to implement the virtio
>> protocol correctly.
> 
> Right they can do similar limitations of feature sets. But again it
> cannot be default.

Let me try again.

For most Linux drivers, a report that a misbehaving device can corrupt
host memory is a bug, not a feature.  If a USB device can corrupt kernel
memory, that's a serious bug.  If a USB-C device can corrupt kernel
memory, that's also a serious bug, although, sadly, we probably have
lots of these bugs.  If a Firewire device can corrupt kernel memory,
news at 11.  If a Bluetooth or WiFi peer can corrupt kernel memory,
people write sonnets about it and give it clever names.  Why is virtio
special?

If, for some reason, the virtio driver cannot be fixed so that it is
secure and compatible [1], then I think that the limited cases that are
secure should be accessible to anyone, with or without TDX.  Have a
virtio.secure_mode module option or a udev-controllable parameter or an
alternative driver name or *something*.  An alternative driver name
would allow userspace to prevent the insecure mode from auto-binding to
devices.  And make whatever system configures encrypted guests for
security use this mode.  (Linux is not going to be magically secure just
by booting it in TDX.  There's a whole process of unsealing or remote
attestation, something needs to prevent the hypervisor from connecting a
virtual keyboard and typing init=/bin/bash, something needs to provision
an SSH key, etc.)

In my opinion, it is not so great to identify bugs in the driver and
then say that they're only being fixed for TDX and SEV.

Keep in mind that, as I understand it, there is nothing virt specific
about virtio.  There are real physical devices that speak virtio.

[1] The DMA quirk is nasty.  Fortunately, it's the only case I'm aware
of in which the virtio driver genuinely cannot be made secure and
compatible at the smae time.  Also, fortunately, most real deployments
except on powerpc work just fine with the DMA quirk unquirked.

> 
> 
>>
>>>
>>> TDX and SEV use the arch hook to enforce DMA API, so that part is also
>>> solved.
>>>
>> Can you point me to the code you’re referring to?
> 
> See 4/8 in this patch kit. It uses an existing hook which is already
> used in tree by s390.

This one:

int arch_has_restricted_virtio_memory_access(void)
+{
+   return is_tdx_guest();
+}

I'm looking at a fairly recent kernel, and I don't see anything for s390
wired up in vring_use_dma_api.


Re: [PATCH v1 1/8] virtio: Force only split mode with protected guest

2021-06-03 Thread Jason Wang


在 2021/6/4 上午2:00, Andi Kleen 写道:


On 6/3/2021 10:33 AM, Andy Lutomirski wrote:

On 6/2/21 5:41 PM, Andi Kleen wrote:

Only allow split mode when in a protected guest. Follow-on
patches harden the split mode code paths, and we don't want
a malicious host to force anything else. Also disallow
indirect mode for similar reasons.

I read this as "the virtio driver is buggy.  Let's disable most of the
buggy code in one special case in which we need a driver without bugs.
In all the other cases (e.g. hardware virtio device connected over
USB-C), driver bugs are still allowed."


My understanding is most of the other modes (except for split with 
separate descriptors) are obsolete and just there for compatibility. 
As long as they're deprecated they won't harm anyone.


-Andi



For "mode" do you packed vs split? If yes, it's not just for 
compatibility. Though packed virtqueue is designed to be more hardware 
friendly, most hardware vendors choose to start from split.


Thanks


Re: [PATCH v1 1/8] virtio: Force only split mode with protected guest

2021-06-03 Thread Jason Wang


在 2021/6/4 上午3:31, Andy Lutomirski 写道:


On Thu, Jun 3, 2021, at 11:00 AM, Andi Kleen wrote:

On 6/3/2021 10:33 AM, Andy Lutomirski wrote:

On 6/2/21 5:41 PM, Andi Kleen wrote:

Only allow split mode when in a protected guest. Follow-on
patches harden the split mode code paths, and we don't want
a malicious host to force anything else. Also disallow
indirect mode for similar reasons.

I read this as "the virtio driver is buggy.  Let's disable most of the
buggy code in one special case in which we need a driver without bugs.
In all the other cases (e.g. hardware virtio device connected over
USB-C), driver bugs are still allowed."

My understanding is most of the other modes (except for split with
separate descriptors) are obsolete and just there for compatibility. As
long as they're deprecated they won't harm anyone.



Tell that to every crypto downgrade attack ever.

I see two credible solutions:

1. Actually harden the virtio driver.

2. Have a new virtio-modern driver and use it for modern use cases. Maybe 
rename the old driver virtio-legacy or virtio-insecure.  They can share code.



Note that we have already split the legacy driver out, and it can be turned
off via Kconfig.





Another snag you may hit: virtio’s heuristic for whether to use proper DMA ops 
or to bypass them is a giant kludge. I’m very slightly optimistic that getting 
the heuristic wrong will make the driver fail to operate but won’t allow the 
host to take over the guest, but I’m not really convinced. And I wrote that 
code!  A virtio-modern mode probably should not have a heuristic, and the 
various iommu-bypassing modes should be fixed to work at the bus level, not the 
device level.



I remember there's a very long discussion about this and probably 
without any conclusion. Fortunately, the management layer has been 
taught to enforce VIRTIO_F_ACCESS_PLATFORM for encrypted guests.


A possible way to fix this without any conflicts is to mandate
VIRTIO_F_ACCESS_PLATFORM in version 1.2.


Thanks







Re: [PATCH v1 1/8] virtio: Force only split mode with protected guest

2021-06-03 Thread Andi Kleen



We do not need an increasing pile of kludges


Do you mean disabling features is a kludge?

If yes I disagree with that characterization.



to make TDX and SEV “secure”.  We need the actual loaded driver to be secure.  
The virtio architecture is full of legacy nonsense,
and there is no good reason for SEV and TDX to be a giant special case.


I don't know where you see a "giant special case". Except for the
limited feature negotiation, all the changes are common, and the
disabling of features (which is not new BTW, but already done e.g. with
forcing the DMA API in some cases) can of course be used by all these
other technologies too. But it just cannot be done by default for
everything because it would break compatibility. So every technology
with such requirements has to explicitly opt in.





As I said before, real PCIe (Thunderbolt/USB-C or anything else) has the exact 
same problem.  The fact that TDX has encrypted memory is, at best, a poor proxy 
for the actual condition.  The actual condition is that the guest does not trust 
the device to implement the virtio protocol correctly.


Right, they can apply similar limitations of feature sets. But again it
cannot be the default.







TDX and SEV use the arch hook to enforce DMA API, so that part is also
solved.


Can you point me to the code you’re referring to?


See 4/8 in this patch kit. It uses an existing hook which is already 
used in tree by s390.
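
To make the reference concrete, here is a rough sketch of what such a gate
looks like in the virtio core; the helper name,
arch_has_restricted_virtio_memory_access(), is my reading of the existing
s390 hook, and this is not the 4/8 patch itself:

	/* Refuse devices that would bypass the DMA API when the platform
	 * says guest memory is not accessible to the host by default.
	 */
	static int virtio_check_restricted_access(struct virtio_device *dev)
	{
		if (arch_has_restricted_virtio_memory_access() &&
		    !virtio_has_feature(dev, VIRTIO_F_ACCESS_PLATFORM)) {
			dev_warn(&dev->dev,
				 "device did not negotiate VIRTIO_F_ACCESS_PLATFORM\n");
			return -ENODEV;
		}
		return 0;
	}

Once VIRTIO_F_ACCESS_PLATFORM is negotiated, vring_use_dma_api() takes the
DMA API path, which is what TDX and SEV rely on.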



-Andi




Re: [PATCH v1 1/8] virtio: Force only split mode with protected guest

2021-06-03 Thread Andy Lutomirski


On Thu, Jun 3, 2021, at 12:53 PM, Andi Kleen wrote:
> 
> > Tell that to every crypto downgrade attack ever.
> 
> That's exactly what this patch addresses.
> 
> >
> > I see two credible solutions:
> >
> > 1. Actually harden the virtio driver.
> That's exactly what this patchkit, and the alternative approaches, like 
> Jason's, are doing.
> >
> > 2. Have a new virtio-modern driver and use it for modern use cases. Maybe 
> > rename the old driver virtio-legacy or virtio-insecure.  They can share 
> > code.
> 
> In most use cases the legacy driver is not insecure because there is no 
> memory protection anyways.
> 
> Yes maybe such a split would be a good idea for maintenance and maybe 
> performance reasons, but at least from the security perspective I don't 
> see any need for it.


Please reread my email.

We do not need an increasing pile of kludges to make TDX and SEV “secure”.  We 
need the actual loaded driver to be secure.  The virtio architecture is full of 
legacy nonsense, and there is no good reason for SEV and TDX to be a giant 
special case.

As I said before, real PCIe (Thunderbolt/USB-C or anything else) has the exact 
same problem.  The fact that TDX has encrypted memory is, at best, a poor proxy 
for the actual condition.  The actual condition is that the guest does not trust 
the device to implement the virtio protocol correctly.

> 
> >
> > Another snag you may hit: virtio’s heuristic for whether to use proper DMA 
> > ops or to bypass them is a giant kludge. I’m very slightly optimistic that 
> > getting the heuristic wrong will make the driver fail to operate but won’t 
> > allow the host to take over the guest, but I’m not really convinced. And I 
> > wrote that code!  A virtio-modern mode probably should not have a 
> > heuristic, and the various iommu-bypassing modes should be fixed to work at 
> > the bus level, not the device level
> 
> TDX and SEV use the arch hook to enforce DMA API, so that part is also 
> solved.
> 

Can you point me to the code you’re referring to?

> 
> -Andi
> 
> 

Re: vhost: multiple worker support

2021-06-03 Thread Mike Christie
On 6/3/21 9:37 AM, Stefan Hajnoczi wrote:
> On Tue, May 25, 2021 at 01:05:51PM -0500, Mike Christie wrote:
>> The following patches apply over linus's tree or mst's vhost branch
>> and my cleanup patchset:
>>
>> https://lists.linuxfoundation.org/pipermail/virtualization/2021-May/054354.html
>>
>> These patches allow us to support multiple vhost workers per device. I
>> ended up just doing Stefan's original idea where userspace has the
>> kernel create a worker and we pass back the pid. This has the benefit
>> over the workqueue and userspace thread approach where we only have
>> one'ish code path in the kernel during setup to detect old tools. The
>> main IO paths and device/vq setup/teardown paths all use common code.
>>
>> The kernel patches here allow us to then do N workers per device and also
>> share workers across devices.
>>
>> I've also included a patch for qemu so you can get an idea of how it
>> works. If we are ok with the kernel code then I'll break that up into
>> a patchset and send to qemu-devel.
> 
> It seems risky to allow userspace process A to "share" a vhost worker
> thread with userspace process B based on a matching pid alone. Should
> they have ptrace_may_access() or similar?
> 

I'm not sure. I already made it a little restrictive in this posting, but
it may not be enough depending on what's possible and what we want to allow.

Right now to share a worker the userspace process doing the
VHOST_SET_VRING_WORKER ioctl has to be the owner. Before we do a
VHOST_SET_VRING_WORKER, vhost_dev_ioctl calls vhost_dev_check_owner,
so we will fail if 2 random processes try to share.

So we can share a worker across a vhost-dev's N virtqueues or share a
worker with multiple vhost devs and their virtqueues, but the devs have
to be managed by the same VM/qemu process. If that's all we want to support,
then is the owner check enough?

If we want to share workers across VMs then I think we definitely want
something like ptrace_may_access.
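
To spell out what the current restriction amounts to, a sketch of the check
(only vhost_dev_check_owner() is real; the other names are hypothetical and
not taken from the posting):

	static int vhost_vq_worker_attach_allowed(struct vhost_dev *dev,
						  struct vhost_worker *worker)
	{
		/* The VHOST_SET_VRING_WORKER caller must own this device ... */
		int err = vhost_dev_check_owner(dev);

		if (err)
			return err;

		/* ... and the worker must have been created by the same owner,
		 * so two unrelated processes never end up sharing one.
		 */
		if (worker->owner_mm != dev->mm)
			return -EPERM;

		return 0;
	}

Sharing across VMs would need something stronger on top of this, e.g. the
ptrace_may_access() style check mentioned above.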


Re: [PATCH v1 1/8] virtio: Force only split mode with protected guest

2021-06-03 Thread Andi Kleen



Tell that to every crypto downgrade attack ever.


That's exactly what this patch addresses.



I see two credible solutions:

1. Actually harden the virtio driver.
That's exactly what this patchkit, and the alternative approaches, like 
Jason's, are doing.


2. Have a new virtio-modern driver and use it for modern use cases. Maybe 
rename the old driver virtio-legacy or virtio-insecure.  They can share code.


In most use cases the legacy driver is not insecure because there is no 
memory protection anyways.


Yes maybe such a split would be a good idea for maintenance and maybe 
performance reasons, but at least from the security perspective I don't 
see any need for it.




Another snag you may hit: virtio’s heuristic for whether to use proper DMA ops 
or to bypass them is a giant kludge. I’m very slightly optimistic that getting 
the heuristic wrong will make the driver fail to operate but won’t allow the 
host to take over the guest, but I’m not really convinced. And I wrote that 
code!  A virtio-modern mode probably should not have a heuristic, and the 
various iommu-bypassing modes should be fixed to work at the bus level, not the 
device level


TDX and SEV use the arch hook to enforce DMA API, so that part is also 
solved.



-Andi


Re: [PATCH v1 1/8] virtio: Force only split mode with protected guest

2021-06-03 Thread Andy Lutomirski


On Thu, Jun 3, 2021, at 11:00 AM, Andi Kleen wrote:
> 
> On 6/3/2021 10:33 AM, Andy Lutomirski wrote:
> > On 6/2/21 5:41 PM, Andi Kleen wrote:
> >> Only allow split mode when in a protected guest. Follow-on
> >> patches harden the split mode code paths, and we don't want
> >> a malicious host to force anything else. Also disallow
> >> indirect mode for similar reasons.
> > I read this as "the virtio driver is buggy.  Let's disable most of the
> > buggy code in one special case in which we need a driver without bugs.
> > In all the other cases (e.g. hardware virtio device connected over
> > USB-C), driver bugs are still allowed."
> 
> My understanding is most of the other modes (except for split with 
> separate descriptors) are obsolete and just there for compatibility. As 
> long as they're deprecated they won't harm anyone.
> 
>

Tell that to every crypto downgrade attack ever.

I see two credible solutions:

1. Actually harden the virtio driver.

2. Have a new virtio-modern driver and use it for modern use cases. Maybe 
rename the old driver virtio-legacy or virtio-insecure.  They can share code.

Another snag you may hit: virtio’s heuristic for whether to use proper DMA ops 
or to bypass them is a giant kludge. I’m very slightly optimistic that getting 
the heuristic wrong will make the driver fail to operate but won’t allow the 
host to take over the guest, but I’m not really convinced. And I wrote that 
code!  A virtio-modern mode probably should not have a heuristic, and the 
various iommu-bypassing modes should be fixed to work at the bus level, not the 
device level.

Re: vhost: multiple worker support

2021-06-03 Thread Mike Christie
On 6/3/21 5:13 AM, Stefan Hajnoczi wrote:
> On Tue, May 25, 2021 at 01:05:51PM -0500, Mike Christie wrote:
>> Results:
>> 
>> When running with the null_blk driver and vhost-scsi I can get 1.2
>> million IOPs by just running a simple
>>
>> fio --filename=/dev/sda --direct=1 --rw=randrw --bs=4k --ioengine=libaio
>> --iodepth=128  --numjobs=8 --time_based --group_reporting --name=iops
>> --runtime=60 --eta-newline=1
>>
>> The VM has 8 vCPUs and sda has 8 virtqueues, and we can do a total of
>> 1024 cmds per device. To get 1.2 million IOPs I did have to tune and
>> run the virsh emulatorpin command so the vhost threads were running
>> on different CPUs than the VM. If the vhost threads share CPUs then I
>> get around 800K.
>>
>> For a more realistic device that is also a CPU hog, like iscsi, I can still
>> get 1 million IOPs using 1 dm-multipath device over 8 iscsi paths
>> (natively it gets 1.1 million IOPs).
> 
> There is no comparison against a baseline, but I guess it would be the
> same 8 vCPU guest with single queue vhost-scsi?
> 

For the iscsi device the max IOPs for the single thread case was around
380K IOPs.

Here are the results with null_blk as the backend device with a 16
vCPU guest to give you a better picture.

fio
numjobs  1    2    4    8    12   16


Current upstream (single thread per vhost-scsi device).
After 8 jobs there was no perf diff.

VQs
1   130k 338k 390k 404k --
2   146k 440k 448k 478k --
4   146k 456k 448k 482k --
8   154k 464k 500k 490k --
12  160k 454k 486k 490k --
16  162k 460k 484k 486k --

thread per VQ:
After 16 jobs there was no perf diff even if I increased
the number of guest vCPUs.
1   same as above
2   166k 320k 542k 664k  558k  658k
4   156k 310k 660k 986k  860k  890k
8   156k 328k 652k 988k  972k  1074k
12  162k 336k 660k 1172k 1190k 1324k
16  162k 332k 664k 1398k 850k  1426k

Note:
- For numjobs > 8, I lowered iodepth so we had a total of 1024
cmds over all jobs.
- virtqueue_size/cmd_per_lun=1024 was used for all tests.
- If I modify vhost-scsi so vhost_scsi_handle_vq queues the
response immediately so we never enter the LIO/block/scsi layers
then I can get around 1.6-1.8M IOPs as the max.
- There are some device wide locks in the LIO main IO path that
we are hitting in these results. We are working on removing them.


Re: [PATCH v1 1/8] virtio: Force only split mode with protected guest

2021-06-03 Thread Andi Kleen



On 6/3/2021 10:33 AM, Andy Lutomirski wrote:

On 6/2/21 5:41 PM, Andi Kleen wrote:

Only allow split mode when in a protected guest. Follow-on
patches harden the split mode code paths, and we don't want
a malicious host to force anything else. Also disallow
indirect mode for similar reasons.

I read this as "the virtio driver is buggy.  Let's disable most of the
buggy code in one special case in which we need a driver without bugs.
In all the other cases (e.g. hardware virtio device connected over
USB-C), driver bugs are still allowed."


My understanding is most of the other modes (except for split with separate 
descriptors) are obsolete and just there for compatibility. As long as they're 
deprecated they won't harm anyone.

-Andi



Re: [PATCH v1 1/8] virtio: Force only split mode with protected guest

2021-06-03 Thread Andy Lutomirski
On 6/2/21 5:41 PM, Andi Kleen wrote:
> Only allow split mode when in a protected guest. Follow-on
> patches harden the split mode code paths, and we don't want
> a malicious host to force anything else. Also disallow
> indirect mode for similar reasons.

I read this as "the virtio driver is buggy.  Let's disable most of the
buggy code in one special case in which we need a driver without bugs.
In all the other cases (e.g. hardware virtio device connected over
USB-C), driver bugs are still allowed."

Can we just fix the driver without special cases?

--Andy


[PATCH net] virtio-net: fix for skb_over_panic inside big mode

2021-06-03 Thread Xuan Zhuo
In virtio-net's large packet mode, there is a hole in the space behind
buf.

hdr_padded_len - hdr_len

We must take this into account when calculating tailroom.
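
An arithmetic illustration of the fix below (the numbers are made up; only
the formula matches the patch):

	/* PAGE_SIZE = 4096, headroom = 256, len = 1500,
	 * hdr_len = 12, hdr_padded_len = 16
	 *
	 * old: tailroom = 4096 - 1500 - 256             = 2340
	 * new: tailroom = 4096 - 1500 - 256 - (16 - 12) = 2336
	 *
	 * The 4 bytes of padding behind the buffer are a hole, so counting
	 * them as tailroom lets skb_put() run past the end of the buffer
	 * and triggers the skb_over_panic shown in the trace below.
	 */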

[   44.544385] skb_put.cold (net/core/skbuff.c:5254 (discriminator 1) 
net/core/skbuff.c:5252 (discriminator 1))
[   44.544864] page_to_skb (drivers/net/virtio_net.c:485) [   44.545361] 
receive_buf (drivers/net/virtio_net.c:849 drivers/net/virtio_net.c:1131)
[   44.545870] ? netif_receive_skb_list_internal (net/core/dev.c:5714)
[   44.546628] ? dev_gro_receive (net/core/dev.c:6103)
[   44.547135] ? napi_complete_done (./include/linux/list.h:35 
net/core/dev.c:5867 net/core/dev.c:5862 net/core/dev.c:6565)
[   44.547672] virtnet_poll (drivers/net/virtio_net.c:1427 
drivers/net/virtio_net.c:1525)
[   44.548251] __napi_poll (net/core/dev.c:6985)
[   44.548744] net_rx_action (net/core/dev.c:7054 net/core/dev.c:7139)
[   44.549264] __do_softirq (./arch/x86/include/asm/jump_label.h:19 
./include/linux/jump_label.h:200 ./include/trace/events/irq.h:142 
kernel/softirq.c:560)
[   44.549762] irq_exit_rcu (kernel/softirq.c:433 kernel/softirq.c:637 
kernel/softirq.c:649)
[   44.551384] common_interrupt (arch/x86/kernel/irq.c:240 (discriminator 13))
[   44.551991] ? asm_common_interrupt (./arch/x86/include/asm/idtentry.h:638)
[   44.552654] asm_common_interrupt (./arch/x86/include/asm/idtentry.h:638)

Fixes: fb32856b16ad ("virtio-net: page_to_skb() use build_skb when there's 
sufficient tailroom")
Signed-off-by: Xuan Zhuo 
Reported-by: Corentin Noël 
Tested-by: Corentin Noël 
---
 drivers/net/virtio_net.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index fa407eb8b457..78a01c71a17c 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -406,7 +406,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 * add_recvbuf_mergeable() + get_mergeable_buf_len()
 */
truesize = headroom ? PAGE_SIZE : truesize;
-   tailroom = truesize - len - headroom;
+   tailroom = truesize - len - headroom - (hdr_padded_len - hdr_len);
buf = p - headroom;
 
len -= hdr_len;
-- 
2.31.0


Re: [PATCH v10 18/18] virtio/vsock: update trace event for SEQPACKET

2021-06-03 Thread Stefano Garzarella

On Thu, May 20, 2021 at 10:20:04PM +0300, Arseny Krasnov wrote:

Add SEQPACKET socket type to vsock trace event.

Signed-off-by: Arseny Krasnov 
---
include/trace/events/vsock_virtio_transport_common.h | 5 -
1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/include/trace/events/vsock_virtio_transport_common.h 
b/include/trace/events/vsock_virtio_transport_common.h
index 6782213778be..b30c0e319b0e 100644
--- a/include/trace/events/vsock_virtio_transport_common.h
+++ b/include/trace/events/vsock_virtio_transport_common.h
@@ -9,9 +9,12 @@
#include 

TRACE_DEFINE_ENUM(VIRTIO_VSOCK_TYPE_STREAM);
+TRACE_DEFINE_ENUM(VIRTIO_VSOCK_TYPE_SEQPACKET);

#define show_type(val) \
-   __print_symbolic(val, { VIRTIO_VSOCK_TYPE_STREAM, "STREAM" })
+   __print_symbolic(val, \
+   { VIRTIO_VSOCK_TYPE_STREAM, "STREAM" }, \
+   { VIRTIO_VSOCK_TYPE_SEQPACKET, "SEQPACKET" })


I think we should fix the indentation here (e.g. following show_op):
 #define show_type(val) \
__print_symbolic(val, \
 { VIRTIO_VSOCK_TYPE_STREAM, "STREAM" }, \
 { VIRTIO_VSOCK_TYPE_SEQPACKET, "SEQPACKET" })

Thanks,
Stefano



TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_INVALID);
TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_REQUEST);
--
2.25.1





Re: [PATCH v10 17/18] vsock_test: add SOCK_SEQPACKET tests

2021-06-03 Thread Stefano Garzarella

On Thu, May 20, 2021 at 10:19:50PM +0300, Arseny Krasnov wrote:

Implement two tests of the SOCK_SEQPACKET socket: the first sends data by
several 'write()'s and checks that the number of 'read()'s is the same.
The second test checks the MSG_TRUNC flag. Cases for connect(), bind(),
etc. are not tested, because they are the same as for the stream socket.

Signed-off-by: Arseny Krasnov 
---
v9 -> v10:
1) Commit message updated.
2) Add second test for message bounds.


This patch LGTM, but I'll review better with the next version, running 
also the test suite on my VMs.


Thanks,
Stefano



tools/testing/vsock/util.c   |  32 +++--
tools/testing/vsock/util.h   |   3 +
tools/testing/vsock/vsock_test.c | 116 +++
3 files changed, 146 insertions(+), 5 deletions(-)

diff --git a/tools/testing/vsock/util.c b/tools/testing/vsock/util.c
index 93cbd6f603f9..2acbb7703c6a 100644
--- a/tools/testing/vsock/util.c
+++ b/tools/testing/vsock/util.c
@@ -84,7 +84,7 @@ void vsock_wait_remote_close(int fd)
}

/* Connect to  and return the file descriptor. */
-int vsock_stream_connect(unsigned int cid, unsigned int port)
+static int vsock_connect(unsigned int cid, unsigned int port, int type)
{
union {
struct sockaddr sa;
@@ -101,7 +101,7 @@ int vsock_stream_connect(unsigned int cid, unsigned int 
port)

control_expectln("LISTENING");

-   fd = socket(AF_VSOCK, SOCK_STREAM, 0);
+   fd = socket(AF_VSOCK, type, 0);

timeout_begin(TIMEOUT);
do {
@@ -120,11 +120,21 @@ int vsock_stream_connect(unsigned int cid, unsigned int 
port)
return fd;
}

+int vsock_stream_connect(unsigned int cid, unsigned int port)
+{
+   return vsock_connect(cid, port, SOCK_STREAM);
+}
+
+int vsock_seqpacket_connect(unsigned int cid, unsigned int port)
+{
+   return vsock_connect(cid, port, SOCK_SEQPACKET);
+}
+
/* Listen on  and return the first incoming connection.  The remote
 * address is stored to clientaddrp.  clientaddrp may be NULL.
 */
-int vsock_stream_accept(unsigned int cid, unsigned int port,
-   struct sockaddr_vm *clientaddrp)
+static int vsock_accept(unsigned int cid, unsigned int port,
+   struct sockaddr_vm *clientaddrp, int type)
{
union {
struct sockaddr sa;
@@ -145,7 +155,7 @@ int vsock_stream_accept(unsigned int cid, unsigned int port,
int client_fd;
int old_errno;

-   fd = socket(AF_VSOCK, SOCK_STREAM, 0);
+   fd = socket(AF_VSOCK, type, 0);

if (bind(fd, &addr.sa, sizeof(addr.svm)) < 0) {
perror("bind");
@@ -189,6 +199,18 @@ int vsock_stream_accept(unsigned int cid, unsigned int 
port,
return client_fd;
}

+int vsock_stream_accept(unsigned int cid, unsigned int port,
+   struct sockaddr_vm *clientaddrp)
+{
+   return vsock_accept(cid, port, clientaddrp, SOCK_STREAM);
+}
+
+int vsock_seqpacket_accept(unsigned int cid, unsigned int port,
+  struct sockaddr_vm *clientaddrp)
+{
+   return vsock_accept(cid, port, clientaddrp, SOCK_SEQPACKET);
+}
+
/* Transmit one byte and check the return value.
 *
 * expected_ret:
diff --git a/tools/testing/vsock/util.h b/tools/testing/vsock/util.h
index e53dd09d26d9..a3375ad2fb7f 100644
--- a/tools/testing/vsock/util.h
+++ b/tools/testing/vsock/util.h
@@ -36,8 +36,11 @@ struct test_case {
void init_signals(void);
unsigned int parse_cid(const char *str);
int vsock_stream_connect(unsigned int cid, unsigned int port);
+int vsock_seqpacket_connect(unsigned int cid, unsigned int port);
int vsock_stream_accept(unsigned int cid, unsigned int port,
struct sockaddr_vm *clientaddrp);
+int vsock_seqpacket_accept(unsigned int cid, unsigned int port,
+  struct sockaddr_vm *clientaddrp);
void vsock_wait_remote_close(int fd);
void send_byte(int fd, int expected_ret, int flags);
void recv_byte(int fd, int expected_ret, int flags);
diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index 5a4fb80fa832..67766bfe176f 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -14,6 +14,8 @@
#include 
#include 
#include 
+#include 
+#include 

#include "timeout.h"
#include "control.h"
@@ -279,6 +281,110 @@ static void test_stream_msg_peek_server(const struct 
test_opts *opts)
close(fd);
}

+#define MESSAGES_CNT 7
+static void test_seqpacket_msg_bounds_client(const struct test_opts *opts)
+{
+   int fd;
+
+   fd = vsock_seqpacket_connect(opts->peer_cid, 1234);
+   if (fd < 0) {
+   perror("connect");
+   exit(EXIT_FAILURE);
+   }
+
+   /* Send several messages, one with MSG_EOR flag */
+   for (int i = 0; i < MESSAGES_CNT; i++)
+   send_byte(fd, 1, 0);
+
+   control_writeln("SENDDONE");
+   close(fd);
+}
+
+static void test_seqpacket_msg_bounds_server(const struct test_opts *opts)
+{
+   int fd;
+   

Re: [PATCH 08/30] mspro: use blk_mq_alloc_disk

2021-06-03 Thread Ulf Hansson
On Wed, 2 Jun 2021 at 08:54, Christoph Hellwig  wrote:
>
> Use the blk_mq_alloc_disk API to simplify the gendisk and request_queue
> allocation.
>
> Signed-off-by: Christoph Hellwig 

Acked-by: Ulf Hansson 

Kind regards
Uffe


> ---
>  drivers/memstick/core/mspro_block.c | 26 +++---
>  1 file changed, 11 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/memstick/core/mspro_block.c 
> b/drivers/memstick/core/mspro_block.c
> index cf7fe0d58ee7..22778d0e24f5 100644
> --- a/drivers/memstick/core/mspro_block.c
> +++ b/drivers/memstick/core/mspro_block.c
> @@ -1205,21 +1205,17 @@ static int mspro_block_init_disk(struct memstick_dev 
> *card)
> if (disk_id < 0)
> return disk_id;
>
> -   msb->disk = alloc_disk(1 << MSPRO_BLOCK_PART_SHIFT);
> -   if (!msb->disk) {
> -   rc = -ENOMEM;
> +   rc = blk_mq_alloc_sq_tag_set(&msb->tag_set, &mspro_mq_ops, 2,
> +BLK_MQ_F_SHOULD_MERGE);
> +   if (rc)
> goto out_release_id;
> -   }
>
> -   msb->queue = blk_mq_init_sq_queue(&msb->tag_set, &mspro_mq_ops, 2,
> -   BLK_MQ_F_SHOULD_MERGE);
> -   if (IS_ERR(msb->queue)) {
> -   rc = PTR_ERR(msb->queue);
> -   msb->queue = NULL;
> -   goto out_put_disk;
> +   msb->disk = blk_mq_alloc_disk(&msb->tag_set, card);
> +   if (IS_ERR(msb->disk)) {
> +   rc = PTR_ERR(msb->disk);
> +   goto out_free_tag_set;
> }
> -
> -   msb->queue->queuedata = card;
> +   msb->queue = msb->disk->queue;
>
> blk_queue_max_hw_sectors(msb->queue, MSPRO_BLOCK_MAX_PAGES);
> blk_queue_max_segments(msb->queue, MSPRO_BLOCK_MAX_SEGS);
> @@ -1228,10 +1224,10 @@ static int mspro_block_init_disk(struct memstick_dev 
> *card)
>
> msb->disk->major = major;
> msb->disk->first_minor = disk_id << MSPRO_BLOCK_PART_SHIFT;
> +   msb->disk->minors = 1 << MSPRO_BLOCK_PART_SHIFT;
> msb->disk->fops = &mspro_block_bdops;
> msb->usage_count = 1;
> msb->disk->private_data = msb;
> -   msb->disk->queue = msb->queue;
>
> sprintf(msb->disk->disk_name, "mspblk%d", disk_id);
>
> @@ -1247,8 +1243,8 @@ static int mspro_block_init_disk(struct memstick_dev 
> *card)
> msb->active = 1;
> return 0;
>
> -out_put_disk:
> -   put_disk(msb->disk);
> +out_free_tag_set:
> +   blk_mq_free_tag_set(&msb->tag_set);
>  out_release_id:
> mutex_lock(&mspro_block_disk_lock);
> idr_remove(&mspro_block_disk_idr, disk_id);
> --
> 2.30.2
>


Re: [PATCH 07/30] ms_block: use blk_mq_alloc_disk

2021-06-03 Thread Ulf Hansson
On Wed, 2 Jun 2021 at 08:54, Christoph Hellwig  wrote:
>
> Use the blk_mq_alloc_disk API to simplify the gendisk and request_queue
> allocation.
>
> Signed-off-by: Christoph Hellwig 

Acked-by: Ulf Hansson 

Kind regards
Uffe


> ---
>  drivers/memstick/core/ms_block.c | 25 ++---
>  1 file changed, 10 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/memstick/core/ms_block.c 
> b/drivers/memstick/core/ms_block.c
> index 0bacf4268f83..dac258d12aca 100644
> --- a/drivers/memstick/core/ms_block.c
> +++ b/drivers/memstick/core/ms_block.c
> @@ -2110,21 +2110,17 @@ static int msb_init_disk(struct memstick_dev *card)
> if (msb->disk_id  < 0)
> return msb->disk_id;
>
> -   msb->disk = alloc_disk(0);
> -   if (!msb->disk) {
> -   rc = -ENOMEM;
> +   rc = blk_mq_alloc_sq_tag_set(&msb->tag_set, &msb_mq_ops, 2,
> +BLK_MQ_F_SHOULD_MERGE);
> +   if (rc)
> goto out_release_id;
> -   }
>
> -   msb->queue = blk_mq_init_sq_queue(&msb->tag_set, &msb_mq_ops, 2,
> -   BLK_MQ_F_SHOULD_MERGE);
> -   if (IS_ERR(msb->queue)) {
> -   rc = PTR_ERR(msb->queue);
> -   msb->queue = NULL;
> -   goto out_put_disk;
> +   msb->disk = blk_mq_alloc_disk(&msb->tag_set, card);
> +   if (IS_ERR(msb->disk)) {
> +   rc = PTR_ERR(msb->disk);
> +   goto out_free_tag_set;
> }
> -
> -   msb->queue->queuedata = card;
> +   msb->queue = msb->disk->queue;
>
> blk_queue_max_hw_sectors(msb->queue, MS_BLOCK_MAX_PAGES);
> blk_queue_max_segments(msb->queue, MS_BLOCK_MAX_SEGS);
> @@ -2135,7 +2131,6 @@ static int msb_init_disk(struct memstick_dev *card)
> sprintf(msb->disk->disk_name, "msblk%d", msb->disk_id);
> msb->disk->fops = &msb_bdops;
> msb->disk->private_data = msb;
> -   msb->disk->queue = msb->queue;
>
> capacity = msb->pages_in_block * msb->logical_block_count;
> capacity *= (msb->page_size / 512);
> @@ -2155,8 +2150,8 @@ static int msb_init_disk(struct memstick_dev *card)
> dbg("Disk added");
> return 0;
>
> -out_put_disk:
> -   put_disk(msb->disk);
> +out_free_tag_set:
> +   blk_mq_free_tag_set(&msb->tag_set);
>  out_release_id:
> mutex_lock(_disk_lock);
> idr_remove(_disk_idr, msb->disk_id);
> --
> 2.30.2
>


Re: [PATCH v10 15/18] vhost/vsock: support SEQPACKET for transport

2021-06-03 Thread Stefano Garzarella

On Thu, May 20, 2021 at 10:19:13PM +0300, Arseny Krasnov wrote:

Please describe the changes included in this patch in more detail in the
first part of the commit message.



As vhost places data into the buffers of the guest's rx queue, keep the
SEQ_EOR bit set only when the last piece of data is copied. Otherwise we
would get several packets of one record in the guest's rx queue, each with
the SEQ_EOR bit set. Also stop ignoring non-stream packet types and handle
the SEQPACKET feature bit.

Signed-off-by: Arseny Krasnov 
---
v9 -> v10:
1) Move 'restore_flag' handling to 'payload_len' calculation
   block.

drivers/vhost/vsock.c | 44 +++
1 file changed, 40 insertions(+), 4 deletions(-)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 5e78fb719602..63d15beaad05 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -31,7 +31,8 @@

enum {
VHOST_VSOCK_FEATURES = VHOST_FEATURES |
-  (1ULL << VIRTIO_F_ACCESS_PLATFORM)
+  (1ULL << VIRTIO_F_ACCESS_PLATFORM) |
+  (1ULL << VIRTIO_VSOCK_F_SEQPACKET)
};

enum {
@@ -56,6 +57,7 @@ struct vhost_vsock {
atomic_t queued_replies;

u32 guest_cid;
+   bool seqpacket_allow;
};

static u32 vhost_transport_get_local_cid(void)
@@ -112,6 +114,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
size_t nbytes;
size_t iov_len, payload_len;
int head;
+   bool restore_flag = false;

spin_lock_bh(&vsock->send_pkt_list_lock);
if (list_empty(&vsock->send_pkt_list)) {
@@ -168,9 +171,15 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
/* If the packet is greater than the space available in the
 * buffer, we split it using multiple buffers.
 */
-   if (payload_len > iov_len - sizeof(pkt->hdr))
+   if (payload_len > iov_len - sizeof(pkt->hdr)) {
payload_len = iov_len - sizeof(pkt->hdr);



Please, add a comment here to explain why we need this.

+			if (le32_to_cpu(pkt->hdr.flags) & 
VIRTIO_VSOCK_SEQ_EOR) {

+   pkt->hdr.flags &= 
~cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
+   restore_flag = true;
+   }
+   }
+
/* Set the correct length in the header */
pkt->hdr.len = cpu_to_le32(payload_len);

@@ -181,6 +190,9 @@ vhost_transport_do_send_pkt(struct vhost_vsock 
*vsock,

break;
}

+   if (restore_flag)
+   pkt->hdr.flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
+


Maybe we can restore the flag only if we are queueing again the same 
packet, I mean in the `if (pkt->off < pkt->len) {` branch below.


What do you think?
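
Something along these lines (context approximated from the hunk above, not
from the real file):

		if (pkt->off < pkt->len) {
			/* The packet was split and stays on the send list,
			 * so put back the EOR bit we masked off for this
			 * chunk; fully sent packets don't need it restored.
			 */
			if (restore_flag)
				pkt->hdr.flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);

			/* existing re-queue logic unchanged */
		}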


nbytes = copy_to_iter(pkt->buf + pkt->off, payload_len,
  &iov_iter);
if (nbytes != payload_len) {
@@ -354,8 +366,7 @@ vhost_vsock_alloc_pkt(struct vhost_virtqueue *vq,
return NULL;
}

-   if (le16_to_cpu(pkt->hdr.type) == VIRTIO_VSOCK_TYPE_STREAM)
-   pkt->len = le32_to_cpu(pkt->hdr.len);
+   pkt->len = le32_to_cpu(pkt->hdr.len);

/* No payload */
if (!pkt->len)
@@ -398,6 +409,8 @@ static bool vhost_vsock_more_replies(struct 
vhost_vsock *vsock)

return val < vq->num;
}

+static bool vhost_transport_seqpacket_allow(u32 remote_cid);
+
static struct virtio_transport vhost_transport = {
.transport = {
.module   = THIS_MODULE,
@@ -424,6 +437,10 @@ static struct virtio_transport vhost_transport = {
.stream_is_active = virtio_transport_stream_is_active,
.stream_allow = virtio_transport_stream_allow,

+   .seqpacket_dequeue= virtio_transport_seqpacket_dequeue,
+   .seqpacket_enqueue= virtio_transport_seqpacket_enqueue,
+   .seqpacket_allow  = vhost_transport_seqpacket_allow,
+
.notify_poll_in   = virtio_transport_notify_poll_in,
.notify_poll_out  = virtio_transport_notify_poll_out,
.notify_recv_init = virtio_transport_notify_recv_init,
@@ -441,6 +458,22 @@ static struct virtio_transport vhost_transport = {
.send_pkt = vhost_transport_send_pkt,
};

+static bool vhost_transport_seqpacket_allow(u32 remote_cid)
+{
+   struct vhost_vsock *vsock;
+   bool seqpacket_allow = false;
+
+   rcu_read_lock();
+   vsock = vhost_vsock_get(remote_cid);
+
+   if (vsock)
+   seqpacket_allow = vsock->seqpacket_allow;
+
+   rcu_read_unlock();
+
+   return seqpacket_allow;
+}
+
static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
{
struct vhost_virtqueue *vq = container_of(work, struct 

Re: [PATCH 0/3] virtio_blk: blk-mq io_poll support

2021-06-03 Thread Stefan Hajnoczi
On Thu, May 20, 2021 at 03:13:02PM +0100, Stefan Hajnoczi wrote:
> This patch series implements blk_mq_ops->poll() so REQ_HIPRI requests can be
> polled. IOPS for 4k and 16k block sizes increases by 5-18% on a virtio-blk
> device with 4 virtqueues backed by an NVMe drive.
> 
> - Benchmark: fio ioengine=pvsync2 numjobs=4 direct=1
> - Guest: 4 vCPUs with one virtio-blk device (4 virtqueues)
> - Disk: Intel Corporation NVMe Datacenter SSD [Optane] [8086:2701]
> - CPU: Intel(R) Xeon(R) Silver 4214 CPU @ 2.20GHz
> 
> rw         bs    hipri=0  hipri=1
> -----------------------------------------
> randread    4k   149,426  170,763  +14%
> randread   16k   118,939  134,269  +12%
> randread   64k    34,886   34,906    0%
> randread  128k    17,655   17,667    0%
> randwrite   4k   138,578  163,600  +18%
> randwrite  16k   102,089  120,950  +18%
> randwrite  64k    32,364   32,561    0%
> randwrite 128k    16,154   16,237    0%
> read        4k   146,032  170,620  +16%
> read       16k   117,097  130,437  +11%
> read       64k    34,834   35,037    0%
> read      128k    17,680   17,658    0%
> write       4k   134,562  151,422  +12%
> write      16k   101,796  107,606   +5%
> write      64k    32,364   32,594    0%
> write     128k    16,259   16,265    0%
> 
> Larger block sizes do not benefit from polling as much but the
> improvement is worthwhile for smaller block sizes.
> 
> Stefan Hajnoczi (3):
>   virtio: add virtioqueue_more_used()
>   virtio_blk: avoid repeating vblk->vqs[qid]
>   virtio_blk: implement blk_mq_ops->poll()
> 
>  include/linux/virtio.h   |   2 +
>  drivers/block/virtio_blk.c   | 126 +--
>  drivers/virtio/virtio_ring.c |  17 +
>  3 files changed, 123 insertions(+), 22 deletions(-)

Christoph and Jens: Any more thoughts on this irq toggling approach?

Stefan



Re: [PATCH 3/3] virtio_blk: implement blk_mq_ops->poll()

2021-06-03 Thread Stefan Hajnoczi
On Thu, May 27, 2021 at 01:48:36PM +0800, Jason Wang wrote:
> 
> On 2021/5/25 4:59 PM, Stefan Hajnoczi wrote:
> > On Tue, May 25, 2021 at 11:21:41AM +0800, Jason Wang wrote:
> > > On 2021/5/20 10:13 PM, Stefan Hajnoczi wrote:
> > > > Request completion latency can be reduced by using polling instead of
> > > > irqs. Even Posted Interrupts or similar hardware support doesn't beat
> > > > polling. The reason is that disabling virtqueue notifications saves
> > > > critical-path CPU cycles on the host by skipping irq injection and in
> > > > the guest by skipping the irq handler. So let's add blk_mq_ops->poll()
> > > > support to virtio_blk.
> > > > 
> > > > The approach taken by this patch differs from the NVMe driver's
> > > > approach. NVMe dedicates hardware queues to polling and submits
> > > > REQ_HIPRI requests only on those queues. This patch does not require
> > > > exclusive polling queues for virtio_blk. Instead, it switches between
> > > > irqs and polling when one or more REQ_HIPRI requests are in flight on a
> > > > virtqueue.
> > > > 
> > > > This is possible because toggling virtqueue notifications is cheap even
> > > > while the virtqueue is running. NVMe cqs can't do this because irqs are
> > > > only enabled/disabled at queue creation time.
> > > > 
> > > > This toggling approach requires no configuration. There is no need to
> > > > dedicate queues ahead of time or to teach users and orchestration tools
> > > > how to set up polling queues.
> > > > 
> > > > Possible drawbacks of this approach:
> > > > 
> > > > - Hardware virtio_blk implementations may find virtqueue_disable_cb()
> > > > expensive since it requires DMA.
> > > 
> > > Note that it's probably not related to the behavior of the driver but the
> > > design of the event suppression mechanism.
> > > 
> > > Device can choose to ignore the suppression flag and keep sending
> > > interrupts.
> > Yes, it's the design of the event suppression mechanism.
> > 
> > If we use dedicated polling virtqueues then the hardware doesn't need to
> > check whether interrupts are enabled for each notification. However,
> > there's no mechanism to tell the device that virtqueue interrupts are
> > permanently disabled. This means that as of today, even dedicated
> > virtqueues cannot suppress interrupts without the device checking for
> > each notification.
> 
> 
> This can be detected via a transport specific way.
> 
> E.g in the case of MSI, VIRTIO_MSI_NO_VECTOR could be a hint.

Nice idea :). Then there would be no need for changes to the hardware
interface. IRQ-less virtqueues could still be mentioned explicitly in
the VIRTIO spec so that driver/device authors are aware of the
VIRTIO_MSI_NO_VECTOR trick.
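
From the driver side the hint could fall out for free: virtqueues requested
without a callback already get VIRTIO_MSI_NO_VECTOR assigned by virtio_pci.
A sketch of what virtio_blk could do (the irq/poll queue split and the
variable names are assumptions, not part of this series):

	/* Queues meant for blk_poll() get no callback and thus no MSI-X
	 * vector; a device can read that as "this vq never needs an irq".
	 */
	for (i = 0; i < num_irq_vqs; i++)
		callbacks[i] = virtblk_done;
	for (; i < num_irq_vqs + num_poll_vqs; i++)
		callbacks[i] = NULL;

	err = virtio_find_vqs(vdev, num_irq_vqs + num_poll_vqs, vqs,
			      callbacks, names, NULL);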

> > > > +static int virtblk_poll(struct blk_mq_hw_ctx *hctx)
> > > > +{
> > > > +   struct virtio_blk *vblk = hctx->queue->queuedata;
> > > > +   struct virtqueue *vq = vblk->vqs[hctx->queue_num].vq;
> > > > +
> > > > +   if (!virtqueue_more_used(vq))
> > > 
> > > I'm not familiar with block polling but what happens if a buffer is made
> > > available after virtqueue_more_used() returns false here?
> > Can you explain the scenario, I'm not sure I understand? "buffer is made
> > available" -> are you thinking about additional requests being submitted
> > by the driver or an in-flight request being marked used by the device?
> 
> 
> Something like that:
> 
> 1) requests are submitted
> 2) poll but virtqueue_more_used() return false
> 3) device make buffer used
> 
> In this case, will poll() be triggered again by somebody else? (I think
> interrupt is disabled here).

Yes. An example blk_poll() user is
fs/block_dev.c:__blkdev_direct_IO_simple():

  qc = submit_bio();
  for (;;) {
  set_current_state(TASK_UNINTERRUPTIBLE);
  if (!READ_ONCE(bio.bi_private))
  break;
  if (!(iocb->ki_flags & IOCB_HIPRI) ||
  !blk_poll(bdev_get_queue(bdev), qc, true))
  blk_io_schedule();
  }

That's the infinite loop. The block layer implements the generic portion
of blk_poll(). blk_poll() calls mq_ops->poll() (virtblk_poll()).

So in general the polling loop will keep iterating, but there are
exceptions:
1. need_resched() causes blk_poll() to return 0 and blk_io_schedule()
   will be called.
2. blk-mq has a fancier io_poll algorithm that estimates I/O time and
   sleeps until the expected completion time to save CPU cycles. I
   haven't looked into detail at this one.

Both these cases affect existing mq_ops->poll() implementations (e.g.
NVMe). What's new in this patch series is that virtio-blk could have
non-polling requests on a virtqueue which now has irqs disabled, so their
completion could be delayed for a long time.

I think there's an easy solution for this: don't disable virtqueue irqs
when there are non-REQ_HIPRI requests in flight. The disadvantage is
that we'll keep irqs disable in more situations so the performance
improvement may not apply in some configurations.
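
A minimal sketch of that easy solution (the per-vq in-flight counters are
invented here for illustration and are not from the posted series):

	/* Called with the vq lock held whenever a request is queued or
	 * completed; only go irq-less while every in-flight request on this
	 * virtqueue is a polled (REQ_HIPRI) one.
	 */
	static void virtblk_update_cb_state(struct virtio_blk_vq *vbq)
	{
		if (vbq->hipri_in_flight && !vbq->irq_in_flight)
			virtqueue_disable_cb(vbq->vq);
		else if (vbq->irq_in_flight)
			virtqueue_enable_cb(vbq->vq);
	}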

Stefan



Re: [PATCH v10 14/18] virtio/vsock: enable SEQPACKET for transport

2021-06-03 Thread Stefano Garzarella

On Thu, May 20, 2021 at 10:18:57PM +0300, Arseny Krasnov wrote:

To make transport work with SOCK_SEQPACKET two updates were
added:


Present tense is better, and you can also mention that we enable it only
if the feature is negotiated with the device.



1) SOCK_SEQPACKET ops for virtio transport and 'seqpacket_allow()'
  callback.
2) Handling of SEQPACKET bit: guest tries to negotiate it with vhost.

Signed-off-by: Arseny Krasnov 
---
v9 -> v10:
1) Use 'virtio_has_feature()' to check feature bit.
2) Move assignment to 'seqpacket_allow' before 'rcu_assign_pointer()'.

net/vmw_vsock/virtio_transport.c | 24 
1 file changed, 24 insertions(+)

diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 2700a63ab095..bc5ee8df723a 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -62,6 +62,7 @@ struct virtio_vsock {
struct virtio_vsock_event event_list[8];

u32 guest_cid;
+   bool seqpacket_allow;
};

static u32 virtio_transport_get_local_cid(void)
@@ -443,6 +444,8 @@ static void virtio_vsock_rx_done(struct virtqueue 
*vq)

queue_work(virtio_vsock_workqueue, &vsock->rx_work);
}

+static bool virtio_transport_seqpacket_allow(u32 remote_cid);
+
static struct virtio_transport virtio_transport = {
.transport = {
.module   = THIS_MODULE,
@@ -469,6 +472,10 @@ static struct virtio_transport virtio_transport = {
.stream_is_active = virtio_transport_stream_is_active,
.stream_allow = virtio_transport_stream_allow,

+		.seqpacket_dequeue= 
virtio_transport_seqpacket_dequeue,

+   .seqpacket_enqueue= virtio_transport_seqpacket_enqueue,
+   .seqpacket_allow  = virtio_transport_seqpacket_allow,
+
.notify_poll_in   = virtio_transport_notify_poll_in,
.notify_poll_out  = virtio_transport_notify_poll_out,
.notify_recv_init = virtio_transport_notify_recv_init,
@@ -485,6 +492,19 @@ static struct virtio_transport virtio_transport = {
.send_pkt = virtio_transport_send_pkt,
};

+static bool virtio_transport_seqpacket_allow(u32 remote_cid)
+{
+   struct virtio_vsock *vsock;
+   bool seqpacket_allow;
+
+   rcu_read_lock();
+   vsock = rcu_dereference(the_virtio_vsock);
+   seqpacket_allow = vsock->seqpacket_allow;
+   rcu_read_unlock();
+
+   return seqpacket_allow;
+}
+
static void virtio_transport_rx_work(struct work_struct *work)
{
struct virtio_vsock *vsock =
@@ -608,6 +628,9 @@ static int virtio_vsock_probe(struct virtio_device *vdev)
vsock->event_run = true;
mutex_unlock(>event_lock);

+   if (virtio_has_feature(vdev, VIRTIO_VSOCK_F_SEQPACKET))
+   vsock->seqpacket_allow = true;
+
vdev->priv = vsock;
rcu_assign_pointer(the_virtio_vsock, vsock);

@@ -695,6 +718,7 @@ static struct virtio_device_id id_table[] = {
};

static unsigned int features[] = {
+   VIRTIO_VSOCK_F_SEQPACKET
};

static struct virtio_driver virtio_vsock_driver = {
--
2.25.1





Re: [PATCH v10 13/18] virtio/vsock: rest of SOCK_SEQPACKET support

2021-06-03 Thread Stefano Garzarella

On Thu, May 20, 2021 at 10:18:37PM +0300, Arseny Krasnov wrote:

Small updates to make SOCK_SEQPACKET work:
1) Send SHUTDOWN on socket close for SEQPACKET type.
2) Set SEQPACKET packet type during send.
3) Set 'VIRTIO_VSOCK_SEQ_EOR' bit in flags for last
  packet of message.

Signed-off-by: Arseny Krasnov 
---
v9 -> v10:
1) Use 'msg_data_left()' instead of direct access to 'msg_hdr'.
2) Commit message updated.
3) Add check for socket type when setting SEQ_EOR bit.

include/linux/virtio_vsock.h|  4 
net/vmw_vsock/virtio_transport_common.c | 18 --
2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index 02acf6e9ae04..7360ab7ea0af 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -80,6 +80,10 @@ virtio_transport_dgram_dequeue(struct vsock_sock *vsk,
   struct msghdr *msg,
   size_t len, int flags);

+int
+virtio_transport_seqpacket_enqueue(struct vsock_sock *vsk,
+  struct msghdr *msg,
+  size_t len);
ssize_t
virtio_transport_seqpacket_dequeue(struct vsock_sock *vsk,
   struct msghdr *msg,
diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index a6f8b0f39775..f7a3281b3eab 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -74,6 +74,11 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info 
*info,
err = memcpy_from_msg(pkt->buf, info->msg, len);
if (err)
goto out;
+
+   if (msg_data_left(info->msg) == 0 &&
+   info->type == VIRTIO_VSOCK_TYPE_SEQPACKET)
+   pkt->hdr.flags = cpu_to_le32(info->flags |
+   VIRTIO_VSOCK_SEQ_EOR);


`pkt->hdr.flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR)` should be enough, 
no?


Stefano


}

trace_virtio_transport_alloc_pkt(src_cid, src_port,
@@ -187,7 +192,7 @@ static int virtio_transport_send_pkt_info(struct vsock_sock 
*vsk,
struct virtio_vsock_pkt *pkt;
u32 pkt_len = info->pkt_len;

-   info->type = VIRTIO_VSOCK_TYPE_STREAM;
+   info->type = virtio_transport_get_type(sk_vsock(vsk));

t_ops = virtio_transport_get_ops(vsk);
if (unlikely(!t_ops))
@@ -478,6 +483,15 @@ virtio_transport_seqpacket_dequeue(struct vsock_sock *vsk,
}
EXPORT_SYMBOL_GPL(virtio_transport_seqpacket_dequeue);

+int
+virtio_transport_seqpacket_enqueue(struct vsock_sock *vsk,
+  struct msghdr *msg,
+  size_t len)
+{
+   return virtio_transport_stream_enqueue(vsk, msg, len);
+}
+EXPORT_SYMBOL_GPL(virtio_transport_seqpacket_enqueue);
+
int
virtio_transport_dgram_dequeue(struct vsock_sock *vsk,
   struct msghdr *msg,
@@ -912,7 +926,7 @@ void virtio_transport_release(struct vsock_sock *vsk)
struct sock *sk = >sk;
bool remove_sock = true;

-   if (sk->sk_type == SOCK_STREAM)
+   if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)
remove_sock = virtio_transport_close(vsk);

if (remove_sock) {
--
2.25.1





Re: [PATCH 3/3] virtio_blk: implement blk_mq_ops->poll()

2021-06-03 Thread Stefan Hajnoczi
On Thu, May 27, 2021 at 10:44:51AM +0800, Ming Lei wrote:
> On Thu, May 20, 2021 at 03:13:05PM +0100, Stefan Hajnoczi wrote:
> > Request completion latency can be reduced by using polling instead of
> > irqs. Even Posted Interrupts or similar hardware support doesn't beat
> > polling. The reason is that disabling virtqueue notifications saves
> > critical-path CPU cycles on the host by skipping irq injection and in
> > the guest by skipping the irq handler. So let's add blk_mq_ops->poll()
> > support to virtio_blk.
> > 
> > The approach taken by this patch differs from the NVMe driver's
> > approach. NVMe dedicates hardware queues to polling and submits
> > REQ_HIPRI requests only on those queues. This patch does not require
> > exclusive polling queues for virtio_blk. Instead, it switches between
> > irqs and polling when one or more REQ_HIPRI requests are in flight on a
> > virtqueue.
> > 
> > This is possible because toggling virtqueue notifications is cheap even
> > while the virtqueue is running. NVMe cqs can't do this because irqs are
> > only enabled/disabled at queue creation time.
> > 
> > This toggling approach requires no configuration. There is no need to
> > dedicate queues ahead of time or to teach users and orchestration tools
> > how to set up polling queues.
> 
> This approach looks good, and very neat thanks per-vq lock.
> 
> BTW, is there any virt-exit saved by disabling vq interrupt? I understand
> there isn't since virt-exit may only be involved in remote completion
> via sending IPI.

This patch doesn't eliminate vmexits. QEMU already has virtqueue polling
code that disables the vq notification (the virtio-pci hardware register
write that causes a vmexit).

However, when both the guest
driver and the emulated device are polling then there are no vmexits or
interrupt injections with this patch.

> > 
> > Possible drawbacks of this approach:
> > 
> > - Hardware virtio_blk implementations may find virtqueue_disable_cb()
> >   expensive since it requires DMA. If such devices become popular then
> 
> You mean the hardware needs to consider ordering between DMA completion and
> the interrupt notification? But it is only disabling notifications; the guest
> just calls virtqueue_get_buf() to see if there is a buffer available, and if
> not, it will be polled again.

Software devices have cheap access to guest RAM for looking at the
virtqueue_disable_cb() state before injecting an irq. Hardware devices
need to perform a DMA transaction to read that state. They have to do
this every time they want to raise an irq because the guest driver may
have changed the value.

I'm not sure if the DMA overhead is acceptable. This problem is not
introduced by this patch, it's a VIRTIO spec design issue.

I was trying to express that dedicated polling queues would avoid the
DMA since the device knows that irqs are never needed for this virtqueue.

> 
> >   the virtio_blk driver could use a similar approach to NVMe when
> >   VIRTIO_F_ACCESS_PLATFORM is detected in the future.
> > 
> > - If a blk_poll() thread is descheduled it not only hurts polling
> >   performance but also delays completion of non-REQ_HIPRI requests on
> >   that virtqueue since vq notifications are disabled.
> > 
> > Performance:
> > 
> > - Benchmark: fio ioengine=pvsync2 numjobs=4 direct=1
> > - Guest: 4 vCPUs with one virtio-blk device (4 virtqueues)
> 
> 4 jobs can use up all 4 vCPUs. Just running a quick fio test with
> 'ioengine=io_uring --numjobs=1' on a single vq, IOPS can be improved
> by ~20% (hipri=1 vs hipri=0) with the 3 patches, and the virtio-blk is
> still backed by an NVMe SSD.

Nice, thank you for sharing the data!

Stefan



Re: [PATCH v10 12/18] virtio/vsock: add SEQPACKET receive logic

2021-06-03 Thread Stefano Garzarella

On Thu, May 20, 2021 at 10:18:21PM +0300, Arseny Krasnov wrote:

Update the current receive logic for SEQPACKET support: perform a check
of packet and socket types on receive (if they mismatch, then reset the
connection).


We also copy the flags. Please double-check your commit messages.



Signed-off-by: Arseny Krasnov 
---
v9 -> v10:
1) Commit message updated.
2) Comment updated.
3) Updated way to to set 'last_pkt' flags.

net/vmw_vsock/virtio_transport_common.c | 30 ++---
1 file changed, 27 insertions(+), 3 deletions(-)

diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index 61349b2ea7fe..a6f8b0f39775 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -165,6 +165,14 @@ void virtio_transport_deliver_tap_pkt(struct 
virtio_vsock_pkt *pkt)
}
EXPORT_SYMBOL_GPL(virtio_transport_deliver_tap_pkt);

+static u16 virtio_transport_get_type(struct sock *sk)
+{
+   if (sk->sk_type == SOCK_STREAM)
+   return VIRTIO_VSOCK_TYPE_STREAM;
+   else
+   return VIRTIO_VSOCK_TYPE_SEQPACKET;
+}
+
/* This function can only be used on connecting/connected sockets,
 * since a socket assigned to a transport is required.
 *
@@ -979,13 +987,17 @@ virtio_transport_recv_enqueue(struct vsock_sock *vsk,
   struct virtio_vsock_pkt, list);

/* If there is space in the last packet queued, we copy the
-* new packet in its buffer.
+* new packet in its buffer(except SEQPACKET case, when we
+* also check that last packet is not last packet of previous
+* record).


It is better to explain why we don't do this for SEQPACKET, something like this:

/* If there is space in the last packet queued, we copy the
 * new packet in its buffer.
 * We avoid this if the last packet queued has
 * VIRTIO_VSOCK_SEQ_EOR set, because it is the delimiter
 * of SEQPACKET record, so `pkt` is the first packet
 * of a new record.
 */


 */
-   if (pkt->len <= last_pkt->buf_len - last_pkt->len) {
+   if ((pkt->len <= last_pkt->buf_len - last_pkt->len) &&
+   !(le32_to_cpu(last_pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR)) 
{
memcpy(last_pkt->buf + last_pkt->len, pkt->buf,
   pkt->len);
last_pkt->len += pkt->len;
free_pkt = true;
+   last_pkt->hdr.flags |= pkt->hdr.flags;
goto out;
}
}
@@ -1151,6 +1163,12 @@ virtio_transport_recv_listen(struct sock *sk, struct 
virtio_vsock_pkt *pkt,
return 0;
}

+static bool virtio_transport_valid_type(u16 type)
+{
+   return (type == VIRTIO_VSOCK_TYPE_STREAM) ||
+  (type == VIRTIO_VSOCK_TYPE_SEQPACKET);
+}
+
/* We are under the virtio-vsock's vsock->rx_lock or vhost-vsock's vq->mutex
 * lock.
 */
@@ -1176,7 +1194,7 @@ void virtio_transport_recv_pkt(struct virtio_transport *t,
le32_to_cpu(pkt->hdr.buf_alloc),
le32_to_cpu(pkt->hdr.fwd_cnt));

-   if (le16_to_cpu(pkt->hdr.type) != VIRTIO_VSOCK_TYPE_STREAM) {
+   if (!virtio_transport_valid_type(le16_to_cpu(pkt->hdr.type))) {
(void)virtio_transport_reset_no_sock(t, pkt);
goto free_pkt;
}
@@ -1193,6 +1211,12 @@ void virtio_transport_recv_pkt(struct virtio_transport 
*t,
}
}

+   if (virtio_transport_get_type(sk) != le16_to_cpu(pkt->hdr.type)) {
+   (void)virtio_transport_reset_no_sock(t, pkt);
+   sock_put(sk);
+   goto free_pkt;
+   }
+
vsk = vsock_sk(sk);

lock_sock(sk);
--
2.25.1



The rest LGTM.

Stefano



[CFP LPC 2021] Confidential Computing Microconference

2021-06-03 Thread Joerg Roedel
Hi,

I am pleased to announce that the

Confidential Computing Microconference[1]

has been accepted at this year's Linux Plumbers Conference.

In this microconference we will discuss how Linux can support encryption
technologies which protect data during processing on the CPU. Examples
are AMD SEV, Intel TDX, IBM Secure Execution for s390x and ARM Secure
Virtualization.

Suggested Topics are:

- Live Migration of Confidential VMs
- Lazy Memory Validation
- APIC emulation/interrupt management
- Debug Support for Confidential VMs
- Required Memory Management changes for memory validation
- Safe Kernel entry for TDX and SEV exceptions
- Requirements for Confidential Containers
- Trusted Device Drivers Framework and driver fuzzing
- Remote Attestation

Please submit your proposals on the LPC website at:

https://www.linuxplumbersconf.org/event/11/abstracts/#submit-abstract

Make sure to select "Confidential Computing MC" in the Track pulldown
menu.

Looking forward to seeing you all there! :)

Thanks,

Joerg Roedel

[1] 
https://www.linuxplumbersconf.org/blog/2021/index.php/2021/05/14/confidential-computing-microconference-accepted-into-2021-linux-plumbers-conference/


Re: [PATCH v10 11/18] virtio/vsock: dequeue callback for SOCK_SEQPACKET

2021-06-03 Thread Stefano Garzarella

On Thu, May 20, 2021 at 10:17:58PM +0300, Arseny Krasnov wrote:

The callback fetches RW packets from the socket's rx queue until the whole
record is copied (if the user's buffer is full, the user is not woken up).
This is done to avoid stalling the sender: if we woke the user up and it left
the syscall, nobody would send a credit update for the rest of the record,
and the sender would wait for the next entry into the read syscall on the
receiver's side. So if the user buffer is full, we just send a credit update
and drop the data.

Signed-off-by: Arseny Krasnov 
---
v9 -> v10:
1) Number of dequeued bytes incremented even in case when
   user's buffer is full.
2) Use 'msg_data_left()' instead of direct access to 'msg_hdr'.
3) Rename variable 'err' to 'dequeued_len', in case of error
   it has negative value.

include/linux/virtio_vsock.h|  5 ++
net/vmw_vsock/virtio_transport_common.c | 65 +
2 files changed, 70 insertions(+)

diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index dc636b727179..02acf6e9ae04 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -80,6 +80,11 @@ virtio_transport_dgram_dequeue(struct vsock_sock *vsk,
   struct msghdr *msg,
   size_t len, int flags);

+ssize_t
+virtio_transport_seqpacket_dequeue(struct vsock_sock *vsk,
+  struct msghdr *msg,
+  int flags,
+  bool *msg_ready);
s64 virtio_transport_stream_has_data(struct vsock_sock *vsk);
s64 virtio_transport_stream_has_space(struct vsock_sock *vsk);

diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index ad0d34d41444..61349b2ea7fe 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -393,6 +393,59 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
return err;
}

+static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk,
+struct msghdr *msg,
+int flags,
+bool *msg_ready)
+{
+   struct virtio_vsock_sock *vvs = vsk->trans;
+   struct virtio_vsock_pkt *pkt;
+   int dequeued_len = 0;
+   size_t user_buf_len = msg_data_left(msg);
+
+   *msg_ready = false;
+   spin_lock_bh(&vvs->rx_lock);
+
+   while (!*msg_ready && !list_empty(&vvs->rx_queue) && dequeued_len >= 0) 
{


I'


+   size_t bytes_to_copy;
+   size_t pkt_len;
+
+   pkt = list_first_entry(&vvs->rx_queue, struct virtio_vsock_pkt, 
list);
+   pkt_len = (size_t)le32_to_cpu(pkt->hdr.len);
+   bytes_to_copy = min(user_buf_len, pkt_len);
+
+   if (bytes_to_copy) {
+   /* sk_lock is held by caller so no one else can dequeue.
+* Unlock rx_lock since memcpy_to_msg() may sleep.
+*/
+   spin_unlock_bh(&vvs->rx_lock);
+
+   if (memcpy_to_msg(msg, pkt->buf, bytes_to_copy))
+   dequeued_len = -EINVAL;


I think it is better here to return the error returned by memcpy_to_msg(),
as we do in the other place where we use memcpy_to_msg().


I mean something like this:
err = memcpy_to_msg(msg, pkt->buf, bytes_to_copy);
if (err)
dequeued_len = err;


+   else
+   user_buf_len -= bytes_to_copy;
+
+   spin_lock_bh(&vvs->rx_lock);
+   }
+


Maybe here we can simply break out of the loop if we have an error:
if (dequeued_len < 0)
break;

Or we can refactor a bit, simplifying the while() condition and also the 
code in this way (not tested):


while (!*msg_ready && !list_empty(>rx_queue)) {
...

if (bytes_to_copy) {
int err;

/* ...
*/
spin_unlock_bh(&vvs->rx_lock);
err = memcpy_to_msg(msg, pkt->buf, bytes_to_copy);
if (err) {
dequeued_len = err;
goto out;
}
spin_lock_bh(&vvs->rx_lock);

user_buf_len -= bytes_to_copy;
}

dequeued_len += pkt_len;

if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR)
*msg_ready = true;

virtio_transport_dec_rx_pkt(vvs, pkt);
list_del(>list);
virtio_transport_free_pkt(pkt);
}

out:
spin_unlock_bh(&vvs->rx_lock);

virtio_transport_send_credit_update(vsk);

return dequeued_len;
}


Re: vhost: multiple worker support

2021-06-03 Thread Stefan Hajnoczi
On Tue, May 25, 2021 at 01:05:51PM -0500, Mike Christie wrote:
> The following patches apply over linus's tree or mst's vhost branch
> and my cleanup patchset:
> 
> https://lists.linuxfoundation.org/pipermail/virtualization/2021-May/054354.html
> 
> These patches allow us to support multiple vhost workers per device. I
> ended up just doing Stefan's original idea where userspace has the
> kernel create a worker and we pass back the pid. This has the benefit
> over the workqueue and userspace thread approach where we only have
> one'ish code path in the kernel during setup to detect old tools. The
> main IO paths and device/vq setup/teardown paths all use common code.
> 
> The kernel patches here allow us to then do N workers per device and also
> share workers across devices.
> 
> I've also included a patch for qemu so you can get an idea of how it
> works. If we are ok with the kernel code then I'll break that up into
> a patchset and send to qemu-devel.

It seems risky to allow userspace process A to "share" a vhost worker
thread with userspace process B based on a matching pid alone. Should
they have ptrace_may_access() or similar?

For example, two competing users could sabotage each other by running
lots of work items on their competitor's vhost worker thread.
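
A minimal sketch of how such a check could slot into vhost_worker_find()
from patch 7/9 (the ptrace_may_access() gate is only a suggestion, not
part of the posted series):

static struct vhost_worker *vhost_worker_find(struct vhost_dev *dev, pid_t pid)
{
	struct vhost_worker *worker, *found_worker = NULL;

	spin_lock(&vhost_workers_lock);
	hash_for_each_possible(vhost_workers, worker, h_node, pid) {
		if (worker->task->pid != pid)
			continue;
		/* Hypothetical: only share with workers the caller could ptrace. */
		if (!ptrace_may_access(worker->task, PTRACE_MODE_ATTACH_REALCREDS))
			break;
		found_worker = worker;
		refcount_inc(&worker->refcount);
		break;
	}
	spin_unlock(&vhost_workers_lock);
	return found_worker;
}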


signature.asc
Description: PGP signature
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

Re: [PATCH 9/9] vhost: support sharing workers across devs

2021-06-03 Thread Stefan Hajnoczi
On Tue, May 25, 2021 at 01:06:00PM -0500, Mike Christie wrote:
> This allows a worker to handle multiple device's vqs.
> 
> TODO:
> - The worker is attached to the cgroup of the device that created it. In
> this patch you can share workers with devices with different owners which
> could be in different cgroups. Do we want to restrict sharing workers with
> devices that have the same owner (dev->mm value)?

Question for Michael or Jason.


signature.asc
Description: PGP signature
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

Re: [PATCH 8/9] vhost: add vhost_dev pointer to vhost_work

2021-06-03 Thread Stefan Hajnoczi
On Tue, May 25, 2021 at 01:05:59PM -0500, Mike Christie wrote:
> The next patch allows a vhost_worker to handle different devices. To
> prepare for that, this patch adds a pointer to the device on the work so
> we can get to the different mms in the vhost_worker thread.
> 
> Signed-off-by: Mike Christie 
> ---
>  drivers/vhost/scsi.c  |  7 ---
>  drivers/vhost/vhost.c | 24 ++--
>  drivers/vhost/vhost.h |  4 +++-
>  drivers/vhost/vsock.c |  3 ++-
>  4 files changed, 23 insertions(+), 15 deletions(-)

Reviewed-by: Stefan Hajnoczi 


signature.asc
Description: PGP signature
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

Re: [PATCH 7/9] vhost: allow userspace to create workers

2021-06-03 Thread Stefan Hajnoczi
On Tue, May 25, 2021 at 01:05:58PM -0500, Mike Christie wrote:
> This patch allows userspace to create workers and bind them to vqs, so you
> can have N workers per dev and also share N workers with M vqs. The next
> patch will allow sharing across devices.
> 
> Signed-off-by: Mike Christie 
> ---
>  drivers/vhost/vhost.c| 94 +++-
>  drivers/vhost/vhost.h|  3 +
>  include/uapi/linux/vhost.h   |  6 ++
>  include/uapi/linux/vhost_types.h | 12 
>  4 files changed, 113 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 345ade0af133..981e9bac7a31 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -30,6 +30,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include "vhost.h"
>  
> @@ -42,6 +43,9 @@ module_param(max_iotlb_entries, int, 0444);
>  MODULE_PARM_DESC(max_iotlb_entries,
>   "Maximum number of iotlb entries. (default: 2048)");
>  
> +static DEFINE_HASHTABLE(vhost_workers, 5);
> +static DEFINE_SPINLOCK(vhost_workers_lock);
> +
>  enum {
>   VHOST_MEMORY_F_LOG = 0x1,
>  };
> @@ -617,8 +621,17 @@ static void vhost_detach_mm(struct vhost_dev *dev)
>   dev->mm = NULL;
>  }
>  
> -static void vhost_worker_free(struct vhost_worker *worker)
> +static void vhost_worker_put(struct vhost_worker *worker)
>  {
> + spin_lock(&vhost_workers_lock);
> + if (!refcount_dec_and_test(&worker->refcount)) {
> + spin_unlock(&vhost_workers_lock);
> + return;
> + }
> +
> + hash_del(&worker->h_node);
> + spin_unlock(&vhost_workers_lock);
> +
>   WARN_ON(!llist_empty(&worker->work_list));
>   kthread_stop(worker->task);
>   kfree(worker);
> @@ -632,7 +645,7 @@ static void vhost_workers_free(struct vhost_dev *dev)
>   return;
>  
>   for (i = 0; i < dev->num_workers; i++)
> - vhost_worker_free(dev->workers[i]);
> + vhost_worker_put(dev->workers[i]);
>  
>   kfree(dev->workers);
>   dev->num_workers = 0;
> @@ -652,6 +665,8 @@ static struct vhost_worker *vhost_worker_create(struct 
> vhost_dev *dev)
>   worker->id = dev->num_workers;
>   worker->dev = dev;
> + init_llist_head(&worker->work_list);
> + INIT_HLIST_NODE(&worker->h_node);
> + refcount_set(&worker->refcount, 1);
>  
>   task = kthread_create(vhost_worker, worker, "vhost-%d", current->pid);
>   if (IS_ERR(task))
> @@ -664,6 +679,9 @@ static struct vhost_worker *vhost_worker_create(struct 
> vhost_dev *dev)
>   if (ret)
>   goto stop_worker;
>  
> + spin_lock(&vhost_workers_lock);
> + hash_add(vhost_workers, &worker->h_node, worker->task->pid);
> + spin_unlock(&vhost_workers_lock);
>   return worker;
>  
>  stop_worker:
> @@ -673,6 +691,67 @@ static struct vhost_worker *vhost_worker_create(struct 
> vhost_dev *dev)
>   return NULL;
>  }
>  
> +static struct vhost_worker *vhost_worker_find(struct vhost_dev *dev, pid_t 
> pid)
> +{
> + struct vhost_worker *worker, *found_worker = NULL;
> +
> + spin_lock(&vhost_workers_lock);
> + hash_for_each_possible(vhost_workers, worker, h_node, pid) {
> + if (worker->task->pid == pid) {
> + /* tmp - next patch allows sharing across devs */
> + if (worker->dev != dev)
> + break;
> +
> + found_worker = worker;
> + refcount_inc(&worker->refcount);
> + break;
> + }
> + }
> + spin_unlock(&vhost_workers_lock);
> + return found_worker;
> +}
> +
> +/* Caller must have device mutex */
> +static int vhost_vq_set_worker(struct vhost_virtqueue *vq,
> +struct vhost_vring_worker *info)
> +{
> + struct vhost_dev *dev = vq->dev;
> + struct vhost_worker *worker;
> +
> + if (vq->worker) {
> + /* TODO - support changing while works are running */
> + return -EBUSY;
> + }
> +
> + if (info->pid == VHOST_VRING_NEW_WORKER) {
> + worker = vhost_worker_create(dev);

The maximum number of kthreads created is limited by
vhost_dev_init(nvqs)? For example VHOST_SCSI_MAX_VQ 128.

IIUC kthread_create is not limited by or accounted against the current
task, so I'm a little worried that a process can create a lot of
kthreads.

I haven't investigated other kthread_create() users reachable from
userspace applications to see how they bound the number of threads
effectively.

Any thoughts?
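
One way to bound this, sketched under the assumption that more workers
than virtqueues is never useful (this cap is not something the posted
patch does):

	if (info->pid == VHOST_VRING_NEW_WORKER) {
		/* Assumed cap: at most one dedicated worker per vq. */
		if (dev->num_workers >= dev->nvqs)
			return -EINVAL;

		worker = vhost_worker_create(dev);
		if (!worker)
			return -ENOMEM;
		...
	}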

> + if (!worker)
> + return -ENOMEM;
> +
> + info->pid = worker->task->pid;
> + } else {
> + worker = vhost_worker_find(dev, info->pid);
> + if (!worker)
> + return -ENODEV;
> + }
> +
> + if (!dev->workers) {
> + dev->workers = kcalloc(vq->dev->nvqs,
> +sizeof(struct vhost_worker *),
> +GFP_KERNEL);

Another candidate for 

Re: [PATCH 6/9] vhost-scsi: make SCSI cmd completion per vq

2021-06-03 Thread Stefan Hajnoczi
On Tue, May 25, 2021 at 01:05:57PM -0500, Mike Christie wrote:
> This patch separates the scsi cmd completion code paths so we can complete
> cmds based on their vq instead of having all cmds complete on the same
> worker/CPU. This will be useful with the next patch that allows us to
> create multiple worker threads and bind them to different vqs, so we can
> have completions running on different threads/CPUs.
> 
> Signed-off-by: Mike Christie 
> ---
>  drivers/vhost/scsi.c | 48 +++-
>  1 file changed, 25 insertions(+), 23 deletions(-)

Reviewed-by: Stefan Hajnoczi 


signature.asc
Description: PGP signature
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

Re: [PATCH v1 1/8] virtio: Force only split mode with protected guest

2021-06-03 Thread Andi Kleen



Ok, but what I meant is this: if we don't read from the descriptor 
ring, and validate all the other metadata supplied by the device (used 
id and len), then there should be no way for the device to suppress 
the dma flags to write to the indirect descriptor table.


Or do you have an example how it can do that?


I don't. If you can validate everything it's probably ok

The only drawback is even more code to audit and test.

-Andi


___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH 5/9] vhost-scsi: flush IO vqs then send TMF rsp

2021-06-03 Thread Stefan Hajnoczi
On Tue, May 25, 2021 at 01:05:56PM -0500, Mike Christie wrote:
> With one worker we will always send the scsi cmd responses then send the
> TMF rsp, because LIO will always complete the scsi cmds first which
> calls vhost_scsi_release_cmd to add them to the work queue.
> 
> When the next patches adds multiple worker support, the worker threads
> could still be sending their responses when the tmf's work is run.
> So this patch has vhost-scsi flush the IO vqs on other worker threads
> before we send the tmf response.
> 
> Signed-off-by: Mike Christie 
> ---
>  drivers/vhost/scsi.c  | 17 ++---
>  drivers/vhost/vhost.c |  6 ++
>  drivers/vhost/vhost.h |  1 +
>  3 files changed, 21 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
> index 46f897e41217..e585f2180457 100644
> --- a/drivers/vhost/scsi.c
> +++ b/drivers/vhost/scsi.c
> @@ -1168,12 +1168,23 @@ static void vhost_scsi_tmf_resp_work(struct 
> vhost_work *work)
>  {
>   struct vhost_scsi_tmf *tmf = container_of(work, struct vhost_scsi_tmf,
> vwork);
> - int resp_code;
> + int resp_code, i;
> +
> + if (tmf->scsi_resp == TMR_FUNCTION_COMPLETE) {
> + /*
> +  * When processing a TMF lio completes the cmds then the TMF,
> +  * so with one worker the TMF always completes after cmds. For
> +  * multiple worker support we must flush the IO vqs to make
> +  * sure if they have differrent workers then the cmds have

s/differrent/different/

Reviewed-by: Stefan Hajnoczi 


signature.asc
Description: PGP signature
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

Re: [PATCH 4/9] vhost: allow vhost_polls to use different vhost_workers

2021-06-03 Thread Stefan Hajnoczi
On Tue, May 25, 2021 at 01:05:55PM -0500, Mike Christie wrote:
> This allows vhost_polls to use the worker of the vq the poll is associated
> with.
> 
> This also exports a helper vhost_vq_work_queue which is used here
> internally, and will be used in the vhost-scsi patches.
> 
> Signed-off-by: Mike Christie 
> ---
>  drivers/vhost/net.c   |  6 --
>  drivers/vhost/vhost.c | 19 ---
>  drivers/vhost/vhost.h |  6 +-
>  3 files changed, 25 insertions(+), 6 deletions(-)

Reviewed-by: Stefan Hajnoczi 


signature.asc
Description: PGP signature
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

[PATCH v2 1/2] kexec: Allow architecture code to opt-out at runtime

2021-06-03 Thread Joerg Roedel
From: Joerg Roedel 

Allow a runtime opt-out of kexec support for architecture code in case
the kernel is running in an environment where kexec is not properly
supported yet.

This will be used on x86 when the kernel is running as an SEV-ES
guest. SEV-ES guests need special handling for kexec to hand over all
CPUs to the new kernel. This requires special hypervisor support and
handling code in the guest which is not yet implemented.

Cc: sta...@vger.kernel.org # v5.10+
Signed-off-by: Joerg Roedel 
---
 include/linux/kexec.h |  1 +
 kernel/kexec.c| 14 ++
 kernel/kexec_file.c   |  9 +
 3 files changed, 24 insertions(+)

diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 0c994ae37729..85c30dcd0bdc 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -201,6 +201,7 @@ int arch_kexec_kernel_verify_sig(struct kimage *image, void 
*buf,
 unsigned long buf_len);
 #endif
 int arch_kexec_locate_mem_hole(struct kexec_buf *kbuf);
+bool arch_kexec_supported(void);
 
 extern int kexec_add_buffer(struct kexec_buf *kbuf);
 int kexec_locate_mem_hole(struct kexec_buf *kbuf);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index c82c6c06f051..d03134160458 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -195,11 +195,25 @@ static int do_kexec_load(unsigned long entry, unsigned 
long nr_segments,
  * that to happen you need to do that yourself.
  */
 
+bool __weak arch_kexec_supported(void)
+{
+   return true;
+}
+
 static inline int kexec_load_check(unsigned long nr_segments,
   unsigned long flags)
 {
int result;
 
+   /*
+* The architecture may support kexec in general, but the kernel could
+* run in an environment where it is not (yet) possible to execute a new
+* kernel. Allow the architecture code to opt-out of kexec support when
+* it is running in such an environment.
+*/
+   if (!arch_kexec_supported())
+   return -ENOSYS;
+
/* We only trust the superuser with rebooting the system. */
if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
return -EPERM;
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 33400ff051a8..96d08a512e9c 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -358,6 +358,15 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, 
initrd_fd,
int ret = 0, i;
struct kimage **dest_image, *image;
 
+   /*
+* The architecture may support kexec in general, but the kernel could
+* run in an environment where it is not (yet) possible to execute a new
+* kernel. Allow the architecture code to opt-out of kexec support when
+* it is running in such an environment.
+*/
+   if (!arch_kexec_supported())
+   return -ENOSYS;
+
/* We only trust the superuser with rebooting the system. */
if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
return -EPERM;
-- 
2.31.1

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[PATCH v2 2/2] x86/kexec/64: Forbid kexec when running as an SEV-ES guest

2021-06-03 Thread Joerg Roedel
From: Joerg Roedel 

For now, kexec is not supported when running as an SEV-ES guest. Doing
so requires additional hypervisor support and special code to hand
over the CPUs to the new kernel in a safe way.

Until this is implemented, do not support kexec in SEV-ES guests.

Cc: sta...@vger.kernel.org # v5.10+
Signed-off-by: Joerg Roedel 
---
 arch/x86/kernel/machine_kexec_64.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/arch/x86/kernel/machine_kexec_64.c 
b/arch/x86/kernel/machine_kexec_64.c
index c078b0d3ab0e..f902cc9cc634 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -620,3 +620,11 @@ void arch_kexec_pre_free_pages(void *vaddr, unsigned int 
pages)
 */
set_memory_encrypted((unsigned long)vaddr, pages);
 }
+
+/*
+ * Kexec is not supported in SEV-ES guests yet
+ */
+bool arch_kexec_supported(void)
+{
+   return !sev_es_active();
+}
-- 
2.31.1

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[PATCH v2 0/2] x86: Disable kexec for SEV-ES guests

2021-06-03 Thread Joerg Roedel
From: Joerg Roedel 

Changes v1->v2:

- Rebased to v5.13-rc4
- Add the check also to the kexec_file_load system call

Original cover letter:

Hi,

two small patches to disable kexec on x86 when running as an SEV-ES
guest. Trying to kexec a new kernel would fail anyway because there is
no mechanism yet to hand over the APs from the old to the new kernel.
Supporting this needs changes in the Hypervisor and the guest kernel
as well.

This code is currently being work on, but disable kexec in SEV-ES
guests until it is ready.

Please review.

Regards,

Joerg

Joerg Roedel (2):
  kexec: Allow architecture code to opt-out at runtime
  x86/kexec/64: Forbid kexec when running as an SEV-ES guest

 arch/x86/kernel/machine_kexec_64.c |  8 
 include/linux/kexec.h  |  1 +
 kernel/kexec.c | 14 ++
 kernel/kexec_file.c|  9 +
 4 files changed, 32 insertions(+)

-- 
2.31.1

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v1 6/8] dma: Add return value to dma_unmap_page

2021-06-03 Thread Andi Kleen




What it looks like to me is abusing SWIOTLB's internal housekeeping to 
keep track of virtio-specific state. The DMA API does not attempt to 
validate calls in general since in many cases the additional overhead 
would be prohibitive. It has always been callers' responsibility to 
keep track of what they mapped and make sure sync/unmap calls match, 
and there are many, many, subtle and not-so-subtle ways for things to 
go wrong if they don't. If virtio is not doing a good enough job of 
that, what's the justification for making it the DMA API's problem?


In this case it's not prohibitive at all. Just adding a few error 
returns, and checking the overlap (which seems to have been already 
solved anyways). I would argue the error returns are good practice 
anyways, so that API users can check that something bad is happening and 
abort.  The DMA API was never very good at proper error handling, but 
there's no reason at all to continue being bad at it forever.


AFAIK the rest just works anyways, so it's not really a new problem to 
be solved.





A new callback is used to avoid changing all the IOMMU drivers.


Nit: presumably by "IOMMU drivers" you actually mean arch DMA API 
backends?

Yes


 Furthermore, AFAICS it's still not going to help against exfiltrating 
guest memory by over-unmapping the original SWIOTLB slot *without* 
going past the end of the whole buffer,


That would be just exfiltrating data that is already shared, unless I'm 
misunderstanding you.


-Andi


___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

Re: [PATCH 3/9] vhost: modify internal functions to take a vhost_worker

2021-06-03 Thread Stefan Hajnoczi
On Tue, May 25, 2021 at 01:05:54PM -0500, Mike Christie wrote:
> -void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
> +static void vhost_work_queue_on(struct vhost_work *work,
> + struct vhost_worker *worker)
>  {
> - if (!dev->worker)
> - return;
> -
>   if (!test_and_set_bit(VHOST_WORK_QUEUED, &work->flags)) {
>   /* We can only add the work to the list after we're
>* sure it was not in the list.
>* test_and_set_bit() implies a memory barrier.
>*/
> - llist_add(&work->node, &dev->worker->work_list);
> - wake_up_process(dev->worker->task);
> + llist_add(&work->node, &worker->work_list);
> + wake_up_process(worker->task);
>   }
>  }
> +
> +void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)

When should this function still be used? A doc comment contrasting it to
vhost_work_queue_on() would be helpful. I would expect callers to switch
to that instead of queuing work on dev->workers[0].
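
Something along these lines, assuming dev->workers[0] stays the default
worker in this series (illustrative only):

/*
 * vhost_work_queue - queue work on the device's default worker.
 *
 * Legacy entry point for callers that are not yet virtqueue-aware; new
 * code should prefer vhost_work_queue_on() so the work runs on the worker
 * bound to the relevant virtqueue.
 */
void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
{
	if (!dev->workers)
		return;
	vhost_work_queue_on(work, dev->workers[0]);
}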

>  /* A lockless hint for busy polling code to exit the loop */
>  bool vhost_has_work(struct vhost_dev *dev)
>  {
> - return dev->worker && !llist_empty(&dev->worker->work_list);
> + int i;
> +
> + for (i = 0; i < dev->num_workers; i++) {
> + if (!llist_empty(&dev->workers[i]->work_list))
> + return true;
> + }
> +
> + return false;
>  }
>  EXPORT_SYMBOL_GPL(vhost_has_work);

It's probably not necessary to poll all workers:

drivers/vhost/net.c calls vhost_has_work() to busy poll a specific
virtqueue. If the vq:worker mapping is 1:1 or N:1 then vhost_has_work()
should be extended to include the struct vhost_virtqueue so we can poll
just that vq worker's work_list.
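
For example (assuming the vq->worker pointer introduced elsewhere in this
series):

/* Sketch: lockless hint scoped to one virtqueue's worker. */
bool vhost_vq_has_work(struct vhost_virtqueue *vq)
{
	return vq->worker && !llist_empty(&vq->worker->work_list);
}
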
>  /* Caller must have device mutex */
>  static int vhost_worker_try_create_def(struct vhost_dev *dev)
>  {
> - if (!dev->use_worker || dev->worker)
> + struct vhost_worker *worker;
> +
> + if (!dev->use_worker || dev->workers)
>   return 0;
>  
> - return vhost_worker_create(dev);
> + dev->workers = kcalloc(1, sizeof(struct vhost_worker *), GFP_KERNEL);

GFP_KERNEL_ACCOUNT so that vhost memory associated with a process (the
one that invoked the ioctl) is accounted? This may get trickier if the
workers are shared between processes.

The same applies for struct vhost_worker in vhost_worker_create().
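
I.e. something like (illustrative):

	dev->workers = kcalloc(vq->dev->nvqs, sizeof(struct vhost_worker *),
			       GFP_KERNEL_ACCOUNT);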


signature.asc
Description: PGP signature
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

Re: [PATCH 2/9] vhost: move vhost worker creation to kick setup

2021-06-03 Thread Stefan Hajnoczi
On Tue, May 25, 2021 at 01:05:53PM -0500, Mike Christie wrote:
> The next patch will add new ioctls that allows userspace to create workers
> and bind them to devs and vqs after VHOST_SET_OWNER. To support older
> tools, newer tools that want to go wild with worker threads, and newer
> tools that want the old/default behavior this patch moves the default
> worker setup to the kick setup.
> 
> After the first vq's kick/poll setup is done we could start to get works
> queued, so this is the point when we must have a worker setup. If we are
> using older tools or the newer tools just want the default single vhost
> thread per dev behavior then it will automatically be done here. If the
> tools are using the newer ioctls that have already created the workers
> then we also detect that here and do nothing.
> 
> Signed-off-by: Mike Christie 
> ---
>  drivers/vhost/vhost.c | 23 ---
>  1 file changed, 16 insertions(+), 7 deletions(-)

Reviewed-by: Stefan Hajnoczi 


signature.asc
Description: PGP signature
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

Re: [PATCH 1/9] vhost: move worker thread fields to new struct

2021-06-03 Thread Stefan Hajnoczi
On Tue, May 25, 2021 at 01:05:52PM -0500, Mike Christie wrote:
> This is just a prep patch. It moves the worker related fields to a new
> vhost_worker struct and moves the code around to create some helpers that
> will be used in the next patches.
> 
> Signed-off-by: Mike Christie 
> ---
>  drivers/vhost/vhost.c | 94 +--
>  drivers/vhost/vhost.h |  9 -
>  2 files changed, 70 insertions(+), 33 deletions(-)

Reviewed-by: Stefan Hajnoczi 


signature.asc
Description: PGP signature
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

Re: vhost: multiple worker support

2021-06-03 Thread Stefan Hajnoczi
On Tue, May 25, 2021 at 01:05:51PM -0500, Mike Christie wrote:
> Results:
> 
> When running with the null_blk driver and vhost-scsi I can get 1.2
> million IOPs by just running a simple
> 
> fio --filename=/dev/sda --direct=1 --rw=randrw --bs=4k --ioengine=libaio
> --iodepth=128  --numjobs=8 --time_based --group_reporting --name=iops
> --runtime=60 --eta-newline=1
> 
> The VM has 8 vCPUs and sda has 8 virtqueues and we can do a total of
> 1024 cmds per devices. To get 1.2 million IOPs I did have to tune and
> ran the virsh emulatorpin command so the vhost threads were running
> on different CPUs than the VM. If the vhost threads share CPUs then I
> get around 800K.
> 
> For a more real device that are also CPU hogs like iscsi, I can still
> get 1 million IOPs using 1 dm-multipath device over 8 iscsi paths
> (natively it gets 1.1 million IOPs).

There is no comparison against a baseline, but I guess it would be the
same 8 vCPU guest with single queue vhost-scsi?

> 
> Results/TODO Note:
> 
> - I ported the vdpa sim code to support multiple workers and as-is now
> it made perf much worse. If I increase vdpa_sim_blk's num queues to 4-8
> I get 700K IOPs with the fio command above. However with the multiple
> worker support it drops to 400K. The problem is the vdpa_sim lock
> and the iommu_lock. If I hack (like comment out locks or not worry about
> data corruption or crashes) then I can get around 1.2M - 1.6M IOPs with
> 8 queues and fio command above.
> 
> So these patches could help other drivers, but it will just take more
> work to remove those types of locks. I was hoping the 2 items could be
> done independently since it helps vhost-scsi immediately.

Cool, thank you for taking a look. That's useful info for Stefano. vDPA
and vhost can be handled independently though in the long term hopefully
they can share code.


signature.asc
Description: PGP signature
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

Re: [PATCH v1 5/8] dma: Use size for swiotlb boundary checks

2021-06-03 Thread Robin Murphy

On 2021-06-03 01:41, Andi Kleen wrote:

swiotlb currently only uses the start address of a DMA to check if something
is in the swiotlb or not. But with virtio and untrusted hosts the host
could give some DMA mapping that crosses the swiotlb boundaries,
potentially leaking or corrupting data. Add size checks to all the swiotlb
checks and reject any DMAs that cross the swiotlb buffer boundaries.

Signed-off-by: Andi Kleen 
---
  drivers/iommu/dma-iommu.c   | 13 ++---
  drivers/xen/swiotlb-xen.c   | 11 ++-
  include/linux/dma-mapping.h |  4 ++--
  include/linux/swiotlb.h |  8 +---
  kernel/dma/direct.c |  8 
  kernel/dma/direct.h |  8 
  kernel/dma/mapping.c|  4 ++--
  net/xdp/xsk_buff_pool.c |  2 +-
  8 files changed, 30 insertions(+), 28 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 7bcdd1205535..7ef13198721b 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -504,7 +504,7 @@ static void __iommu_dma_unmap_swiotlb(struct device *dev, 
dma_addr_t dma_addr,
  
  	__iommu_dma_unmap(dev, dma_addr, size);


If you can't trust size below then you've already corrupted the IOMMU 
pagetables here :/


Robin.


-   if (unlikely(is_swiotlb_buffer(phys)))
+   if (unlikely(is_swiotlb_buffer(phys, size)))
swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
  }
  
@@ -575,7 +575,7 @@ static dma_addr_t __iommu_dma_map_swiotlb(struct device *dev, phys_addr_t phys,

}
  
  	iova = __iommu_dma_map(dev, phys, aligned_size, prot, dma_mask);

-   if (iova == DMA_MAPPING_ERROR && is_swiotlb_buffer(phys))
+   if (iova == DMA_MAPPING_ERROR && is_swiotlb_buffer(phys, org_size))
swiotlb_tbl_unmap_single(dev, phys, org_size, dir, attrs);
return iova;
  }
@@ -781,7 +781,7 @@ static void iommu_dma_sync_single_for_cpu(struct device 
*dev,
if (!dev_is_dma_coherent(dev))
arch_sync_dma_for_cpu(phys, size, dir);
  
-	if (is_swiotlb_buffer(phys))

+   if (is_swiotlb_buffer(phys, size))
swiotlb_sync_single_for_cpu(dev, phys, size, dir);
  }
  
@@ -794,7 +794,7 @@ static void iommu_dma_sync_single_for_device(struct device *dev,

return;
  
  	phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle);

-   if (is_swiotlb_buffer(phys))
+   if (is_swiotlb_buffer(phys, size))
swiotlb_sync_single_for_device(dev, phys, size, dir);
  
  	if (!dev_is_dma_coherent(dev))

@@ -815,7 +815,7 @@ static void iommu_dma_sync_sg_for_cpu(struct device *dev,
if (!dev_is_dma_coherent(dev))
arch_sync_dma_for_cpu(sg_phys(sg), sg->length, dir);
  
-		if (is_swiotlb_buffer(sg_phys(sg)))

+   if (is_swiotlb_buffer(sg_phys(sg), sg->length))
swiotlb_sync_single_for_cpu(dev, sg_phys(sg),
sg->length, dir);
}
@@ -832,10 +832,9 @@ static void iommu_dma_sync_sg_for_device(struct device 
*dev,
return;
  
  	for_each_sg(sgl, sg, nelems, i) {

-   if (is_swiotlb_buffer(sg_phys(sg)))
+   if (is_swiotlb_buffer(sg_phys(sg), sg->length))
swiotlb_sync_single_for_device(dev, sg_phys(sg),
   sg->length, dir);
-
if (!dev_is_dma_coherent(dev))
arch_sync_dma_for_device(sg_phys(sg), sg->length, dir);
}
diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index 24d11861ac7d..333846af8d35 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -89,7 +89,8 @@ static inline int range_straddles_page_boundary(phys_addr_t 
p, size_t size)
return 0;
  }
  
-static int is_xen_swiotlb_buffer(struct device *dev, dma_addr_t dma_addr)

+static int is_xen_swiotlb_buffer(struct device *dev, dma_addr_t dma_addr,
+size_t size)
  {
unsigned long bfn = XEN_PFN_DOWN(dma_to_phys(dev, dma_addr));
unsigned long xen_pfn = bfn_to_local_pfn(bfn);
@@ -100,7 +101,7 @@ static int is_xen_swiotlb_buffer(struct device *dev, 
dma_addr_t dma_addr)
 * in our domain. Therefore _only_ check address within our domain.
 */
if (pfn_valid(PFN_DOWN(paddr)))
-   return is_swiotlb_buffer(paddr);
+   return is_swiotlb_buffer(paddr, size);
return 0;
  }
  
@@ -431,7 +432,7 @@ static void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,

}
  
  	/* NOTE: We use dev_addr here, not paddr! */

-   if (is_xen_swiotlb_buffer(hwdev, dev_addr))
+   if (is_xen_swiotlb_buffer(hwdev, dev_addr, size))
swiotlb_tbl_unmap_single(hwdev, paddr, size, dir, attrs);
  }
  
@@ -448,7 +449,7 @@ xen_swiotlb_sync_single_for_cpu(struct device *dev, dma_addr_t dma_addr,

  

Re: [PATCH v1 6/8] dma: Add return value to dma_unmap_page

2021-06-03 Thread Robin Murphy

Hi Andi,

On 2021-06-03 01:41, Andi Kleen wrote:

In some situations when we know swiotlb is forced and we have
to deal with untrusted hosts, it's useful to know if a mapping
was in the swiotlb or not. This allows us to abort any IO
operation that would access memory outside the swiotlb.

Otherwise it might be possible for a malicious host to inject
any guest page in a read operation. While it couldn't directly
access the results of the read() inside the guest, there
might scenarios where data is echoed back with a write(),
and that would then leak guest memory.

Add a return value to dma_unmap_single/page. Most users
of course will ignore it. The return value is set to EIO
if we're in forced swiotlb mode and the buffer is not inside
the swiotlb buffer. Otherwise it's always 0.


I have to say my first impression of this isn't too good :(

What it looks like to me is abusing SWIOTLB's internal housekeeping to 
keep track of virtio-specific state. The DMA API does not attempt to 
validate calls in general since in many cases the additional overhead 
would be prohibitive. It has always been callers' responsibility to keep 
track of what they mapped and make sure sync/unmap calls match, and 
there are many, many, subtle and not-so-subtle ways for things to go 
wrong if they don't. If virtio is not doing a good enough job of that, 
what's the justification for making it the DMA API's problem?



A new callback is used to avoid changing all the IOMMU drivers.


Nit: presumably by "IOMMU drivers" you actually mean arch DMA API backends?

As an aside, we'll take a look at the rest of the series for the 
perspective of our prototyping for Arm's Confidential Compute 
Architecture, but I'm not sure we'll need it, since accesses beyond the 
bounds of the shared SWIOTLB buffer shouldn't be an issue for us. 
Furthermore, AFAICS it's still not going to help against exfiltrating 
guest memory by over-unmapping the original SWIOTLB slot *without* going 
past the end of the whole buffer, but I think Martin's patch *has* 
addressed that already.


Robin.


Signed-off-by: Andi Kleen 
---
  drivers/iommu/dma-iommu.c   | 17 +++--
  include/linux/dma-map-ops.h |  3 +++
  include/linux/dma-mapping.h |  7 ---
  kernel/dma/mapping.c|  6 +-
  4 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 7ef13198721b..babe46f2ae3a 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -491,7 +491,8 @@ static void __iommu_dma_unmap(struct device *dev, 
dma_addr_t dma_addr,
iommu_dma_free_iova(cookie, dma_addr, size, iotlb_gather.freelist);
  }
  
-static void __iommu_dma_unmap_swiotlb(struct device *dev, dma_addr_t dma_addr,

+static int __iommu_dma_unmap_swiotlb_check(struct device *dev,
+   dma_addr_t dma_addr,
size_t size, enum dma_data_direction dir,
unsigned long attrs)
  {
@@ -500,12 +501,15 @@ static void __iommu_dma_unmap_swiotlb(struct device *dev, 
dma_addr_t dma_addr,
  
  	phys = iommu_iova_to_phys(domain, dma_addr);

if (WARN_ON(!phys))
-   return;
+   return -EIO;
  
  	__iommu_dma_unmap(dev, dma_addr, size);
  
  	if (unlikely(is_swiotlb_buffer(phys, size)))

swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
+   else if (swiotlb_force == SWIOTLB_FORCE)
+   return -EIO;
+   return 0;
  }
  
  static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys,

@@ -856,12 +860,13 @@ static dma_addr_t iommu_dma_map_page(struct device *dev, 
struct page *page,
return dma_handle;
  }
  
-static void iommu_dma_unmap_page(struct device *dev, dma_addr_t dma_handle,

+static int iommu_dma_unmap_page_check(struct device *dev, dma_addr_t 
dma_handle,
size_t size, enum dma_data_direction dir, unsigned long attrs)
  {
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
iommu_dma_sync_single_for_cpu(dev, dma_handle, size, dir);
-   __iommu_dma_unmap_swiotlb(dev, dma_handle, size, dir, attrs);
+   return __iommu_dma_unmap_swiotlb_check(dev, dma_handle, size, dir,
+  attrs);
  }
  
  /*

@@ -946,7 +951,7 @@ static void iommu_dma_unmap_sg_swiotlb(struct device *dev, 
struct scatterlist *s
int i;
  
  	for_each_sg(sg, s, nents, i)

-   __iommu_dma_unmap_swiotlb(dev, sg_dma_address(s),
+   __iommu_dma_unmap_swiotlb_check(dev, sg_dma_address(s),
sg_dma_len(s), dir, attrs);
  }
  
@@ -1291,7 +1296,7 @@ static const struct dma_map_ops iommu_dma_ops = {

.mmap   = iommu_dma_mmap,
.get_sgtable= iommu_dma_get_sgtable,
.map_page   = iommu_dma_map_page,
-   .unmap_page = iommu_dma_unmap_page,
+   .unmap_page_check   = iommu_dma_unmap_page_check,
.map_sg   

Re: [PATCH v1] vdpa/mlx5: Add support for doorbell bypassing

2021-06-03 Thread Jason Wang


在 2021/6/3 下午3:38, Eli Cohen 写道:

On Thu, Jun 03, 2021 at 03:11:51PM +0800, Jason Wang wrote:

在 2021/6/2 下午5:53, Eli Cohen 写道:

Implement mlx5_get_vq_notification() to return the doorbell address.
Since the notification area is mapped to userspace, make sure that the
BAR size is at least PAGE_SIZE large.

Signed-off-by: Eli Cohen 
---
v0 --> v1:
Make sure SF bar size is not smaller than PAGE_SIZE

   drivers/vdpa/mlx5/core/mlx5_vdpa.h |  1 +
   drivers/vdpa/mlx5/core/resources.c |  1 +
   drivers/vdpa/mlx5/net/mlx5_vnet.c  | 17 +
   3 files changed, 19 insertions(+)

diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h 
b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
index 09a16a3d1b2a..0002b2136b48 100644
--- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h
+++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
@@ -42,6 +42,7 @@ struct mlx5_vdpa_resources {
u32 pdn;
struct mlx5_uars_page *uar;
void __iomem *kick_addr;
+   u64 phys_kick_addr;
u16 uid;
u32 null_mkey;
bool valid;
diff --git a/drivers/vdpa/mlx5/core/resources.c 
b/drivers/vdpa/mlx5/core/resources.c
index 836ab9ef0fa6..d4606213f88a 100644
--- a/drivers/vdpa/mlx5/core/resources.c
+++ b/drivers/vdpa/mlx5/core/resources.c
@@ -253,6 +253,7 @@ int mlx5_vdpa_alloc_resources(struct mlx5_vdpa_dev *mvdev)
goto err_key;
kick_addr = mdev->bar_addr + offset;
+   res->phys_kick_addr = kick_addr;
res->kick_addr = ioremap(kick_addr, PAGE_SIZE);
if (!res->kick_addr) {
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 5500bcfe84b4..1936039e05bd 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1871,8 +1871,25 @@ static void mlx5_vdpa_free(struct vdpa_device *vdev)
   static struct vdpa_notification_area mlx5_get_vq_notification(struct 
vdpa_device *vdev, u16 idx)
   {
+   struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
struct vdpa_notification_area ret = {};
+   struct mlx5_vdpa_net *ndev;
+   phys_addr_t addr;
+
+   /* If SF BAR size is smaller than PAGE_SIZE, do not use direct
+* notification to avoid the risk of mapping pages that contain BAR of 
more
+* than one SF
+*/
+   if (MLX5_CAP_GEN(mvdev->mdev, log_min_sf_size) + 12 < PAGE_SHIFT)
+   return ret;
+
+   ndev = to_mlx5_vdpa_ndev(mvdev);
+   addr = (phys_addr_t)ndev->mvdev.res.phys_kick_addr;
+   if (addr & ~PAGE_MASK)
+   return ret;


This has been checked by vhost-vDPA, and it's better to leave that policy
checking to the driver instead of checking it in the parent.


Not in all invocations of get_vq_notification(). For example, in
vhost_vdpa_fault() you call remap_pfn_range() with notify.addr >>
PAGE_SHIFT, so if it was not aligned you mask this misalignment.



In order to have vhost_vdpa_fault() work, it must first pass the 
check in vhost_vdpa_mmap().


Otherwise we won't install vma->vm_ops, so there won't be a page fault for 
the doorbell.
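
For reference, the ordering described here is roughly the following
(simplified sketch of vhost_vdpa_mmap(), not the verbatim kernel code):

static int vhost_vdpa_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct vhost_vdpa *v = file->private_data;
	const struct vdpa_config_ops *ops = v->vdpa->config;
	struct vdpa_notification_area notify;

	if (vma->vm_end - vma->vm_start != PAGE_SIZE)
		return -EINVAL;

	notify = ops->get_vq_notification(v->vdpa, vma->vm_pgoff);
	if (notify.addr & (PAGE_SIZE - 1))
		return -EINVAL;
	if (vma->vm_end - vma->vm_start != notify.size)
		return -ENOTSUPP;

	/* Only now is the fault handler installed. */
	vma->vm_ops = &vhost_vdpa_vm_ops;
	return 0;
}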


Thanks





Thanks



+   ret.addr = (phys_addr_t)ndev->mvdev.res.phys_kick_addr;
+   ret.size = PAGE_SIZE;
return ret;
   }


___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

Re: [PATCH] vdpa/mlx5: Clear vq ready indication upon device reset

2021-06-03 Thread Jason Wang


在 2021/6/3 下午3:30, Eli Cohen 写道:

On Thu, Jun 03, 2021 at 03:06:31PM +0800, Jason Wang wrote:

在 2021/6/3 下午3:00, Jason Wang 写道:

在 2021/6/2 下午4:59, Eli Cohen 写道:

After device reset, the virtqueues are not ready so clear the ready
field.

Failing to do so can result in virtio_vdpa failing to load if the device
was previously used by vhost_vdpa and the old values are ready.
virtio_vdpa expects to find VQs in "not ready" state.

Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5
devices")
Signed-off-by: Eli Cohen 


Acked-by: Jason Wang 


A second thought.

destroy_virtqueue() could be called many places.

One of them is the mlx5_vdpa_change_map(); if this is the case, this looks
wrong.

Right, although most likely VQs become ready only after all map changes
occur because I did not encounter any issue while testing.



Yes, but it's not guaranteed that the map won't be changed. Userspace 
can update the mapping when new memory is plugged into the guest for 
example.




It looks to me it's simpler to do this in clear_virtqueues() which can only
be called during reset.

There is no clear_virtqueues() function. You probably mean to insert a
call in mlx5_vdpa_set_status() in case it performs reset. This function
will go over all virtqueues and clear their ready flag.



Right.




Alternatively we can add boolean argument to teardown_driver() that
signifies if we are in reset flow and in this case we clear ready.



Yes, but doing in set_status() seems easier.
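
A minimal sketch of that approach, assuming the virtqueue array and
max_vqs layout of mlx5_vnet.c (helper name is made up):

static void clear_virtqueues(struct mlx5_vdpa_net *ndev)
{
	int i;

	/* Called from mlx5_vdpa_set_status() when the device is reset. */
	for (i = 0; i < ndev->mvdev.max_vqs; i++)
		ndev->vqs[i].ready = false;
}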

Thanks





Thanks





---
   drivers/vdpa/mlx5/net/mlx5_vnet.c | 1 +
   1 file changed, 1 insertion(+)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 02a05492204c..e8bc0842b44c 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -862,6 +862,7 @@ static void destroy_virtqueue(struct
mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtq
   return;
   }
   umems_destroy(ndev, mvq);
+    mvq->ready = false;
   }
     static u32 get_rqpn(struct mlx5_vdpa_virtqueue *mvq, bool fw)


___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

Re: [RFC PATCH] vdpa: mandate 1.0 device

2021-06-03 Thread Jason Wang


在 2021/6/2 下午6:30, Eli Cohen 写道:

On Wed, May 12, 2021 at 05:24:21PM +0800, Jason Wang wrote:

Michael,
Did you and Jason came into agreement regarding this?



Probably, let me send a formal patch and see what happens.

Thanks



Do you think we
can have the bits in 5.13 and still have time for me to push the vdpa
too stuff?



On Wed, May 12, 2021 at 3:54 PM Michael S. Tsirkin  wrote:

On Tue, May 11, 2021 at 04:43:13AM -0400, Jason Wang wrote:


- 原始邮件 -

在 2021/4/21 下午4:03, Michael S. Tsirkin 写道:

On Wed, Apr 21, 2021 at 03:41:36PM +0800, Jason Wang wrote:

在 2021/4/12 下午5:23, Jason Wang 写道:

在 2021/4/12 下午5:09, Michael S. Tsirkin 写道:

On Mon, Apr 12, 2021 at 02:35:07PM +0800, Jason Wang wrote:

在 2021/4/10 上午12:04, Michael S. Tsirkin 写道:

On Fri, Apr 09, 2021 at 12:47:55PM +0800, Jason Wang wrote:

在 2021/4/8 下午11:59, Michael S. Tsirkin 写道:

On Thu, Apr 08, 2021 at 04:26:48PM +0800, Jason Wang wrote:

This patch mandates 1.0 for vDPA devices. The goal is to have the
semantic of normative statement in the virtio
spec and eliminate the
burden of transitional device for both vDPA bus and vDPA parent.

uAPI seems fine since all the vDPA parent mandates
VIRTIO_F_ACCESS_PLATFORM which implies 1.0 devices.

For legacy guests, it can still work since Qemu will mediate when
necessary (e.g doing the endian conversion).

Signed-off-by: Jason Wang 

Hmm. If we do this, don't we still have a problem with
legacy drivers which don't ack 1.0?

Yes, but it's not something that is introduced in this
commit. The legacy
driver never work ...

My point is this neither fixes or prevents this.

So my suggestion is to finally add ioctls along the lines
of PROTOCOL_FEATURES of vhost-user.

Then that one can have bits for legacy le, legacy be and modern.

BTW I looked at vhost-user and it does not look like that
has a solution for this problem either, right?

Right.



Note 1.0 affects ring endianness which is not mediated in QEMU
so QEMU can't pretend to device guest is 1.0.

Right, I plan to send patches to do mediation in the
Qemu to unbreak legacy
drivers.

Thanks

I frankly think we'll need PROTOCOL_FEATURES anyway, it's
too useful ...
so why not teach drivers about it and be done with it? You
can't emulate
legacy on modern in a cross endian situation because of vring
endian-ness ...

So the problem remains: this can only work when the hardware can support
legacy vring endian-ness.

Consider:

1) the legacy driver support is non-normative in the spec
2) supporting a transitional device in the kernel may require hardware
support and adds a burden of kernel code

I'd rather simply drop the legacy driver support

My point is this patch does not drop legacy support. It merely mandates
modern support.

I am not sure I get here. This patch fails the set_feature if VERSION_1
is not negotiated. This means:

1) vDPA presents a modern device instead of transitonal device
2) legacy driver can't be probed

What I'm missing?

Hi Michael:

Do you agree to find the way to present modern device? We need a
conclusion
to make the netlink API work to move forward.

Thanks

I think we need a way to support legacy with no data path overhead. qemu
setting VERSION_1 for a legacy guest affects the ring format so it does
not really work. This seems to rule out emulating config space entirely
in userspace.


So I'd rather drop the legacy support in this case. It never worked for
vDPA in the past and virtio-vDPA doesn't even need it. Note that
ACCESS_PLATFORM is mandated for all the vDPA parents right now, which
implies a modern device and LE. I wonder what the value is of supporting
legacy in this case, or do we really encourage vendors to ship cards with
legacy support (e.g. endian support in the hardware)?

Hi Michael:

Any thoughts on this approach?

My understanding is that dropping legacy support will simplify a lot of things.

Thanks

So basically the main condition is that strong memory barriers aren't
needed for virtio, smp barriers are enough.
Are there architectures besides x86 (where it's kind of true - as long as
one does not use non-temporals) where that is true?
If all these architectures are LE then we don't need to worry
about endian support in the hardware.

So I agree it's better not to add those stuffs in either qemu or
kernel. See below.


In other words I guess yes we could have qemu limit things to x86 and
then just pretend to the card that it's virtio 1.
So endian-ness we can address.

Problem is virtio 1 has effects beyond this: things like header size
with mergeable buffers off for virtio-net.

So I am inclined to say let us not do the "pretend it's virtio 1" game
in qemu.

I fully agree.

   Let us be honest to the card about what happens.

But if you want to limit things to x86 either in kernel or in qemu,
that's ok by me.

So what I want to do is:

1) mandate 1.0 device on the kernel
2) don't try to pretend transitional or legacy device on top of modern
device in Qemu, so qemu will fail to start if vhost-vDPA is 

Re: [PATCH v1] vdpa/mlx5: Add support for doorbell bypassing

2021-06-03 Thread Jason Wang


在 2021/6/2 下午5:53, Eli Cohen 写道:

Implement mlx5_get_vq_notification() to return the doorbell address.
Since the notification area is mapped to userspace, make sure that the
BAR size is at least PAGE_SIZE large.

Signed-off-by: Eli Cohen 
---
v0 --> v1:
   Make sure SF bar size is not smaller than PAGE_SIZE

  drivers/vdpa/mlx5/core/mlx5_vdpa.h |  1 +
  drivers/vdpa/mlx5/core/resources.c |  1 +
  drivers/vdpa/mlx5/net/mlx5_vnet.c  | 17 +
  3 files changed, 19 insertions(+)

diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h 
b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
index 09a16a3d1b2a..0002b2136b48 100644
--- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h
+++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
@@ -42,6 +42,7 @@ struct mlx5_vdpa_resources {
u32 pdn;
struct mlx5_uars_page *uar;
void __iomem *kick_addr;
+   u64 phys_kick_addr;
u16 uid;
u32 null_mkey;
bool valid;
diff --git a/drivers/vdpa/mlx5/core/resources.c 
b/drivers/vdpa/mlx5/core/resources.c
index 836ab9ef0fa6..d4606213f88a 100644
--- a/drivers/vdpa/mlx5/core/resources.c
+++ b/drivers/vdpa/mlx5/core/resources.c
@@ -253,6 +253,7 @@ int mlx5_vdpa_alloc_resources(struct mlx5_vdpa_dev *mvdev)
goto err_key;
  
  	kick_addr = mdev->bar_addr + offset;

+   res->phys_kick_addr = kick_addr;
  
  	res->kick_addr = ioremap(kick_addr, PAGE_SIZE);

if (!res->kick_addr) {
diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 5500bcfe84b4..1936039e05bd 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -1871,8 +1871,25 @@ static void mlx5_vdpa_free(struct vdpa_device *vdev)
  
  static struct vdpa_notification_area mlx5_get_vq_notification(struct vdpa_device *vdev, u16 idx)

  {
+   struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
struct vdpa_notification_area ret = {};
+   struct mlx5_vdpa_net *ndev;
+   phys_addr_t addr;
+
+   /* If SF BAR size is smaller than PAGE_SIZE, do not use direct
+* notification to avoid the risk of mapping pages that contain BAR of 
more
+* than one SF
+*/
+   if (MLX5_CAP_GEN(mvdev->mdev, log_min_sf_size) + 12 < PAGE_SHIFT)
+   return ret;
+
+   ndev = to_mlx5_vdpa_ndev(mvdev);
+   addr = (phys_addr_t)ndev->mvdev.res.phys_kick_addr;
+   if (addr & ~PAGE_MASK)
+   return ret;



This has been checked by vhost-vDPA, and it's better to leave that 
policy checking to the driver instead of checking it in the parent.


Thanks


  
+	ret.addr = (phys_addr_t)ndev->mvdev.res.phys_kick_addr;

+   ret.size = PAGE_SIZE;
return ret;
  }
  


___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

Re: [PATCH] vdpa/mlx5: Clear vq ready indication upon device reset

2021-06-03 Thread Jason Wang


在 2021/6/3 下午3:00, Jason Wang 写道:


在 2021/6/2 下午4:59, Eli Cohen 写道:

After device reset, the virtqueues are not ready so clear the ready
field.

Failing to do so can result in virtio_vdpa failing to load if the device
was previously used by vhost_vdpa and the old values are ready.
virtio_vdpa expects to find VQs in "not ready" state.

Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 
devices")

Signed-off-by: Eli Cohen 



Acked-by: Jason Wang 



A second thought.

destroy_virtqueue() could be called many places.

One of them is the mlx5_vdpa_change_map(); if this is the case, this looks 
wrong.


It looks to me it's simpler to do this in clear_virtqueues() which can 
only be called during reset.


Thanks






---
  drivers/vdpa/mlx5/net/mlx5_vnet.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c

index 02a05492204c..e8bc0842b44c 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -862,6 +862,7 @@ static void destroy_virtqueue(struct 
mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtq

  return;
  }
  umems_destroy(ndev, mvq);
+    mvq->ready = false;
  }
    static u32 get_rqpn(struct mlx5_vdpa_virtqueue *mvq, bool fw)


___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

Re: [PATCH] vdpa/mlx5: Clear vq ready indication upon device reset

2021-06-03 Thread Jason Wang


在 2021/6/2 下午4:59, Eli Cohen 写道:

After device reset, the virtqueues are not ready so clear the ready
field.

Failing to do so can result in virtio_vdpa failing to load if the device
was previously used by vhost_vdpa and the old values are ready.
virtio_vdpa expects to find VQs in "not ready" state.

Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices")
Signed-off-by: Eli Cohen 



Acked-by: Jason Wang 



---
  drivers/vdpa/mlx5/net/mlx5_vnet.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c 
b/drivers/vdpa/mlx5/net/mlx5_vnet.c
index 02a05492204c..e8bc0842b44c 100644
--- a/drivers/vdpa/mlx5/net/mlx5_vnet.c
+++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c
@@ -862,6 +862,7 @@ static void destroy_virtqueue(struct mlx5_vdpa_net *ndev, 
struct mlx5_vdpa_virtq
return;
}
umems_destroy(ndev, mvq);
+   mvq->ready = false;
  }
  
  static u32 get_rqpn(struct mlx5_vdpa_virtqueue *mvq, bool fw)


___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization