Re: [RFC v5 3/5] virtio_ring: add packed ring support

2018-05-28 Thread Tiwei Bie
On Tue, May 29, 2018 at 11:18:57AM +0800, Jason Wang wrote:
> On 2018-05-22 16:16, Tiwei Bie wrote:
[...]
> > +static void detach_buf_packed(struct vring_virtqueue *vq,
> > + unsigned int id, void **ctx)
> > +{
> > +   struct vring_desc_state_packed *state;
> > +   struct vring_packed_desc *desc;
> > +   unsigned int i;
> > +   int *next;
> > +
> > +   /* Clear data ptr. */
> > +   vq->desc_state_packed[id].data = NULL;
> > +
> > +   next = 
> > +   for (i = 0; i < vq->desc_state_packed[id].num; i++) {
> > +   state = &vq->desc_state_packed[*next];
> > +   vring_unmap_state_packed(vq, state);
> > +   next = &vq->desc_state_packed[*next].next;
> 
> Had a short discussion with Michael. It looks like the only thing we care
> about here is the DMA address and length.

The length tracked by desc_state_packed[] is also necessary when INDIRECT
is used and the buffers are writable: it is what sizes the indirect table
when unmapping it (the len / sizeof(struct vring_packed_desc) loop below).

> 
> So a thought is to avoid using desc_state_packed[] if vring_use_dma_api() is
> false? Probably need another id allocator, or just use desc_state_packed for
> the free id list.

I think it's a good suggestion. Thanks!

I won't track the addr/len/flags for non-indirect descs
when vring_use_dma_api() returns false.

> 
> > +   }
> > +
> > +   vq->vq.num_free += vq->desc_state_packed[id].num;
> > +   *next = vq->free_head;
> 
> Using a pointer seems inelegant and unnecessary here. You can just track
> next as an integer, I think?

It's possible to use an integer, but it would need one more variable to
track `prev` (where desc_state_packed[prev].next == next).
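
For illustration only (not from the patch), a rough sketch of what the
integer-based variant could look like inside detach_buf_packed(), with the
extra `prev` needed to splice the old free list onto the tail of the chain:

	unsigned int curr = id, prev = id;

	for (i = 0; i < vq->desc_state_packed[id].num; i++) {
		vring_unmap_state_packed(vq, &vq->desc_state_packed[curr]);
		prev = curr;
		curr = vq->desc_state_packed[curr].next;
	}

	vq->vq.num_free += vq->desc_state_packed[id].num;
	vq->desc_state_packed[prev].next = vq->free_head;
	vq->free_head = id;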

> 
> > +   vq->free_head = id;
> > +
> > +   if (vq->indirect) {
> > +   u32 len;
> > +
> > +   /* Free the indirect table, if any, now that it's unmapped. */
> > +   desc = vq->desc_state_packed[id].indir_desc;
> > +   if (!desc)
> > +   return;
> > +
> > +   len = vq->desc_state_packed[id].len;
> > +   for (i = 0; i < len / sizeof(struct vring_packed_desc); i++)
> > +   vring_unmap_desc_packed(vq, &desc[i]);
> > +
> > +   kfree(desc);
> > +   vq->desc_state_packed[id].indir_desc = NULL;
> > +   } else if (ctx) {
> > +   *ctx = vq->desc_state_packed[id].indir_desc;
> > +   }
> >   }
> >   static inline bool more_used_packed(const struct vring_virtqueue *vq)
> >   {
> > -   return false;
> > +   u16 last_used, flags;
> > +   u8 avail, used;
> > +
> > +   last_used = vq->last_used_idx;
> > +   flags = virtio16_to_cpu(vq->vq.vdev,
> > +   vq->vring_packed.desc[last_used].flags);
> > +   avail = !!(flags & VRING_DESC_F_AVAIL(1));
> > +   used = !!(flags & VRING_DESC_F_USED(1));
> > +
> > +   return avail == used && used == vq->used_wrap_counter;
> 
> The spec does not check avail == used? So there's probably a bug in one of
> the two places.
> 
> Under what condition would avail != used but used == vq->used_wrap_counter? A
> buggy device, or a device that sets the two bits in two writes?

Currently, the spec doesn't forbid devices from setting the status
bits such that avail != used while used == vq->used_wrap_counter.
So I think the driver cannot assume this won't happen.
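
For contrast, a sketch (not from the patch) of how a well-behaved device
would publish a used descriptor with a single 16-bit flags store, so that
AVAIL and USED always end up equal and matching the device's wrap counter
(the function-like macro form follows this series):

	static inline u16 packed_used_flags(u16 flags, bool wrap_counter)
	{
		flags &= ~(VRING_DESC_F_AVAIL(1) | VRING_DESC_F_USED(1));
		return flags | VRING_DESC_F_AVAIL(wrap_counter) |
			       VRING_DESC_F_USED(wrap_counter);
	}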

> 
> >   }
[...]
> >   static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
> >   {
> > -   return 0;
> > +   struct vring_virtqueue *vq = to_vvq(_vq);
> > +
> > +   START_USE(vq);
> > +
> > +   /* We optimistically turn back on interrupts, then check if there was
> > +* more to do. */
> > +
> > +   if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
> > +   virtio_wmb(vq->weak_barriers);
> 
> Missing comments for the barrier.

Will add some comments.

> 
> > +   vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
> > +   vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> > +   vq->event_flags_shadow);
> > +   }
> > +
> > +   END_USE(vq);
> > +   return vq->last_used_idx;
> >   }
> >   static bool virtqueue_poll_packed(struct virtqueue *_vq, unsigned 
> > last_used_idx)
> >   {
> > -   return false;
> > +   struct vring_virtqueue *vq = to_vvq(_vq);
> > +   u8 avail, used;
> > +   u16 flags;
> > +
> > +   virtio_mb(vq->weak_barriers);
> > +   flags = virtio16_to_cpu(vq->vq.vdev,
> > +   vq->vring_packed.desc[last_used_idx].flags);
> > +   avail = !!(flags & VRING_DESC_F_AVAIL(1));
> > +   used = !!(flags & VRING_DESC_F_USED(1));
> > +
> > +   return avail == used && used == vq->used_wrap_counter;
> >   }
> >   static bool virtqueue_enable_cb_delayed_packed(struct virtqueue *_vq)
> >   {
> > -   return false;
> > +   struct vring_virtqueue *vq = to_vvq(_vq);
> > +
> > +   START_USE(vq);
> > +
> > +   /* We optimistically turn back on interrupts, then check if there was
> > +* more to do. */
> > +
> > +   if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
> > +   virtio_wmb(vq->weak_barriers);
> 
> Need comments for the barrier.

Will add some comments.

Best regards,
Tiwei Bie

Re: [PATCH net-next v12 1/5] net: Introduce generic failover module

2018-05-28 Thread Stephen Hemminger
On Fri, 25 May 2018 16:06:58 -0700
"Samudrala, Sridhar"  wrote:

> On 5/25/2018 3:38 PM, Stephen Hemminger wrote:
> > On Thu, 24 May 2018 09:55:13 -0700
> > Sridhar Samudrala  wrote:
> >  
> >> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> >> index 03ed492c4e14..0f4ba52b641d 100644
> >> --- a/include/linux/netdevice.h
> >> +++ b/include/linux/netdevice.h
> >> @@ -1421,6 +1421,8 @@ struct net_device_ops {
> >>*   entity (i.e. the master device for bridged veth)
> >>* @IFF_MACSEC: device is a MACsec device
> >>* @IFF_NO_RX_HANDLER: device doesn't support the rx_handler hook
> >> + * @IFF_FAILOVER: device is a failover master device
> >> + * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device
> >>*/
> >>   enum netdev_priv_flags {
> >>IFF_802_1Q_VLAN = 1<<0,
> >> @@ -1450,6 +1452,8 @@ enum netdev_priv_flags {
> >>IFF_PHONY_HEADROOM  = 1<<24,
> >>IFF_MACSEC  = 1<<25,
> >>IFF_NO_RX_HANDLER   = 1<<26,
> >> +  IFF_FAILOVER= 1<<27,
> >> +  IFF_FAILOVER_SLAVE  = 1<<28,
> >>   };  
> > Why is FAILOVER any different from other master/slave relationships?
> > I don't think you need to take up precious netdev flag bits for this.
> 
> These are netdev priv flags.
> Jiri says that IFF_MASTER/IFF_SLAVE are bonding-specific flags and cannot be
> used with other failover mechanisms. Team also doesn't use these flags; it
> has its own priv_flags.
> 

They are already used by bonding and team.
I don't see why this can't reuse them.
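
As an illustration only (not from the patch), other code can identify the
lower device of a failover pair with the new bits, without touching
bonding's IFF_MASTER/IFF_SLAVE:

	if (dev->priv_flags & IFF_FAILOVER_SLAVE)
		continue;	/* skip: managed via its failover master */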


[PATCH v5] virtio_blk: add DISCARD and WRITE ZEROES commands support

2018-05-28 Thread Changpeng Liu
The existing virtio-blk protocol doesn't support DISCARD/WRITE ZEROES
commands, which hurts performance when using an SSD backend over
file systems.

Commit 88c85538 "virtio-blk: add discard and write zeroes features to
specification" (see https://github.com/oasis-tcs/virtio-spec) extended the
existing virtio-blk protocol with DISCARD and WRITE ZEROES command
support.

A 16-byte descriptor is used to describe one segment of a DISCARD or
WRITE ZEROES command; each command may contain one or more descriptors.

The following data structure shows the definition of one descriptor:

struct virtio_blk_discard_write_zeroes {
le64 sector;
le32 num_sectors;
le32 unmap;
};

Field 'sector' is the start sector for DISCARD and WRITE ZEROES, field
'num_sectors' is the number of sectors for DISCARD and WRITE ZEROES, and
if both DISCARD and WRITE ZEROES are supported, field 'unmap' may be used
for a WRITE ZEROES command with DISCARD enabled.
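
For example (illustrative values only, shown before endian conversion), a
single 1 MiB discard starting at byte offset 4 MiB would be described as:

    struct virtio_blk_discard_write_zeroes range = {
        .sector      = 8192, /* 4 MiB offset in 512-byte sectors */
        .num_sectors = 2048, /* 1 MiB length in 512-byte sectors */
        .unmap       = 0,    /* only meaningful for WRITE ZEROES */
    };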

We also extended the virtio-blk configuration space so that the backend
device can report its DISCARD and WRITE ZEROES configuration parameters.

struct virtio_blk_config {
[...]

le32 max_discard_sectors;
le32 max_discard_seg;
le32 discard_sector_alignment;
le32 max_write_zeroes_sectors;
le32 max_write_zeroes_seg;
u8 write_zeroes_may_unmap;
}

New feature bit [VIRTIO_BLK_F_DISCARD (13)]: the device supports the
discard command; the maximum discard sector count is reported in field
'max_discard_sectors' and the maximum number of discard segments in field
'max_discard_seg'.

New feature bit [VIRTIO_BLK_F_WRITE_ZEROES (14)]: the device supports the
write zeroes command; the maximum write zeroes sector count is reported in
field 'max_write_zeroes_sectors' and the maximum number of write zeroes
segments in field 'max_write_zeroes_seg'.

The device configuration space fields 'max_discard_sectors' and
'discard_sector_alignment' are expressed in 512-byte units if the
VIRTIO_BLK_F_DISCARD feature bit is negotiated. The field
'max_write_zeroes_sectors' is expressed in 512-byte units if the
VIRTIO_BLK_F_WRITE_ZEROES feature bit is negotiated.
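
As a rough, illustrative sketch only (not this patch's actual probe code;
'q' is assumed to be the disk's request_queue), the driver side could apply
the discard limits roughly like:

	if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) {
		u32 v;

		virtio_cread(vdev, struct virtio_blk_config,
			     max_discard_sectors, &v);
		blk_queue_max_discard_sectors(q, v ? v : UINT_MAX);
		virtio_cread(vdev, struct virtio_blk_config,
			     max_discard_seg, &v);
		blk_queue_max_discard_segments(q, v);
		blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
	}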

Signed-off-by: Changpeng Liu 
---
CHANGELOG:
v5: use new block layer API: blk_queue_flag_set.
v4: several optimizations based on MST's comments, remove bit field usage for
command descriptor.
v3: define the virtio-blk protocol to add discard and write zeroes support,
first version implementation based on proposed specification.
v2: add write zeroes command support.
v1: initial proposal implementation for discard command.
---
 drivers/block/virtio_blk.c  | 92 +++--
 include/uapi/linux/virtio_blk.h | 43 +++
 2 files changed, 132 insertions(+), 3 deletions(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 4a07593c..fabab2a 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -172,10 +172,45 @@ static int virtblk_add_req(struct virtqueue *vq, struct 
virtblk_req *vbr,
return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
 }
 
+static inline int virtblk_setup_discard_write_zeroes(struct request *req,
+   bool unmap)
+{
+   unsigned short segments = blk_rq_nr_discard_segments(req);
+   unsigned short n = 0;
+   struct virtio_blk_discard_write_zeroes *range;
+   struct bio *bio;
+
+   range = kmalloc_array(segments, sizeof(*range), GFP_KERNEL);
+   if (!range)
+   return -ENOMEM;
+
+   __rq_for_each_bio(bio, req) {
+   u64 sector = bio->bi_iter.bi_sector;
+   u32 num_sectors = bio->bi_iter.bi_size >> 9;
+
+   range[n].unmap = cpu_to_le32(unmap);
+   range[n].num_sectors = cpu_to_le32(num_sectors);
+   range[n].sector = cpu_to_le64(sector);
+   n++;
+   }
+
+   req->special_vec.bv_page = virt_to_page(range);
+   req->special_vec.bv_offset = offset_in_page(range);
+   req->special_vec.bv_len = sizeof(*range) * segments;
+   req->rq_flags |= RQF_SPECIAL_PAYLOAD;
+
+   return 0;
+}
+
 static inline void virtblk_request_done(struct request *req)
 {
struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
 
+   if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
+   kfree(page_address(req->special_vec.bv_page) +
+ req->special_vec.bv_offset);
+   }
+
switch (req_op(req)) {
case REQ_OP_SCSI_IN:
case REQ_OP_SCSI_OUT:
@@ -225,6 +260,7 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx 
*hctx,
int qid = hctx->queue_num;
int err;
bool notify = false;
+   bool unmap = false;
u32 type;
 
BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems);
@@ -237,6 +273,13 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx 
*hctx,
case REQ_OP_FLUSH:
type = VIRTIO_BLK_T_FLUSH;
break;

Re: [RFC v5 3/5] virtio_ring: add packed ring support

2018-05-28 Thread Jason Wang



On 2018-05-22 16:16, Tiwei Bie wrote:

This commit introduces support (without EVENT_IDX) for the
packed ring.

Signed-off-by: Tiwei Bie 
---
  drivers/virtio/virtio_ring.c | 474 ++-
  1 file changed, 468 insertions(+), 6 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index f5ef5f42a7cf..eb9fd5207a68 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -62,6 +62,12 @@ struct vring_desc_state {
  };
  
  struct vring_desc_state_packed {

+   void *data; /* Data for callback. */
+   struct vring_packed_desc *indir_desc; /* Indirect descriptor, if any. */
+   int num;/* Descriptor list length. */
+   dma_addr_t addr;/* Buffer DMA addr. */
+   u32 len;/* Buffer length. */
+   u16 flags;  /* Descriptor flags. */
int next;   /* The next desc state. */
  };
  
@@ -758,6 +764,72 @@ static inline unsigned vring_size_packed(unsigned int num, unsigned long align)

& ~(align - 1)) + sizeof(struct vring_packed_desc_event) * 2;
  }
  
+static void vring_unmap_state_packed(const struct vring_virtqueue *vq,

+struct vring_desc_state_packed *state)
+{
+   u16 flags;
+
+   if (!vring_use_dma_api(vq->vq.vdev))
+   return;
+
+   flags = state->flags;
+
+   if (flags & VRING_DESC_F_INDIRECT) {
+   dma_unmap_single(vring_dma_dev(vq),
+state->addr, state->len,
+(flags & VRING_DESC_F_WRITE) ?
+DMA_FROM_DEVICE : DMA_TO_DEVICE);
+   } else {
+   dma_unmap_page(vring_dma_dev(vq),
+  state->addr, state->len,
+  (flags & VRING_DESC_F_WRITE) ?
+  DMA_FROM_DEVICE : DMA_TO_DEVICE);
+   }
+}
+
+static void vring_unmap_desc_packed(const struct vring_virtqueue *vq,
+  struct vring_packed_desc *desc)
+{
+   u16 flags;
+
+   if (!vring_use_dma_api(vq->vq.vdev))
+   return;
+
+   flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);
+
+   if (flags & VRING_DESC_F_INDIRECT) {
+   dma_unmap_single(vring_dma_dev(vq),
+virtio64_to_cpu(vq->vq.vdev, desc->addr),
+virtio32_to_cpu(vq->vq.vdev, desc->len),
+(flags & VRING_DESC_F_WRITE) ?
+DMA_FROM_DEVICE : DMA_TO_DEVICE);
+   } else {
+   dma_unmap_page(vring_dma_dev(vq),
+  virtio64_to_cpu(vq->vq.vdev, desc->addr),
+  virtio32_to_cpu(vq->vq.vdev, desc->len),
+  (flags & VRING_DESC_F_WRITE) ?
+  DMA_FROM_DEVICE : DMA_TO_DEVICE);
+   }
+}
+
+static struct vring_packed_desc *alloc_indirect_packed(struct virtqueue *_vq,
+  unsigned int total_sg,
+  gfp_t gfp)
+{
+   struct vring_packed_desc *desc;
+
+   /*
+* We require lowmem mappings for the descriptors because
+* otherwise virt_to_phys will give us bogus addresses in the
+* virtqueue.
+*/
+   gfp &= ~__GFP_HIGHMEM;
+
+   desc = kmalloc(total_sg * sizeof(struct vring_packed_desc), gfp);
+
+   return desc;
+}
+
  static inline int virtqueue_add_packed(struct virtqueue *_vq,
   struct scatterlist *sgs[],
   unsigned int total_sg,
@@ -767,47 +839,437 @@ static inline int virtqueue_add_packed(struct virtqueue 
*_vq,
   void *ctx,
   gfp_t gfp)
  {
+   struct vring_virtqueue *vq = to_vvq(_vq);
+   struct vring_packed_desc *desc;
+   struct scatterlist *sg;
+   unsigned int i, n, descs_used, uninitialized_var(prev), err_idx;
+   __virtio16 uninitialized_var(head_flags), flags;
+   u16 head, avail_wrap_counter, id, cur;
+   bool indirect;
+
+   START_USE(vq);
+
+   BUG_ON(data == NULL);
+   BUG_ON(ctx && vq->indirect);
+
+   if (unlikely(vq->broken)) {
+   END_USE(vq);
+   return -EIO;
+   }
+
+#ifdef DEBUG
+   {
+   ktime_t now = ktime_get();
+
+   /* No kick or get, with .1 second between?  Warn. */
+   if (vq->last_add_time_valid)
+   WARN_ON(ktime_to_ms(ktime_sub(now, vq->last_add_time))
+   > 100);
+   vq->last_add_time = now;
+   vq->last_add_time_valid = true;
+   }
+#endif
+
+   BUG_ON(total_sg 

Re: [PATCH net-next v12 0/5] Enable virtio_net to act as a standby for a passthru device

2018-05-28 Thread David Miller
From: Sridhar Samudrala 
Date: Thu, 24 May 2018 09:55:12 -0700

> The main motivation for this patch is to enable cloud service providers
> to provide an accelerated datapath to virtio-net enabled VMs in a 
> transparent manner with no/minimal guest userspace changes. This also
> enables hypervisor controlled live migration to be supported with VMs that
> have direct attached SR-IOV VF devices.
 ...

Series applied, thank you.



Re: [RFC v5 2/5] virtio_ring: support creating packed ring

2018-05-28 Thread Jason Wang



On 2018-05-22 16:16, Tiwei Bie wrote:

This commit introduces support for creating the packed ring.
All split-ring-specific functions get a _split suffix.
Some necessary stubs for the packed ring are also added.

Signed-off-by: Tiwei Bie 
---
  drivers/virtio/virtio_ring.c | 801 +++
  include/linux/virtio_ring.h  |   8 +-
  2 files changed, 546 insertions(+), 263 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 71458f493cf8..f5ef5f42a7cf 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -61,11 +61,15 @@ struct vring_desc_state {
struct vring_desc *indir_desc;  /* Indirect descriptor, if any. */
  };
  
+struct vring_desc_state_packed {

+   int next;   /* The next desc state. */
+};
+
  struct vring_virtqueue {
struct virtqueue vq;
  
-	/* Actual memory layout for this queue */

-   struct vring vring;
+   /* Is this a packed ring? */
+   bool packed;
  
  	/* Can we use weak barriers? */

bool weak_barriers;
@@ -87,11 +91,39 @@ struct vring_virtqueue {
/* Last used index we've seen. */
u16 last_used_idx;
  
-	/* Last written value to avail->flags */

-   u16 avail_flags_shadow;
+   union {
+   /* Available for split ring */
+   struct {
+   /* Actual memory layout for this queue. */
+   struct vring vring;
  
-	/* Last written value to avail->idx in guest byte order */

-   u16 avail_idx_shadow;
+   /* Last written value to avail->flags */
+   u16 avail_flags_shadow;
+
+   /* Last written value to avail->idx in
+* guest byte order. */
+   u16 avail_idx_shadow;
+   };
+
+   /* Available for packed ring */
+   struct {
+   /* Actual memory layout for this queue. */
+   struct vring_packed vring_packed;
+
+   /* Driver ring wrap counter. */
+   u8 avail_wrap_counter;
+
+   /* Device ring wrap counter. */
+   u8 used_wrap_counter;


How about just using a boolean?


+
+   /* Index of the next avail descriptor. */
+   u16 next_avail_idx;
+
+   /* Last written value to driver->flags in
+* guest byte order. */
+   u16 event_flags_shadow;
+   };
+   };
  
  	/* How to notify other side. FIXME: commonalize hcalls! */

bool (*notify)(struct virtqueue *vq);
@@ -111,11 +143,24 @@ struct vring_virtqueue {
  #endif
  
  	/* Per-descriptor state. */

-   struct vring_desc_state desc_state[];
+   union {
+   struct vring_desc_state desc_state[1];
+   struct vring_desc_state_packed desc_state_packed[1];
+   };
  };
  
  #define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)
  
+static inline bool virtqueue_use_indirect(struct virtqueue *_vq,

+ unsigned int total_sg)
+{
+   struct vring_virtqueue *vq = to_vvq(_vq);
+
+   /* If the host supports indirect descriptor tables, and we have multiple
+* buffers, then go indirect. FIXME: tune this threshold */
+   return (vq->indirect && total_sg > 1 && vq->vq.num_free);
+}
+
  /*
   * Modern virtio devices have feature bits to specify whether they need a
   * quirk and bypass the IOMMU. If not there, just use the DMA API.
@@ -201,8 +246,17 @@ static dma_addr_t vring_map_single(const struct 
vring_virtqueue *vq,
  cpu_addr, size, direction);
  }
  
-static void vring_unmap_one(const struct vring_virtqueue *vq,

-   struct vring_desc *desc)
+static int vring_mapping_error(const struct vring_virtqueue *vq,
+  dma_addr_t addr)
+{
+   if (!vring_use_dma_api(vq->vq.vdev))
+   return 0;
+
+   return dma_mapping_error(vring_dma_dev(vq), addr);
+}
+
+static void vring_unmap_one_split(const struct vring_virtqueue *vq,
+ struct vring_desc *desc)
  {
u16 flags;
  
@@ -226,17 +280,9 @@ static void vring_unmap_one(const struct vring_virtqueue *vq,

}
  }
  
-static int vring_mapping_error(const struct vring_virtqueue *vq,

-  dma_addr_t addr)
-{
-   if (!vring_use_dma_api(vq->vq.vdev))
-   return 0;
-
-   return dma_mapping_error(vring_dma_dev(vq), addr);
-}


It looks to me that if you keep vring_mapping_error() behind
vring_unmap_one_split(), lots of these changes would be unnecessary.



-
-static struct vring_desc *alloc_indirect(struct virtqueue *_vq,
-unsigned int total_sg, gfp_t gfp)
+static struct vring_desc 

[RFC V5 PATCH 8/8] vhost: event suppression for packed ring

2018-05-28 Thread Jason Wang
This patch introduces basic support for event suppression, aka the driver
and device areas.
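
For reference, a sketch of the per-ring event suppression structure used
below (field names inferred from the driver_event/device_event accesses in
this patch; the actual uapi hunk is not quoted here):

	struct vring_packed_desc_event {
		__virtio16 off_wrap;	/* descriptor event offset / wrap counter */
		__virtio16 flags;	/* enable / disable / desc-specific */
	};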

Signed-off-by: Jason Wang 
---
 drivers/vhost/vhost.c| 191 ---
 drivers/vhost/vhost.h|  10 +-
 include/uapi/linux/virtio_ring.h |  19 
 3 files changed, 204 insertions(+), 16 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index a36e5ad2..112f680 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1112,10 +1112,15 @@ static int vq_access_ok_packed(struct vhost_virtqueue 
*vq, unsigned int num,
   struct vring_used __user *used)
 {
struct vring_desc_packed *packed = (struct vring_desc_packed *)desc;
+   struct vring_packed_desc_event *driver_event =
+   (struct vring_packed_desc_event *)avail;
+   struct vring_packed_desc_event *device_event =
+   (struct vring_packed_desc_event *)used;
 
-   /* FIXME: check device area and driver area */
return access_ok(VERIFY_READ, packed, num * sizeof(*packed)) &&
-  access_ok(VERIFY_WRITE, packed, num * sizeof(*packed));
+  access_ok(VERIFY_WRITE, packed, num * sizeof(*packed)) &&
+  access_ok(VERIFY_READ, driver_event, sizeof(*driver_event)) &&
+  access_ok(VERIFY_WRITE, device_event, sizeof(*device_event));
 }
 
 static int vq_access_ok_split(struct vhost_virtqueue *vq, unsigned int num,
@@ -1190,14 +1195,27 @@ static bool iotlb_access_ok(struct vhost_virtqueue *vq,
return true;
 }
 
-int vq_iotlb_prefetch(struct vhost_virtqueue *vq)
+int vq_iotlb_prefetch_packed(struct vhost_virtqueue *vq)
+{
+   int num = vq->num;
+
+   return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc,
+  num * sizeof(*vq->desc), VHOST_ADDR_DESC) &&
+  iotlb_access_ok(vq, VHOST_ACCESS_WO, (u64)(uintptr_t)vq->desc,
+  num * sizeof(*vq->desc), VHOST_ADDR_DESC) &&
+  iotlb_access_ok(vq, VHOST_ACCESS_RO,
+  (u64)(uintptr_t)vq->driver_event,
+  sizeof(*vq->driver_event), VHOST_ADDR_AVAIL) &&
+  iotlb_access_ok(vq, VHOST_ACCESS_WO,
+  (u64)(uintptr_t)vq->device_event,
+  sizeof(*vq->device_event), VHOST_ADDR_USED);
+}
+
+int vq_iotlb_prefetch_split(struct vhost_virtqueue *vq)
 {
size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
unsigned int num = vq->num;
 
-   if (!vq->iotlb)
-   return 1;
-
return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc,
   num * sizeof(*vq->desc), VHOST_ADDR_DESC) &&
   iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->avail,
@@ -1209,6 +1227,17 @@ int vq_iotlb_prefetch(struct vhost_virtqueue *vq)
   num * sizeof(*vq->used->ring) + s,
   VHOST_ADDR_USED);
 }
+
+int vq_iotlb_prefetch(struct vhost_virtqueue *vq)
+{
+   if (!vq->iotlb)
+   return 1;
+
+   if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+   return vq_iotlb_prefetch_packed(vq);
+   else
+   return vq_iotlb_prefetch_split(vq);
+}
 EXPORT_SYMBOL_GPL(vq_iotlb_prefetch);
 
 /* Can we log writes? */
@@ -1730,6 +1759,50 @@ static int vhost_update_used_flags(struct 
vhost_virtqueue *vq)
return 0;
 }
 
+static int vhost_update_device_flags(struct vhost_virtqueue *vq,
+__virtio16 device_flags)
+{
+   void __user *flags;
+
+   if (vhost_put_user(vq, device_flags, &vq->device_event->flags,
+  VHOST_ADDR_USED) < 0)
+   return -EFAULT;
+   if (unlikely(vq->log_used)) {
+   /* Make sure the flag is seen before log. */
+   smp_wmb();
+   /* Log used flag write. */
+   flags = &vq->device_event->flags;
+   log_write(vq->log_base, vq->log_addr +
+ (flags - (void __user *)vq->device_event),
+ sizeof(vq->device_event->flags));
+   if (vq->log_ctx)
+   eventfd_signal(vq->log_ctx, 1);
+   }
+   return 0;
+}
+
+static int vhost_update_device_off_wrap(struct vhost_virtqueue *vq,
+   __virtio16 device_off_wrap)
+{
+   void __user *off_wrap;
+
+   if (vhost_put_user(vq, device_off_wrap, &vq->device_event->off_wrap,
+  VHOST_ADDR_USED) < 0)
+   return -EFAULT;
+   if (unlikely(vq->log_used)) {
+   /* Make sure the flag is seen before log. */
+   smp_wmb();
+   /* Log used flag write. */
+   off_wrap = &vq->device_event->off_wrap;
+   log_write(vq->log_base, vq->log_addr +
+ (off_wrap - 

[RFC V5 PATCH 7/8] vhost: packed ring support

2018-05-28 Thread Jason Wang
Signed-off-by: Jason Wang 
---
 drivers/vhost/net.c   |  13 +-
 drivers/vhost/vhost.c | 585 ++
 drivers/vhost/vhost.h |  13 +-
 3 files changed, 566 insertions(+), 45 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 30273ad..4991aa4 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -71,7 +71,8 @@ enum {
VHOST_NET_FEATURES = VHOST_FEATURES |
 (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
 (1ULL << VIRTIO_NET_F_MRG_RXBUF) |
-(1ULL << VIRTIO_F_IOMMU_PLATFORM)
+(1ULL << VIRTIO_F_IOMMU_PLATFORM) |
+(1ULL << VIRTIO_F_RING_PACKED)
 };
 
 enum {
@@ -576,7 +577,7 @@ static void handle_tx(struct vhost_net *net)
nvq->upend_idx = ((unsigned)nvq->upend_idx - 1)
% UIO_MAXIOV;
}
-   vhost_discard_vq_desc(vq, 1);
+   vhost_discard_vq_desc(vq, &used, 1);
vhost_net_enable_vq(net, vq);
break;
}
@@ -714,9 +715,11 @@ static void handle_rx(struct vhost_net *net)
mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);
 
while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk))) {
+   struct vhost_used_elem *used = vq->heads + nheads;
+
sock_len += sock_hlen;
vhost_len = sock_len + vhost_hlen;
-   err = vhost_get_bufs(vq, vq->heads + nheads, vhost_len,
+   err = vhost_get_bufs(vq, used, vhost_len,
 , vq_log, ,
 likely(mergeable) ? UIO_MAXIOV : 1,
 );
@@ -762,7 +765,7 @@ static void handle_rx(struct vhost_net *net)
if (unlikely(err != sock_len)) {
pr_debug("Discarded rx packet: "
 " len %d, expected %zd\n", err, sock_len);
-   vhost_discard_vq_desc(vq, headcount);
+   vhost_discard_vq_desc(vq, used, 1);
continue;
}
/* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */
@@ -786,7 +789,7 @@ static void handle_rx(struct vhost_net *net)
copy_to_iter(_buffers, sizeof num_buffers,
 ) != sizeof num_buffers) {
vq_err(vq, "Failed num_buffers write");
-   vhost_discard_vq_desc(vq, headcount);
+   vhost_discard_vq_desc(vq, used, 1);
goto out;
}
nheads += headcount;
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 4031a8f..a36e5ad2 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -323,6 +323,9 @@ static void vhost_vq_reset(struct vhost_dev *dev,
vhost_reset_is_le(vq);
vhost_disable_cross_endian(vq);
vq->busyloop_timeout = 0;
+   vq->used_wrap_counter = true;
+   vq->last_avail_wrap_counter = true;
+   vq->avail_wrap_counter = true;
vq->umem = NULL;
vq->iotlb = NULL;
__vhost_vq_meta_reset(vq);
@@ -1103,11 +1106,22 @@ static int vhost_iotlb_miss(struct vhost_virtqueue *vq, 
u64 iova, int access)
return 0;
 }
 
-static bool vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,
-struct vring_desc __user *desc,
-struct vring_avail __user *avail,
-struct vring_used __user *used)
+static int vq_access_ok_packed(struct vhost_virtqueue *vq, unsigned int num,
+  struct vring_desc __user *desc,
+  struct vring_avail __user *avail,
+  struct vring_used __user *used)
+{
+   struct vring_desc_packed *packed = (struct vring_desc_packed *)desc;
+
+   /* FIXME: check device area and driver area */
+   return access_ok(VERIFY_READ, packed, num * sizeof(*packed)) &&
+  access_ok(VERIFY_WRITE, packed, num * sizeof(*packed));
+}
 
+static int vq_access_ok_split(struct vhost_virtqueue *vq, unsigned int num,
+ struct vring_desc __user *desc,
+ struct vring_avail __user *avail,
+ struct vring_used __user *used)
 {
size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
 
@@ -1118,6 +1132,17 @@ static bool vq_access_ok(struct vhost_virtqueue *vq, 
unsigned int num,
sizeof *used + num * sizeof *used->ring + s);
 }
 
+static int vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,
+   struct vring_desc __user *desc,
+   struct vring_avail __user *avail,
+   struct 

[RFC V5 PATCH 6/8] virtio: introduce packed ring defines

2018-05-28 Thread Jason Wang
Signed-off-by: Jason Wang 
---
 include/uapi/linux/virtio_config.h |  9 +
 include/uapi/linux/virtio_ring.h   | 13 +
 2 files changed, 22 insertions(+)

diff --git a/include/uapi/linux/virtio_config.h 
b/include/uapi/linux/virtio_config.h
index 308e209..5903d51 100644
--- a/include/uapi/linux/virtio_config.h
+++ b/include/uapi/linux/virtio_config.h
@@ -71,4 +71,13 @@
  * this is for compatibility with legacy systems.
  */
 #define VIRTIO_F_IOMMU_PLATFORM33
+
+#define VIRTIO_F_RING_PACKED   34
+
+/*
+ * This feature indicates that all buffers are used by the device in
+ * the same order in which they have been made available.
+ */
+#define VIRTIO_F_IN_ORDER  35
+
 #endif /* _UAPI_LINUX_VIRTIO_CONFIG_H */
diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h
index 6d5d5fa..e297580 100644
--- a/include/uapi/linux/virtio_ring.h
+++ b/include/uapi/linux/virtio_ring.h
@@ -43,6 +43,8 @@
 #define VRING_DESC_F_WRITE 2
 /* This means the buffer contains a list of buffer descriptors. */
 #define VRING_DESC_F_INDIRECT  4
+#define VRING_DESC_F_AVAIL  7
+#define VRING_DESC_F_USED  15
 
 /* The Host uses this in used->flags to advise the Guest: don't kick me when
  * you add a buffer.  It's unreliable, so it's simply an optimization.  Guest
@@ -62,6 +64,17 @@
  * at the end of the used ring. Guest should ignore the used->flags field. */
 #define VIRTIO_RING_F_EVENT_IDX29
 
+struct vring_desc_packed {
+   /* Buffer Address. */
+   __virtio64 addr;
+   /* Buffer Length. */
+   __virtio32 len;
+   /* Buffer ID. */
+   __virtio16 id;
+   /* The flags depending on descriptor type. */
+   __virtio16 flags;
+};
+
 /* Virtio ring descriptors: 16 bytes.  These can chain together via "next". */
 struct vring_desc {
/* Address (guest-physical). */
-- 
2.7.4



[RFC V5 PATCH 5/8] vhost: vhost_put_user() can accept metadata type

2018-05-28 Thread Jason Wang
In the past we assumed that the used ring update was the only user of
vhost_put_user(). This may not be the case for the incoming packed ring,
which may update the descriptor ring when marking buffers used. So
introduce a new type parameter.

Signed-off-by: Jason Wang 
---
 drivers/vhost/vhost.c | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index e0fcfec..4031a8f 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -811,7 +811,7 @@ static inline void __user *__vhost_get_user(struct 
vhost_virtqueue *vq,
return __vhost_get_user_slow(vq, addr, size, type);
 }
 
-#define vhost_put_user(vq, x, ptr) \
+#define vhost_put_user(vq, x, ptr, type)   \
 ({ \
int ret = -EFAULT; \
if (!vq->iotlb) { \
@@ -819,7 +819,7 @@ static inline void __user *__vhost_get_user(struct 
vhost_virtqueue *vq,
} else { \
__typeof__(ptr) to = \
(__typeof__(ptr)) __vhost_get_user(vq, ptr, \
- sizeof(*ptr), VHOST_ADDR_USED); \
+ sizeof(*ptr), type); \
if (to != NULL) \
ret = __put_user(x, to); \
else \
@@ -1683,7 +1683,7 @@ static int vhost_update_used_flags(struct vhost_virtqueue 
*vq)
 {
void __user *used;
if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
-  &vq->used->flags) < 0)
+  &vq->used->flags, VHOST_ADDR_USED) < 0)
return -EFAULT;
if (unlikely(vq->log_used)) {
/* Make sure the flag is seen before log. */
@@ -1702,7 +1702,7 @@ static int vhost_update_used_flags(struct vhost_virtqueue 
*vq)
 static int vhost_update_avail_event(struct vhost_virtqueue *vq, u16 
avail_event)
 {
if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
-  vhost_avail_event(vq)))
+  vhost_avail_event(vq), VHOST_ADDR_USED))
return -EFAULT;
if (unlikely(vq->log_used)) {
void __user *used;
@@ -2185,12 +2185,12 @@ static int __vhost_add_used_n(struct vhost_virtqueue 
*vq,
used = vq->used->ring + start;
for (i = 0; i < count; i++) {
if (unlikely(vhost_put_user(vq, heads[i].elem.id,
-   &used[i].id))) {
+   &used[i].id, VHOST_ADDR_USED))) {
vq_err(vq, "Failed to write used id");
return -EFAULT;
}
if (unlikely(vhost_put_user(vq, heads[i].elem.len,
-   &used[i].len))) {
+   &used[i].len, VHOST_ADDR_USED))) {
vq_err(vq, "Failed to write used len");
return -EFAULT;
}
@@ -2236,7 +2236,7 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, struct 
vhost_used_elem *heads,
/* Make sure buffer is written before we update index. */
smp_wmb();
if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
-  &vq->used->idx)) {
+  &vq->used->idx, VHOST_ADDR_USED)) {
vq_err(vq, "Failed to increment used idx");
return -EFAULT;
}
-- 
2.7.4



[RFC V5 PATCH 4/8] vhost_net: do not explicitly manipulate vhost_used_elem

2018-05-28 Thread Jason Wang
Two helpers for setting/getting the used length are introduced to avoid
explicitly manipulating vhost_used_elem in the zerocopy code. This helps
hide the used_elem internals and simplify the packed ring implementation.

Signed-off-by: Jason Wang 
---
 drivers/vhost/net.c   | 11 +--
 drivers/vhost/vhost.c | 12 ++--
 drivers/vhost/vhost.h |  5 +
 3 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 3826f1f..30273ad 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -341,9 +341,10 @@ static void vhost_zerocopy_signal_used(struct vhost_net 
*net,
int j = 0;
 
for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
-   if (vq->heads[i].elem.len == VHOST_DMA_FAILED_LEN)
+   if (vhost_get_used_len(vq, &vq->heads[i]) ==
+   VHOST_DMA_FAILED_LEN)
vhost_net_tx_err(net);
-   if (VHOST_DMA_IS_DONE(vq->heads[i].elem.len)) {
+   if (VHOST_DMA_IS_DONE(vhost_get_used_len(vq, &vq->heads[i]))) {
vq->heads[i].elem.len = VHOST_DMA_CLEAR_LEN;
++j;
} else
@@ -542,10 +543,8 @@ static void handle_tx(struct vhost_net *net)
struct ubuf_info *ubuf;
ubuf = nvq->ubuf_info + nvq->upend_idx;
 
-   vq->heads[nvq->upend_idx].elem.id =
-   cpu_to_vhost32(vq, used.elem.id);
-   vq->heads[nvq->upend_idx].elem.len =
-   VHOST_DMA_IN_PROGRESS;
+   vhost_set_used_len(vq, &used, VHOST_DMA_IN_PROGRESS);
+   vq->heads[nvq->upend_idx] = used;
ubuf->callback = vhost_zerocopy_callback;
ubuf->ctx = nvq->ubufs;
ubuf->desc = nvq->upend_idx;
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 2b2a776..e0fcfec 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2067,11 +2067,19 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 }
 EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
 
-static void vhost_set_used_len(struct vhost_virtqueue *vq,
-  struct vhost_used_elem *used, int len)
+void vhost_set_used_len(struct vhost_virtqueue *vq,
+   struct vhost_used_elem *used, int len)
 {
used->elem.len = cpu_to_vhost32(vq, len);
 }
+EXPORT_SYMBOL_GPL(vhost_set_used_len);
+
+int vhost_get_used_len(struct vhost_virtqueue *vq,
+  struct vhost_used_elem *used)
+{
+   return vhost32_to_cpu(vq, used->elem.len);
+}
+EXPORT_SYMBOL_GPL(vhost_get_used_len);
 
 /* This is a multi-buffer version of vhost_get_desc, that works if
  * vq has read descriptors only.
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 8dea44b..604821b 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -198,6 +198,11 @@ int vhost_get_bufs(struct vhost_virtqueue *vq,
   unsigned *log_num,
   unsigned int quota,
   s16 *count);
+void vhost_set_used_len(struct vhost_virtqueue *vq,
+   struct vhost_used_elem *used,
+   int len);
+int vhost_get_used_len(struct vhost_virtqueue *vq,
+  struct vhost_used_elem *used);
 void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
 
 int vhost_vq_init_access(struct vhost_virtqueue *);
-- 
2.7.4



[RFC V5 PATCH 3/8] vhost: do not use vring_used_elem

2018-05-28 Thread Jason Wang
Instead of depending on the exported vring_used_elem, this patch switches
to a new internal structure, vhost_used_elem, which embeds vring_used_elem.
This lets vhost record extra metadata for the incoming packed ring layout.
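
For reference, a minimal sketch of the new structure (its definition is not
in the hunks quoted below; the layout is inferred from the .elem accesses):

	struct vhost_used_elem {
		struct vring_used_elem elem;
		/* room for packed-ring metadata added by later patches */
	};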

Signed-off-by: Jason Wang 
---
 drivers/vhost/net.c   | 19 +++---
 drivers/vhost/scsi.c  | 10 
 drivers/vhost/vhost.c | 68 ---
 drivers/vhost/vhost.h | 18 --
 drivers/vhost/vsock.c |  6 ++---
 5 files changed, 45 insertions(+), 76 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 826489c..3826f1f 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -341,10 +341,10 @@ static void vhost_zerocopy_signal_used(struct vhost_net 
*net,
int j = 0;
 
for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
-   if (vq->heads[i].len == VHOST_DMA_FAILED_LEN)
+   if (vq->heads[i].elem.len == VHOST_DMA_FAILED_LEN)
vhost_net_tx_err(net);
-   if (VHOST_DMA_IS_DONE(vq->heads[i].len)) {
-   vq->heads[i].len = VHOST_DMA_CLEAR_LEN;
+   if (VHOST_DMA_IS_DONE(vq->heads[i].elem.len)) {
+   vq->heads[i].elem.len = VHOST_DMA_CLEAR_LEN;
++j;
} else
break;
@@ -367,7 +367,7 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, 
bool success)
rcu_read_lock_bh();
 
/* set len to mark this desc buffers done DMA */
-   vq->heads[ubuf->desc].len = success ?
+   vq->heads[ubuf->desc].elem.len = success ?
VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN;
cnt = vhost_net_ubuf_put(ubufs);
 
@@ -426,7 +426,7 @@ static int vhost_net_enable_vq(struct vhost_net *n,
 
 static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
struct vhost_virtqueue *vq,
-   struct vring_used_elem *used_elem,
+   struct vhost_used_elem *used_elem,
struct iovec iov[], unsigned int iov_size,
unsigned int *out_num, unsigned int *in_num)
 {
@@ -477,7 +477,7 @@ static void handle_tx(struct vhost_net *net)
size_t hdr_size;
struct socket *sock;
struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
-   struct vring_used_elem used;
+   struct vhost_used_elem used;
bool zcopy, zcopy_used;
int sent_pkts = 0;
 
@@ -542,9 +542,10 @@ static void handle_tx(struct vhost_net *net)
struct ubuf_info *ubuf;
ubuf = nvq->ubuf_info + nvq->upend_idx;
 
-   vq->heads[nvq->upend_idx].id =
-   cpu_to_vhost32(vq, used.id);
-   vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
+   vq->heads[nvq->upend_idx].elem.id =
+   cpu_to_vhost32(vq, used.elem.id);
+   vq->heads[nvq->upend_idx].elem.len =
+   VHOST_DMA_IN_PROGRESS;
ubuf->callback = vhost_zerocopy_callback;
ubuf->ctx = nvq->ubufs;
ubuf->desc = nvq->upend_idx;
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 654c71f..ac11412 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -67,7 +67,7 @@ struct vhost_scsi_inflight {
 
 struct vhost_scsi_cmd {
/* Descriptor from vhost_get_vq_desc() for virt_queue segment */
-   struct vring_used_elem tvc_vq_used;
+   struct vhost_used_elem tvc_vq_used;
/* virtio-scsi initiator task attribute */
int tvc_task_attr;
/* virtio-scsi response incoming iovecs */
@@ -441,7 +441,7 @@ vhost_scsi_do_evt_work(struct vhost_scsi *vs, struct 
vhost_scsi_evt *evt)
struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq;
struct virtio_scsi_event *event = &evt->event;
struct virtio_scsi_event __user *eventp;
-   struct vring_used_elem used;
+   struct vhost_used_elem used;
unsigned out, in;
int ret;
 
@@ -785,7 +785,7 @@ static void vhost_scsi_submission_work(struct work_struct 
*work)
 static void
 vhost_scsi_send_bad_target(struct vhost_scsi *vs,
   struct vhost_virtqueue *vq,
-  struct vring_used_elem *used, unsigned out)
+  struct vhost_used_elem *used, unsigned out)
 {
struct virtio_scsi_cmd_resp __user *resp;
struct virtio_scsi_cmd_resp rsp;
@@ -808,7 +808,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct 
vhost_virtqueue *vq)
struct virtio_scsi_cmd_req v_req;
struct virtio_scsi_cmd_req_pi v_req_pi;
struct vhost_scsi_cmd *cmd;
-   struct vring_used_elem used;
+   struct 

[RFC V5 PATCH 2/8] vhost: hide used ring layout from device

2018-05-28 Thread Jason Wang
We used to return the descriptor head from vhost_get_vq_desc() to the
device and pass it back to vhost_add_used() and its friends. This exposes
the internal used ring layout to the device, which makes it hard to extend
for e.g. the packed ring layout.

So this patch tries to hide the used ring layout by

- letting vhost_get_vq_desc() return pointer to struct vring_used_elem
- accepting pointer to struct vring_used_elem in vhost_add_used() and
  vhost_add_used_and_signal()

This could help to hide used ring layout and make it easier to
implement packed ring on top.
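
A rough sketch of the resulting device-side flow (simplified; error handling
trimmed, signatures as described above):

	struct vring_used_elem used;
	unsigned out, in;
	int err;

	err = vhost_get_vq_desc(vq, &used, vq->iov, ARRAY_SIZE(vq->iov),
				&out, &in, NULL, NULL);
	if (err == -ENOSPC)	/* ring empty: wait for the next kick */
		return;
	if (err < 0)		/* error: stop until the next kick */
		return;
	/* ... consume the buffer described by vq->iov ... */
	vhost_add_used_and_signal(&net->dev, vq, &used, 0);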

Signed-off-by: Jason Wang 
---
 drivers/vhost/net.c   | 46 +-
 drivers/vhost/scsi.c  | 62 +++
 drivers/vhost/vhost.c | 52 +-
 drivers/vhost/vhost.h |  9 +---
 drivers/vhost/vsock.c | 42 +-
 5 files changed, 112 insertions(+), 99 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 762aa81..826489c 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -426,22 +426,24 @@ static int vhost_net_enable_vq(struct vhost_net *n,
 
 static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
struct vhost_virtqueue *vq,
+   struct vring_used_elem *used_elem,
struct iovec iov[], unsigned int iov_size,
unsigned int *out_num, unsigned int *in_num)
 {
unsigned long uninitialized_var(endtime);
-   int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
+   int r = vhost_get_vq_desc(vq, used_elem, vq->iov, ARRAY_SIZE(vq->iov),
  out_num, in_num, NULL, NULL);
 
-   if (r == vq->num && vq->busyloop_timeout) {
+   if (r == -ENOSPC && vq->busyloop_timeout) {
preempt_disable();
endtime = busy_clock() + vq->busyloop_timeout;
while (vhost_can_busy_poll(vq->dev, endtime) &&
   vhost_vq_avail_empty(vq->dev, vq))
cpu_relax();
preempt_enable();
-   r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
- out_num, in_num, NULL, NULL);
+   r = vhost_get_vq_desc(vq, used_elem, vq->iov,
+ ARRAY_SIZE(vq->iov), out_num, in_num,
+ NULL, NULL);
}
 
return r;
@@ -463,7 +465,6 @@ static void handle_tx(struct vhost_net *net)
struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
struct vhost_virtqueue *vq = &nvq->vq;
unsigned out, in;
-   int head;
struct msghdr msg = {
.msg_name = NULL,
.msg_namelen = 0,
@@ -476,6 +477,7 @@ static void handle_tx(struct vhost_net *net)
size_t hdr_size;
struct socket *sock;
struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
+   struct vring_used_elem used;
bool zcopy, zcopy_used;
int sent_pkts = 0;
 
@@ -499,20 +501,20 @@ static void handle_tx(struct vhost_net *net)
vhost_zerocopy_signal_used(net, vq);
 
 
-   head = vhost_net_tx_get_vq_desc(net, vq, vq->iov,
-   ARRAY_SIZE(vq->iov),
-   &out, &in);
-   /* On error, stop handling until the next kick. */
-   if (unlikely(head < 0))
-   break;
+   err = vhost_net_tx_get_vq_desc(net, vq, &used, vq->iov,
+  ARRAY_SIZE(vq->iov),
+  &out, &in);
/* Nothing new?  Wait for eventfd to tell us they refilled. */
-   if (head == vq->num) {
+   if (err == -ENOSPC) {
if (unlikely(vhost_enable_notify(>dev, vq))) {
vhost_disable_notify(>dev, vq);
continue;
}
break;
}
+   /* On error, stop handling until the next kick. */
+   if (unlikely(err < 0))
+   break;
if (in) {
vq_err(vq, "Unexpected descriptor format for TX: "
   "out %d, int %d\n", out, in);
@@ -540,7 +542,8 @@ static void handle_tx(struct vhost_net *net)
struct ubuf_info *ubuf;
ubuf = nvq->ubuf_info + nvq->upend_idx;
 
-   vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head);
+   vq->heads[nvq->upend_idx].id =
+   cpu_to_vhost32(vq, used.id);
vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
ubuf->callback = 

[RFC V5 PATCH 1/8] vhost: move get_rx_bufs to vhost.c

2018-05-28 Thread Jason Wang
Move get_rx_bufs() to vhost.c and rename it to
vhost_get_bufs(). This helps hide the vring internal layout from
specific device implementations. The packed ring implementation will
benefit from this.

Signed-off-by: Jason Wang 
---
 drivers/vhost/net.c   | 83 ++-
 drivers/vhost/vhost.c | 78 +++
 drivers/vhost/vhost.h |  7 +
 3 files changed, 88 insertions(+), 80 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 986058a..762aa81 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -664,83 +664,6 @@ static int vhost_net_rx_peek_head_len(struct vhost_net 
*net, struct sock *sk)
return len;
 }
 
-/* This is a multi-buffer version of vhost_get_desc, that works if
- * vq has read descriptors only.
- * @vq - the relevant virtqueue
- * @datalen- data length we'll be reading
- * @iovcount   - returned count of io vectors we fill
- * @log- vhost log
- * @log_num- log offset
- * @quota   - headcount quota, 1 for big buffer
- * returns number of buffer heads allocated, negative on error
- */
-static int get_rx_bufs(struct vhost_virtqueue *vq,
-  struct vring_used_elem *heads,
-  int datalen,
-  unsigned *iovcount,
-  struct vhost_log *log,
-  unsigned *log_num,
-  unsigned int quota)
-{
-   unsigned int out, in;
-   int seg = 0;
-   int headcount = 0;
-   unsigned d;
-   int r, nlogs = 0;
-   /* len is always initialized before use since we are always called with
-* datalen > 0.
-*/
-   u32 uninitialized_var(len);
-
-   while (datalen > 0 && headcount < quota) {
-   if (unlikely(seg >= UIO_MAXIOV)) {
-   r = -ENOBUFS;
-   goto err;
-   }
-   r = vhost_get_vq_desc(vq, vq->iov + seg,
- ARRAY_SIZE(vq->iov) - seg, &out,
- &in, log, log_num);
-   if (unlikely(r < 0))
-   goto err;
-
-   d = r;
-   if (d == vq->num) {
-   r = 0;
-   goto err;
-   }
-   if (unlikely(out || in <= 0)) {
-   vq_err(vq, "unexpected descriptor format for RX: "
-   "out %d, in %d\n", out, in);
-   r = -EINVAL;
-   goto err;
-   }
-   if (unlikely(log)) {
-   nlogs += *log_num;
-   log += *log_num;
-   }
-   heads[headcount].id = cpu_to_vhost32(vq, d);
-   len = iov_length(vq->iov + seg, in);
-   heads[headcount].len = cpu_to_vhost32(vq, len);
-   datalen -= len;
-   ++headcount;
-   seg += in;
-   }
-   heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen);
-   *iovcount = seg;
-   if (unlikely(log))
-   *log_num = nlogs;
-
-   /* Detect overrun */
-   if (unlikely(datalen > 0)) {
-   r = UIO_MAXIOV + 1;
-   goto err;
-   }
-   return headcount;
-err:
-   vhost_discard_vq_desc(vq, headcount);
-   return r;
-}
-
 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of RCU. */
 static void handle_rx(struct vhost_net *net)
@@ -790,9 +713,9 @@ static void handle_rx(struct vhost_net *net)
while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk))) {
sock_len += sock_hlen;
vhost_len = sock_len + vhost_hlen;
-   headcount = get_rx_bufs(vq, vq->heads + nheads, vhost_len,
-   &in, vq_log, &log,
-   likely(mergeable) ? UIO_MAXIOV : 1);
+   headcount = vhost_get_bufs(vq, vq->heads + nheads, vhost_len,
+  &in, vq_log, &log,
+  likely(mergeable) ? UIO_MAXIOV : 1);
/* On error, stop handling until the next kick. */
if (unlikely(headcount < 0))
goto out;
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index f0be5f3..096a688 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2100,6 +2100,84 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 }
 EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
 
+/* This is a multi-buffer version of vhost_get_desc, that works if
+ * vq has read descriptors only.
+ * @vq - the relevant virtqueue
+ * @datalen- data length we'll be reading
+ * @iovcount   - returned count of io vectors we fill
+ * @log- vhost log
+ * @log_num- log offset
+ * @quota   - headcount 

[RFC V5 PATCH 0/8] Packed ring layout for vhost

2018-05-28 Thread Jason Wang
Hi all:

This RFC implements the packed ring layout. The code was tested with
Tiwei's RFC V5 at https://lkml.org/lkml/2018/5/22/138. Some fixups and
tweaks were needed on top of Tiwei's code to make it run with event
index.

Pktgen reports about a 20% improvement in TX PPS when doing pktgen from
guest to host. No obvious improvement in RX PPS. We can do lots of
optimizations on top, but to keep things simple and correct first, this
version does not do much.

This version were tested with:

- Zerocopy (Out of Order) support
- vIOMMU support
- mergeable buffer on/off
- busy polling on/off

Notes for tester:

- Starting from this version, vhost needs qemu co-operation to work
  correctly. Alternatively, you can comment out the packed-specific code
  for GET/SET_VRING_BASE.

- Changes from V4:
- fix signalled_used index recording
- track avail index correctly
- various minor fixes

Changes from V3:
- Fix math on event idx checking
- Sync last avail wrap counter through GET/SET_VRING_BASE
- remove desc_event prefix in the driver/device structure

Changes from V2:
- do not use & in checking desc_event_flags
- off should be most significant bit
- remove the workaround of mergeable buffer for dpdk prototype
- id should be in the last descriptor in the chain
- keep _F_WRITE for write descriptor when adding used
- device flags updating should use ADDR_USED type
- return error on unexpected unavail descriptor in a chain
- return false in vhost_vq_avail_empty if a descriptor is available
- track last seen avail_wrap_counter
- correctly examine available descriptor in get_indirect_packed()
- vhost_idx_diff should return u16 instead of bool

Changes from V1:

- Refactor vhost used elem code to avoid open coding on used elem
- Event suppression support (compile test only).
- Indirect descriptor support (compile test only).
- Zerocopy support.
- vIOMMU support.
- SCSI/VSOCK support (compile test only).
- Fix several bugs

Jason Wang (8):
  vhost: move get_rx_bufs to vhost.c
  vhost: hide used ring layout from device
  vhost: do not use vring_used_elem
  vhost_net: do not explicitly manipulate vhost_used_elem
  vhost: vhost_put_user() can accept metadata type
  virtio: introduce packed ring defines
  vhost: packed ring support
  vhost: event suppression for packed ring

 drivers/vhost/net.c| 144 ++
 drivers/vhost/scsi.c   |  62 +--
 drivers/vhost/vhost.c  | 926 +
 drivers/vhost/vhost.h  |  52 ++-
 drivers/vhost/vsock.c  |  42 +-
 include/uapi/linux/virtio_config.h |   9 +
 include/uapi/linux/virtio_ring.h   |  32 ++
 7 files changed, 1000 insertions(+), 267 deletions(-)

-- 
2.7.4



[PATCH v4] virtio_blk: add DISCARD and WRITE ZEROES commands support

2018-05-28 Thread Changpeng Liu
The existing virtio-blk protocol doesn't support DISCARD/WRITE ZEROES
commands, which hurts performance when using an SSD backend over
file systems.

Commit 88c85538 "virtio-blk: add discard and write zeroes features to
specification" (see https://github.com/oasis-tcs/virtio-spec) extended the
existing virtio-blk protocol with DISCARD and WRITE ZEROES command
support.

A 16-byte descriptor is used to describe one segment of a DISCARD or
WRITE ZEROES command; each command may contain one or more descriptors.

The following data structure shows the definition of one descriptor:

struct virtio_blk_discard_write_zeroes {
le64 sector;
le32 num_sectors;
le32 unmap;
};

Field 'sector' is the start sector for DISCARD and WRITE ZEROES, field
'num_sectors' is the number of sectors for DISCARD and WRITE ZEROES, and
if both DISCARD and WRITE ZEROES are supported, field 'unmap' may be used
for a WRITE ZEROES command with DISCARD enabled.

We also extended the virtio-blk configuration space so that the backend
device can report its DISCARD and WRITE ZEROES configuration parameters.

struct virtio_blk_config {
[...]

le32 max_discard_sectors;
le32 max_discard_seg;
le32 discard_sector_alignment;
le32 max_write_zeroes_sectors;
le32 max_write_zeroes_seg;
u8 write_zeroes_may_unmap;
}

New feature bit [VIRTIO_BLK_F_DISCARD (13)]: the device supports the
discard command; the maximum discard sector count is reported in field
'max_discard_sectors' and the maximum number of discard segments in field
'max_discard_seg'.

New feature bit [VIRTIO_BLK_F_WRITE_ZEROES (14)]: the device supports the
write zeroes command; the maximum write zeroes sector count is reported in
field 'max_write_zeroes_sectors' and the maximum number of write zeroes
segments in field 'max_write_zeroes_seg'.

The device configuration space fields 'max_discard_sectors' and
'discard_sector_alignment' are expressed in 512-byte units if the
VIRTIO_BLK_F_DISCARD feature bit is negotiated. The field
'max_write_zeroes_sectors' is expressed in 512-byte units if the
VIRTIO_BLK_F_WRITE_ZEROES feature bit is negotiated.

Signed-off-by: Changpeng Liu 
---
CHANGELOG:
v4: several optimizations based on MST's comments, remove bit field usage for
command descriptor.
v3: define the virtio-blk protocol to add discard and write zeroes support,
first version implementation based on proposed specification.
v2: add write zeroes command support.
v1: initial proposal implementation for discard command.
---
 drivers/block/virtio_blk.c      | 92 +++--
 include/uapi/linux/virtio_blk.h | 43 +++
 2 files changed, 132 insertions(+), 3 deletions(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 4a07593c..a250fcc 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -172,10 +172,45 @@ static int virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr,
return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
 }
 
+static inline int virtblk_setup_discard_write_zeroes(struct request *req,
+   bool unmap)
+{
+   unsigned short segments = blk_rq_nr_discard_segments(req);
+   unsigned short n = 0;
+   struct virtio_blk_discard_write_zeroes *range;
+   struct bio *bio;
+
+   range = kmalloc_array(segments, sizeof(*range), GFP_KERNEL);
+   if (!range)
+   return -ENOMEM;
+
+   __rq_for_each_bio(bio, req) {
+   u64 sector = bio->bi_iter.bi_sector;
+   u32 num_sectors = bio->bi_iter.bi_size >> 9;
+
+   range[n].unmap = cpu_to_le32(unmap);
+   range[n].num_sectors = cpu_to_le32(num_sectors);
+   range[n].sector = cpu_to_le64(sector);
+   n++;
+   }
+
+   req->special_vec.bv_page = virt_to_page(range);
+   req->special_vec.bv_offset = offset_in_page(range);
+   req->special_vec.bv_len = sizeof(*range) * segments;
+   req->rq_flags |= RQF_SPECIAL_PAYLOAD;
+
+   return 0;
+}
+
 static inline void virtblk_request_done(struct request *req)
 {
struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
 
+   if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
+   kfree(page_address(req->special_vec.bv_page) +
+ req->special_vec.bv_offset);
+   }
+
switch (req_op(req)) {
case REQ_OP_SCSI_IN:
case REQ_OP_SCSI_OUT:
@@ -225,6 +260,7 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
int qid = hctx->queue_num;
int err;
bool notify = false;
+   bool unmap = false;
u32 type;
 
BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems);
@@ -237,6 +273,13 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
case REQ_OP_FLUSH:
type = VIRTIO_BLK_T_FLUSH;
break;
+   case REQ_OP_DISCARD:
+   

Re: [RFC V2] virtio: Add platform specific DMA API translation for virtio devices

2018-05-28 Thread Benjamin Herrenschmidt
On Tue, 2018-05-29 at 09:48 +1000, Benjamin Herrenschmidt wrote:
> > Well it's not supposed to be much slower for the static case.
> > 
> > vhost has a cache so should be fine.
> > 
> > A while ago Paolo implemented a translation cache which should be
> > perfect for this case - most of the code got merged but
> > never enabled because of stability issues.
> > 
> > If all else fails, we could teach QEMU to handle the no-iommu case
> > as if VIRTIO_F_IOMMU_PLATFORM was off.
> 
> Any serious reason not to just take that 2-line patch allowing our
> arch code to force virtio to use the DMA API?
> 
> It's not particularly invasive and solves our problem rather nicely
> without adding overhead, or knowledge they don't need, to
> qemu/libvirt/management tools, etc.
> 
> The guest knows it's going secure so the guest arch code can do the
> right thing rather trivially.
> 
> Long term we should probably make virtio always use the DMA API anyway,
> and interpose "1:1" dma_ops for the traditional virtio case; that would
> reduce code clutter significantly. In that case, it would become just a
> matter of having a platform hook to override the dma_ops used.

To elaborate a bit:

What we are trying to solve here is entirely a guest problem; I don't
think involving qemu in the solution is the right thing to do.

The guest can only allow external parties (qemu, potentially PCI
devices, etc...) access to some restricted portions of memory (insecure
memory). Thus the guest needs to do some bounce buffering/swiotlb-type
tricks.

This is completely orthogonal to whether there is an actual iommu
between the guest and the device (or emulated device/virtio).

This is why I think the solution should reside in the guest kernel, by
proper manipulation (by the arch code) of the dma ops.

I don't think that forcing the addition of an emulated iommu in the
middle, just to work around the fact that virtio "cheats" and doesn't use
the DMA API unless there is one, is the right "fix".

The right long-term fix is to always use the DMA API, reducing code
paths etc., and just have a single point where virtio can "choose"
alternate DMA ops (via an arch hook to deal with our case).
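
Purely as an illustration of that idea (this is not part of any posted
patch; the struct dma_map_ops fields are those of the kernel around that
time, and the function names here are invented), such interposed "1:1"
dma_ops could look roughly like:

#include <linux/dma-mapping.h>
#include <asm/io.h>

/* Legacy virtio hands the device guest physical addresses, so an
 * identity mapping is all that is needed here.
 */
static dma_addr_t virtio_direct_map_page(struct device *dev, struct page *page,
					 unsigned long offset, size_t size,
					 enum dma_data_direction dir,
					 unsigned long attrs)
{
	return page_to_phys(page) + offset;
}

static void virtio_direct_unmap_page(struct device *dev, dma_addr_t addr,
				     size_t size, enum dma_data_direction dir,
				     unsigned long attrs)
{
	/* Nothing to undo for an identity mapping. */
}

static const struct dma_map_ops virtio_direct_dma_ops = {
	.map_page	= virtio_direct_map_page,
	.unmap_page	= virtio_direct_unmap_page,
};

The arch hook would then simply decide, per device, whether to install
these ops or the platform's real IOMMU-backed ops.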

In the meantime, having the hook we propose gets us going, but if you
agree with the approach, we should also work on the long term approach.

Cheers,
Ben.


Re: [RFC V2] virtio: Add platform specific DMA API translation for virtio devices

2018-05-28 Thread Benjamin Herrenschmidt
On Fri, 2018-05-25 at 20:45 +0300, Michael S. Tsirkin wrote:
> On Thu, May 24, 2018 at 08:27:04AM +1000, Benjamin Herrenschmidt wrote:
> > On Wed, 2018-05-23 at 21:50 +0300, Michael S. Tsirkin wrote:
> > 
> > > I re-read that discussion and I'm still unclear on the
> > > original question, since I got several apparently
> > > conflicting answers.
> > > 
> > > I asked:
> > > 
> > >   Why isn't setting VIRTIO_F_IOMMU_PLATFORM on the
> > >   hypervisor side sufficient?
> > 
> > I thought I had replied to this...
> > 
> > There are a couple of reasons:
> > 
> > - First qemu doesn't know that the guest will switch to "secure mode"
> > in advance. There is no difference between a normal and a secure
> > partition until the partition does the magic UV call to "enter secure
> > mode" and qemu doesn't see any of it. So who can set the flag here ?
> 
> Not sure I understand. Just set the flag e.g. on qemu command line.
> I might be wrong, but these secure mode things usually
> a. require hypervisor side tricks anyway

The way our secure mode architecture is designed, there doesn't need to
be any knowledge at the qemu level whatsoever at this point. Well at least
until we do migration but that's a different kettle of fish. In any
case, the guest starts normally (which means as a non-secure guest, and
thus expects normal virtio, our FW today doesn't handle
VIRTIO_F_IOMMU_PLATFORM, though granted, we can fix this), and later
that guest issues some special Ultravisor call that turns it into a
secure guest.

There is some involvement of the hypervisor, but not qemu at this
stage. We would very much like to avoid that, as it would be a hassle
for users to have to use different libvirt options etc... because the guest
might turn itself into a secure VM.

> > - Second, when using VIRTIO_F_IOMMU_PLATFORM, we also make qemu (or
> > vhost) go through the emulated MMIO for every access to the guest,
> > which adds additional overhead.
> > 
> > Cheers,
> > Ben.
> 
> Well it's not supposed to be much slower for the static case.
> 
> vhost has a cache so should be fine.
> 
> A while ago Paolo implemented a translation cache which should be
> perfect for this case - most of the code got merged but
> never enabled because of stability issues.
> 
> If all else fails, we could teach QEMU to handle the no-iommu case
> as if VIRTIO_F_IOMMU_PLATFORM was off.

Any serious reason not to just take that 2-line patch allowing our
arch code to force virtio to use the DMA API?

It's not particularly invasive and solves our problem rather nicely
without adding overhead, or knowledge they don't need, to
qemu/libvirt/management tools, etc.

The guest knows it's going secure so the guest arch code can do the
right thing rather trivially.

Long term we should probably make virtio always use the DMA API anyway,
and interpose "1:1" dma_ops for the traditional virtio case; that would
reduce code clutter significantly. In that case, it would become just a
matter of having a platform hook to override the dma_ops used.

Cheers,
Ben.

> 
> 
> > > 
> > > 
> > > >  arch/powerpc/include/asm/dma-mapping.h |  6 ++
> > > >  arch/powerpc/platforms/pseries/iommu.c | 11 +++
> > > >  drivers/virtio/virtio_ring.c   | 10 ++
> > > >  3 files changed, 27 insertions(+)
> > > > 
> > > > diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h
> > > > index 8fa3945..056e578 100644
> > > > --- a/arch/powerpc/include/asm/dma-mapping.h
> > > > +++ b/arch/powerpc/include/asm/dma-mapping.h
> > > > @@ -115,4 +115,10 @@ extern u64 __dma_get_required_mask(struct device *dev);
> > > >  #define ARCH_HAS_DMA_MMAP_COHERENT
> > > >  
> > > >  #endif /* __KERNEL__ */
> > > > +
> > > > +#define platform_forces_virtio_dma platform_forces_virtio_dma
> > > > +
> > > > +struct virtio_device;
> > > > +
> > > > +extern bool platform_forces_virtio_dma(struct virtio_device *vdev);
> > > >  #endif /* _ASM_DMA_MAPPING_H */
> > > > diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> > > > index 06f0296..a2ec15a 100644
> > > > --- a/arch/powerpc/platforms/pseries/iommu.c
> > > > +++ b/arch/powerpc/platforms/pseries/iommu.c
> > > > @@ -38,6 +38,7 @@
> > > >  #include 
> > > >  #include 
> > > >  #include 
> > > > +#include 
> > > >  #include 
> > > >  #include 
> > > >  #include 
> > > > @@ -1396,3 +1397,13 @@ static int __init disable_multitce(char *str)
> > > >  __setup("multitce=", disable_multitce);
> > > >  
> > > >  machine_subsys_initcall_sync(pseries, tce_iommu_bus_notifier_init);
> > > > +
> > > > +bool platform_forces_virtio_dma(struct virtio_device *vdev)
> > > > +{
> > > > +   /*
> > > > +* On protected guest platforms, force virtio core to use DMA
> > > > +* MAP API for all virtio devices. But there can also be some
> > > > +* exceptions for individual devices like virtio balloon.
> > > > +*/
> > > > +
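
For context, a hook like platform_forces_virtio_dma() would be consulted
where virtio decides whether to go through the DMA API. A rough,
illustrative sketch of that decision point (the drivers/virtio/virtio_ring.c
hunk of this RFC is not quoted above, so this is an assumption, not the
actual patch; non-powerpc architectures would also need a default stub
returning false):

#include <linux/virtio.h>
#include <linux/virtio_config.h>

static bool vring_use_dma_api(struct virtio_device *vdev)
{
	/* Arch hook proposed by this RFC: e.g. a protected/secure guest can
	 * force the DMA API even without VIRTIO_F_IOMMU_PLATFORM.
	 */
	if (platform_forces_virtio_dma(vdev))
		return true;

	/* VIRTIO_F_IOMMU_PLATFORM negotiated: addresses must be translated. */
	if (!virtio_has_iommu_quirk(vdev))
		return true;

	/* Legacy behaviour: hand the device guest physical addresses. */
	return false;
}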