Re: [RFC v5 3/5] virtio_ring: add packed ring support
On Tue, May 29, 2018 at 11:18:57AM +0800, Jason Wang wrote:
> On 2018年05月22日 16:16, Tiwei Bie wrote:
[...]
> > +static void detach_buf_packed(struct vring_virtqueue *vq,
> > +			      unsigned int id, void **ctx)
> > +{
> > +	struct vring_desc_state_packed *state;
> > +	struct vring_packed_desc *desc;
> > +	unsigned int i;
> > +	int *next;
> > +
> > +	/* Clear data ptr. */
> > +	vq->desc_state_packed[id].data = NULL;
> > +
> > +	next = &id;
> > +	for (i = 0; i < vq->desc_state_packed[id].num; i++) {
> > +		state = &vq->desc_state_packed[*next];
> > +		vring_unmap_state_packed(vq, state);
> > +		next = &vq->desc_state_packed[*next].next;
>
> Had a short discussion with Michael. It looks like the only things we
> care about are the DMA address and length here.

The length tracked by desc_state_packed[] is also necessary
when INDIRECT is used and the buffers are writable.

> So a thought is to avoid using desc_state_packed[] if
> vring_use_dma_api() is false? Probably need another id allocator or
> just use desc_state_packed for the free id list.

I think it's a good suggestion. Thanks!

I won't track the addr/len/flags for non-indirect descs
when vring_use_dma_api() returns false.

> > +	}
> > +
> > +	vq->vq.num_free += vq->desc_state_packed[id].num;
> > +	*next = vq->free_head;
>
> Using a pointer seems not elegant and unnecessary here. You can just
> track next as an integer I think?

It's possible to use an integer, but we will need one more
var to track `prev` (desc_state_packed[prev].next == next
in this case).

> > +	vq->free_head = id;
> > +
> > +	if (vq->indirect) {
> > +		u32 len;
> > +
> > +		/* Free the indirect table, if any, now that it's unmapped. */
> > +		desc = vq->desc_state_packed[id].indir_desc;
> > +		if (!desc)
> > +			return;
> > +
> > +		len = vq->desc_state_packed[id].len;
> > +		for (i = 0; i < len / sizeof(struct vring_packed_desc); i++)
> > +			vring_unmap_desc_packed(vq, &desc[i]);
> > +
> > +		kfree(desc);
> > +		vq->desc_state_packed[id].indir_desc = NULL;
> > +	} else if (ctx) {
> > +		*ctx = vq->desc_state_packed[id].indir_desc;
> > +	}
> > +}
> >
> >  static inline bool more_used_packed(const struct vring_virtqueue *vq)
> >  {
> > -	return false;
> > +	u16 last_used, flags;
> > +	u8 avail, used;
> > +
> > +	last_used = vq->last_used_idx;
> > +	flags = virtio16_to_cpu(vq->vq.vdev,
> > +			vq->vring_packed.desc[last_used].flags);
> > +	avail = !!(flags & VRING_DESC_F_AVAIL(1));
> > +	used = !!(flags & VRING_DESC_F_USED(1));
> > +
> > +	return avail == used && used == vq->used_wrap_counter;
>
> The spec does not check avail == used? So there's probably a bug in
> either of the two places.
>
> Under what condition can avail != used but
> used == vq->used_wrap_counter? A buggy device, or a device that sets
> the two bits in two writes?

Currently, the spec doesn't forbid devices from setting the status
bits such that avail != used but used == vq->used_wrap_counter.
So I think the driver cannot assume this won't happen.

> >  }
[...]
> >  static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
> >  {
> > -	return 0;
> > +	struct vring_virtqueue *vq = to_vvq(_vq);
> > +
> > +	START_USE(vq);
> > +
> > +	/* We optimistically turn back on interrupts, then check if there was
> > +	 * more to do. */
> > +
> > +	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
> > +		virtio_wmb(vq->weak_barriers);
>
> Missing comments for the barrier.

Will add some comments.

> > +		vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
> > +		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> > +						vq->event_flags_shadow);
> > +	}
> > +
> > +	END_USE(vq);
> > +	return vq->last_used_idx;
> >  }
> >
> >  static bool virtqueue_poll_packed(struct virtqueue *_vq, unsigned last_used_idx)
> >  {
> > -	return false;
> > +	struct vring_virtqueue *vq = to_vvq(_vq);
> > +	u8 avail, used;
> > +	u16 flags;
> > +
> > +	virtio_mb(vq->weak_barriers);
> > +	flags = virtio16_to_cpu(vq->vq.vdev,
> > +			vq->vring_packed.desc[last_used_idx].flags);
> > +	avail = !!(flags & VRING_DESC_F_AVAIL(1));
> > +	used = !!(flags & VRING_DESC_F_USED(1));
> > +
> > +	return avail == used && used == vq->used_wrap_counter;
> >  }
> >
> >  static bool virtqueue_enable_cb_delayed_packed(struct virtqueue *_vq)
> >  {
> > -	return false;
> > +	struct vring_virtqueue *vq = to_vvq(_vq);
> > +
> > +	START_USE(vq);
> > +
> > +	/* We optimistically turn back on interrupts, then check if there was
> > +	 * more to do. */
> > +
> > +	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
> > +		virtio_wmb(vq->weak_barriers);
>
> Need comments for the barrier.

Will add some comments.

Best regards,
Tiwei Bie
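For reference, the integer-based free-list walk discussed above would look
roughly like the sketch below. This is illustrative only (not the posted
code); the extra `prev` variable is exactly the cost mentioned in the reply:

	/* Sketch: walk the descriptor state chain starting at 'id',
	 * unmapping each entry, while tracking 'prev' so the tail of
	 * the chain can be linked back onto the free list. */
	unsigned int i, prev = id, curr = id;

	for (i = 0; i < vq->desc_state_packed[id].num; i++) {
		vring_unmap_state_packed(vq, &vq->desc_state_packed[curr]);
		prev = curr;
		curr = vq->desc_state_packed[curr].next;
	}

	vq->vq.num_free += vq->desc_state_packed[id].num;
	vq->desc_state_packed[prev].next = vq->free_head;
	vq->free_head = id;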
Re: [PATCH net-next v12 1/5] net: Introduce generic failover module
On Fri, 25 May 2018 16:06:58 -0700
"Samudrala, Sridhar" wrote:

> On 5/25/2018 3:38 PM, Stephen Hemminger wrote:
> > On Thu, 24 May 2018 09:55:13 -0700
> > Sridhar Samudrala wrote:
> >
> >> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> >> index 03ed492c4e14..0f4ba52b641d 100644
> >> --- a/include/linux/netdevice.h
> >> +++ b/include/linux/netdevice.h
> >> @@ -1421,6 +1421,8 @@ struct net_device_ops {
> >>   *	entity (i.e. the master device for bridged veth)
> >>   * @IFF_MACSEC: device is a MACsec device
> >>   * @IFF_NO_RX_HANDLER: device doesn't support the rx_handler hook
> >> + * @IFF_FAILOVER: device is a failover master device
> >> + * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device
> >>   */
> >>  enum netdev_priv_flags {
> >>  	IFF_802_1Q_VLAN		= 1<<0,
> >> @@ -1450,6 +1452,8 @@ enum netdev_priv_flags {
> >>  	IFF_PHONY_HEADROOM	= 1<<24,
> >>  	IFF_MACSEC		= 1<<25,
> >>  	IFF_NO_RX_HANDLER	= 1<<26,
> >> +	IFF_FAILOVER		= 1<<27,
> >> +	IFF_FAILOVER_SLAVE	= 1<<28,
> >>  };
> >
> > Why is FAILOVER any different than other master/slave relationships?
> > I don't think you need to take up precious netdev flag bits for this.
>
> These are netdev priv flags.
> Jiri says that IFF_MASTER/IFF_SLAVE are bonding-specific flags and
> cannot be used with other failover mechanisms. Team also doesn't use
> these flags; it has its own priv_flags.

They are already used by bonding and team. I don't see why this can't
reuse them.
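For context, the series pairs these priv flags with small inline helpers in
netdevice.h; a minimal sketch of the intended usage (names follow the
pattern of the other netif_is_*() helpers, shown here as an illustration
rather than a quote from the patch):

	/* Sketch: testing the failover relationship via the new
	 * priv flags, without bonding's IFF_MASTER/IFF_SLAVE. */
	static inline bool netif_is_failover(const struct net_device *dev)
	{
		return dev->priv_flags & IFF_FAILOVER;
	}

	static inline bool netif_is_failover_slave(const struct net_device *dev)
	{
		return dev->priv_flags & IFF_FAILOVER_SLAVE;
	}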
[PATCH v5] virtio_blk: add DISCARD and WRITE ZEROES commands support
Existing virtio-blk protocol doesn't have DISCARD/WRITE ZEROES command
support, which impacts performance when using an SSD backend over file
systems. Commit 88c85538 "virtio-blk: add discard and write zeroes features
to specification" (see https://github.com/oasis-tcs/virtio-spec) extended
the existing virtio-blk protocol with extra DISCARD and WRITE ZEROES
commands.

A 16-byte descriptor is used to describe one segment of a DISCARD or WRITE
ZEROES command; each command may contain one or more descriptors. The
following data structure shows the definition of one descriptor:

struct virtio_blk_discard_write_zeroes {
	le64 sector;
	le32 num_sectors;
	le32 unmap;
};

Field 'sector' is the start sector for DISCARD and WRITE ZEROES, and field
'num_sectors' is the number of sectors for DISCARD and WRITE ZEROES. If
both DISCARD and WRITE ZEROES are supported, field 'unmap' may be used for
a WRITE ZEROES command with DISCARD enabled.

We also extended the virtio-blk configuration space so the backend device
can report its DISCARD and WRITE ZEROES configuration parameters:

struct virtio_blk_config {
	[...]
	le32 max_discard_sectors;
	le32 max_discard_seg;
	le32 discard_sector_alignment;
	le32 max_write_zeroes_sectors;
	le32 max_write_zeroes_seg;
	u8 write_zeroes_may_unmap;
}

New feature bit [VIRTIO_BLK_F_DISCARD (13)]: the device supports the
discard command; the maximum discard sectors size is in field
'max_discard_sectors' and the maximum discard segment number is in field
'max_discard_seg'.

New feature bit [VIRTIO_BLK_F_WRITE_ZEROES (14)]: the device supports the
write zeroes command; the maximum write zeroes sectors size is in field
'max_write_zeroes_sectors' and the maximum write zeroes segment number is
in field 'max_write_zeroes_seg'.

The configuration space fields 'max_discard_sectors' and
'discard_sector_alignment' are expressed in 512-byte units if the
VIRTIO_BLK_F_DISCARD feature bit is negotiated. The field
'max_write_zeroes_sectors' is expressed in 512-byte units if the
VIRTIO_BLK_F_WRITE_ZEROES feature bit is negotiated.

Signed-off-by: Changpeng Liu
---
CHANGELOG:
v5: use new block layer API: blk_queue_flag_set.
v4: several optimizations based on MST's comments, remove bit field usage
    for command descriptor.
v3: define the virtio-blk protocol to add discard and write zeroes
    support, first version implementation based on proposed specification.
v2: add write zeroes command support.
v1: initial proposal implementation for discard command.
---
 drivers/block/virtio_blk.c      | 92 +++--
 include/uapi/linux/virtio_blk.h | 43 +++
 2 files changed, 132 insertions(+), 3 deletions(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 4a07593c..fabab2a 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -172,10 +172,45 @@ static int virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr,
 	return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
 }
 
+static inline int virtblk_setup_discard_write_zeroes(struct request *req,
+						     bool unmap)
+{
+	unsigned short segments = blk_rq_nr_discard_segments(req);
+	unsigned short n = 0;
+	struct virtio_blk_discard_write_zeroes *range;
+	struct bio *bio;
+
+	range = kmalloc_array(segments, sizeof(*range), GFP_KERNEL);
+	if (!range)
+		return -ENOMEM;
+
+	__rq_for_each_bio(bio, req) {
+		u64 sector = bio->bi_iter.bi_sector;
+		u32 num_sectors = bio->bi_iter.bi_size >> 9;
+
+		range[n].unmap = cpu_to_le32(unmap);
+		range[n].num_sectors = cpu_to_le32(num_sectors);
+		range[n].sector = cpu_to_le64(sector);
+		n++;
+	}
+
+	req->special_vec.bv_page = virt_to_page(range);
+	req->special_vec.bv_offset = offset_in_page(range);
+	req->special_vec.bv_len = sizeof(*range) * segments;
+	req->rq_flags |= RQF_SPECIAL_PAYLOAD;
+
+	return 0;
+}
+
 static inline void virtblk_request_done(struct request *req)
 {
 	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
 
+	if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
+		kfree(page_address(req->special_vec.bv_page) +
+		      req->special_vec.bv_offset);
+	}
+
 	switch (req_op(req)) {
 	case REQ_OP_SCSI_IN:
 	case REQ_OP_SCSI_OUT:
@@ -225,6 +260,7 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
 	int qid = hctx->queue_num;
 	int err;
 	bool notify = false;
+	bool unmap = false;
 	u32 type;
 
 	BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems);
@@ -237,6 +273,13 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
 	case REQ_OP_FLUSH:
 		type = VIRTIO_BLK_T_FLUSH;
 		break;
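The probe-time hunk that wires these config fields into block layer limits
is elided above; a sketch of its likely shape, using the blk_queue_flag_set()
API mentioned in the v5 changelog (an assumption, not a copy of the posted
hunk):

	/* Sketch: map negotiated VIRTIO_BLK_F_DISCARD config fields onto
	 * request-queue limits; the config units are 512-byte sectors. */
	if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) {
		u32 max_discard_sectors, max_discard_seg, alignment;

		virtio_cread(vdev, struct virtio_blk_config,
			     max_discard_sectors, &max_discard_sectors);
		virtio_cread(vdev, struct virtio_blk_config,
			     max_discard_seg, &max_discard_seg);
		virtio_cread(vdev, struct virtio_blk_config,
			     discard_sector_alignment, &alignment);

		q->limits.discard_granularity = alignment << 9;
		blk_queue_max_discard_sectors(q, max_discard_sectors);
		blk_queue_max_discard_segments(q, max_discard_seg);
		blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
	}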
Re: [RFC v5 3/5] virtio_ring: add packed ring support
On 2018年05月22日 16:16, Tiwei Bie wrote: This commit introduces the support (without EVENT_IDX) for packed ring. Signed-off-by: Tiwei Bie --- drivers/virtio/virtio_ring.c | 474 ++- 1 file changed, 468 insertions(+), 6 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index f5ef5f42a7cf..eb9fd5207a68 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -62,6 +62,12 @@ struct vring_desc_state { }; struct vring_desc_state_packed { + void *data; /* Data for callback. */ + struct vring_packed_desc *indir_desc; /* Indirect descriptor, if any. */ + int num;/* Descriptor list length. */ + dma_addr_t addr;/* Buffer DMA addr. */ + u32 len;/* Buffer length. */ + u16 flags; /* Descriptor flags. */ int next; /* The next desc state. */ }; @@ -758,6 +764,72 @@ static inline unsigned vring_size_packed(unsigned int num, unsigned long align) & ~(align - 1)) + sizeof(struct vring_packed_desc_event) * 2; } +static void vring_unmap_state_packed(const struct vring_virtqueue *vq, +struct vring_desc_state_packed *state) +{ + u16 flags; + + if (!vring_use_dma_api(vq->vq.vdev)) + return; + + flags = state->flags; + + if (flags & VRING_DESC_F_INDIRECT) { + dma_unmap_single(vring_dma_dev(vq), +state->addr, state->len, +(flags & VRING_DESC_F_WRITE) ? +DMA_FROM_DEVICE : DMA_TO_DEVICE); + } else { + dma_unmap_page(vring_dma_dev(vq), + state->addr, state->len, + (flags & VRING_DESC_F_WRITE) ? + DMA_FROM_DEVICE : DMA_TO_DEVICE); + } +} + +static void vring_unmap_desc_packed(const struct vring_virtqueue *vq, + struct vring_packed_desc *desc) +{ + u16 flags; + + if (!vring_use_dma_api(vq->vq.vdev)) + return; + + flags = virtio16_to_cpu(vq->vq.vdev, desc->flags); + + if (flags & VRING_DESC_F_INDIRECT) { + dma_unmap_single(vring_dma_dev(vq), +virtio64_to_cpu(vq->vq.vdev, desc->addr), +virtio32_to_cpu(vq->vq.vdev, desc->len), +(flags & VRING_DESC_F_WRITE) ? +DMA_FROM_DEVICE : DMA_TO_DEVICE); + } else { + dma_unmap_page(vring_dma_dev(vq), + virtio64_to_cpu(vq->vq.vdev, desc->addr), + virtio32_to_cpu(vq->vq.vdev, desc->len), + (flags & VRING_DESC_F_WRITE) ? + DMA_FROM_DEVICE : DMA_TO_DEVICE); + } +} + +static struct vring_packed_desc *alloc_indirect_packed(struct virtqueue *_vq, + unsigned int total_sg, + gfp_t gfp) +{ + struct vring_packed_desc *desc; + + /* +* We require lowmem mappings for the descriptors because +* otherwise virt_to_phys will give us bogus addresses in the +* virtqueue. +*/ + gfp &= ~__GFP_HIGHMEM; + + desc = kmalloc(total_sg * sizeof(struct vring_packed_desc), gfp); + + return desc; +} + static inline int virtqueue_add_packed(struct virtqueue *_vq, struct scatterlist *sgs[], unsigned int total_sg, @@ -767,47 +839,437 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq, void *ctx, gfp_t gfp) { + struct vring_virtqueue *vq = to_vvq(_vq); + struct vring_packed_desc *desc; + struct scatterlist *sg; + unsigned int i, n, descs_used, uninitialized_var(prev), err_idx; + __virtio16 uninitialized_var(head_flags), flags; + u16 head, avail_wrap_counter, id, cur; + bool indirect; + + START_USE(vq); + + BUG_ON(data == NULL); + BUG_ON(ctx && vq->indirect); + + if (unlikely(vq->broken)) { + END_USE(vq); + return -EIO; + } + +#ifdef DEBUG + { + ktime_t now = ktime_get(); + + /* No kick or get, with .1 second between? Warn. */ + if (vq->last_add_time_valid) + WARN_ON(ktime_to_ms(ktime_sub(now, vq->last_add_time)) + > 100); + vq->last_add_time = now; + vq->last_add_time_valid = true; + } +#endif + + BUG_ON(total_sg
Re: [PATCH net-next v12 0/5] Enable virtio_net to act as a standby for a passthru device
From: Sridhar Samudrala
Date: Thu, 24 May 2018 09:55:12 -0700

> The main motivation for this patch is to enable cloud service providers
> to provide an accelerated datapath to virtio-net enabled VMs in a
> transparent manner with no/minimal guest userspace changes. This also
> enables hypervisor controlled live migration to be supported with VMs
> that have direct attached SR-IOV VF devices.
 ...

Series applied, thank you.
Re: [RFC v5 2/5] virtio_ring: support creating packed ring
On 2018年05月22日 16:16, Tiwei Bie wrote: This commit introduces the support for creating packed ring. All split ring specific functions are added _split suffix. Some necessary stubs for packed ring are also added. Signed-off-by: Tiwei Bie --- drivers/virtio/virtio_ring.c | 801 +++ include/linux/virtio_ring.h | 8 +- 2 files changed, 546 insertions(+), 263 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 71458f493cf8..f5ef5f42a7cf 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -61,11 +61,15 @@ struct vring_desc_state { struct vring_desc *indir_desc; /* Indirect descriptor, if any. */ }; +struct vring_desc_state_packed { + int next; /* The next desc state. */ +}; + struct vring_virtqueue { struct virtqueue vq; - /* Actual memory layout for this queue */ - struct vring vring; + /* Is this a packed ring? */ + bool packed; /* Can we use weak barriers? */ bool weak_barriers; @@ -87,11 +91,39 @@ struct vring_virtqueue { /* Last used index we've seen. */ u16 last_used_idx; - /* Last written value to avail->flags */ - u16 avail_flags_shadow; + union { + /* Available for split ring */ + struct { + /* Actual memory layout for this queue. */ + struct vring vring; - /* Last written value to avail->idx in guest byte order */ - u16 avail_idx_shadow; + /* Last written value to avail->flags */ + u16 avail_flags_shadow; + + /* Last written value to avail->idx in +* guest byte order. */ + u16 avail_idx_shadow; + }; + + /* Available for packed ring */ + struct { + /* Actual memory layout for this queue. */ + struct vring_packed vring_packed; + + /* Driver ring wrap counter. */ + u8 avail_wrap_counter; + + /* Device ring wrap counter. */ + u8 used_wrap_counter; How about just use boolean? + + /* Index of the next avail descriptor. */ + u16 next_avail_idx; + + /* Last written value to driver->flags in +* guest byte order. */ + u16 event_flags_shadow; + }; + }; /* How to notify other side. FIXME: commonalize hcalls! */ bool (*notify)(struct virtqueue *vq); @@ -111,11 +143,24 @@ struct vring_virtqueue { #endif /* Per-descriptor state. */ - struct vring_desc_state desc_state[]; + union { + struct vring_desc_state desc_state[1]; + struct vring_desc_state_packed desc_state_packed[1]; + }; }; #define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq) +static inline bool virtqueue_use_indirect(struct virtqueue *_vq, + unsigned int total_sg) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + /* If the host supports indirect descriptor tables, and we have multiple +* buffers, then go indirect. FIXME: tune this threshold */ + return (vq->indirect && total_sg > 1 && vq->vq.num_free); +} + /* * Modern virtio devices have feature bits to specify whether they need a * quirk and bypass the IOMMU. If not there, just use the DMA API. 
@@ -201,8 +246,17 @@ static dma_addr_t vring_map_single(const struct vring_virtqueue *vq, cpu_addr, size, direction); } -static void vring_unmap_one(const struct vring_virtqueue *vq, - struct vring_desc *desc) +static int vring_mapping_error(const struct vring_virtqueue *vq, + dma_addr_t addr) +{ + if (!vring_use_dma_api(vq->vq.vdev)) + return 0; + + return dma_mapping_error(vring_dma_dev(vq), addr); +} + +static void vring_unmap_one_split(const struct vring_virtqueue *vq, + struct vring_desc *desc) { u16 flags; @@ -226,17 +280,9 @@ static void vring_unmap_one(const struct vring_virtqueue *vq, } } -static int vring_mapping_error(const struct vring_virtqueue *vq, - dma_addr_t addr) -{ - if (!vring_use_dma_api(vq->vq.vdev)) - return 0; - - return dma_mapping_error(vring_dma_dev(vq), addr); -} It looks to me if you keep vring_mapping_error behind vring_unmap_one_split(), lots of changes were unncessary. - -static struct vring_desc *alloc_indirect(struct virtqueue *_vq, -unsigned int total_sg, gfp_t gfp) +static struct vring_desc
[RFC V5 PATCH 8/8] vhost: event suppression for packed ring
This patch introduces basic support for event suppression aka driver and device area. Signed-off-by: Jason Wang --- drivers/vhost/vhost.c| 191 --- drivers/vhost/vhost.h| 10 +- include/uapi/linux/virtio_ring.h | 19 3 files changed, 204 insertions(+), 16 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index a36e5ad2..112f680 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1112,10 +1112,15 @@ static int vq_access_ok_packed(struct vhost_virtqueue *vq, unsigned int num, struct vring_used __user *used) { struct vring_desc_packed *packed = (struct vring_desc_packed *)desc; + struct vring_packed_desc_event *driver_event = + (struct vring_packed_desc_event *)avail; + struct vring_packed_desc_event *device_event = + (struct vring_packed_desc_event *)used; - /* FIXME: check device area and driver area */ return access_ok(VERIFY_READ, packed, num * sizeof(*packed)) && - access_ok(VERIFY_WRITE, packed, num * sizeof(*packed)); + access_ok(VERIFY_WRITE, packed, num * sizeof(*packed)) && + access_ok(VERIFY_READ, driver_event, sizeof(*driver_event)) && + access_ok(VERIFY_WRITE, device_event, sizeof(*device_event)); } static int vq_access_ok_split(struct vhost_virtqueue *vq, unsigned int num, @@ -1190,14 +1195,27 @@ static bool iotlb_access_ok(struct vhost_virtqueue *vq, return true; } -int vq_iotlb_prefetch(struct vhost_virtqueue *vq) +int vq_iotlb_prefetch_packed(struct vhost_virtqueue *vq) +{ + int num = vq->num; + + return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc, + num * sizeof(*vq->desc), VHOST_ADDR_DESC) && + iotlb_access_ok(vq, VHOST_ACCESS_WO, (u64)(uintptr_t)vq->desc, + num * sizeof(*vq->desc), VHOST_ADDR_DESC) && + iotlb_access_ok(vq, VHOST_ACCESS_RO, + (u64)(uintptr_t)vq->driver_event, + sizeof(*vq->driver_event), VHOST_ADDR_AVAIL) && + iotlb_access_ok(vq, VHOST_ACCESS_WO, + (u64)(uintptr_t)vq->device_event, + sizeof(*vq->device_event), VHOST_ADDR_USED); +} + +int vq_iotlb_prefetch_split(struct vhost_virtqueue *vq) { size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0; unsigned int num = vq->num; - if (!vq->iotlb) - return 1; - return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc, num * sizeof(*vq->desc), VHOST_ADDR_DESC) && iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->avail, @@ -1209,6 +1227,17 @@ int vq_iotlb_prefetch(struct vhost_virtqueue *vq) num * sizeof(*vq->used->ring) + s, VHOST_ADDR_USED); } + +int vq_iotlb_prefetch(struct vhost_virtqueue *vq) +{ + if (!vq->iotlb) + return 1; + + if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED)) + return vq_iotlb_prefetch_packed(vq); + else + return vq_iotlb_prefetch_split(vq); +} EXPORT_SYMBOL_GPL(vq_iotlb_prefetch); /* Can we log writes? */ @@ -1730,6 +1759,50 @@ static int vhost_update_used_flags(struct vhost_virtqueue *vq) return 0; } +static int vhost_update_device_flags(struct vhost_virtqueue *vq, +__virtio16 device_flags) +{ + void __user *flags; + + if (vhost_put_user(vq, device_flags, >device_event->flags, + VHOST_ADDR_USED) < 0) + return -EFAULT; + if (unlikely(vq->log_used)) { + /* Make sure the flag is seen before log. */ + smp_wmb(); + /* Log used flag write. 
*/ + flags = >device_event->flags; + log_write(vq->log_base, vq->log_addr + + (flags - (void __user *)vq->device_event), + sizeof(vq->device_event->flags)); + if (vq->log_ctx) + eventfd_signal(vq->log_ctx, 1); + } + return 0; +} + +static int vhost_update_device_off_wrap(struct vhost_virtqueue *vq, + __virtio16 device_off_wrap) +{ + void __user *off_wrap; + + if (vhost_put_user(vq, device_off_wrap, >device_event->off_wrap, + VHOST_ADDR_USED) < 0) + return -EFAULT; + if (unlikely(vq->log_used)) { + /* Make sure the flag is seen before log. */ + smp_wmb(); + /* Log used flag write. */ + off_wrap = >device_event->off_wrap; + log_write(vq->log_base, vq->log_addr + + (off_wrap -
[RFC V5 PATCH 7/8] vhost: packed ring support
Signed-off-by: Jason Wang --- drivers/vhost/net.c | 13 +- drivers/vhost/vhost.c | 585 ++ drivers/vhost/vhost.h | 13 +- 3 files changed, 566 insertions(+), 45 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 30273ad..4991aa4 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -71,7 +71,8 @@ enum { VHOST_NET_FEATURES = VHOST_FEATURES | (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) | (1ULL << VIRTIO_NET_F_MRG_RXBUF) | -(1ULL << VIRTIO_F_IOMMU_PLATFORM) +(1ULL << VIRTIO_F_IOMMU_PLATFORM) | +(1ULL << VIRTIO_F_RING_PACKED) }; enum { @@ -576,7 +577,7 @@ static void handle_tx(struct vhost_net *net) nvq->upend_idx = ((unsigned)nvq->upend_idx - 1) % UIO_MAXIOV; } - vhost_discard_vq_desc(vq, 1); + vhost_discard_vq_desc(vq, , 1); vhost_net_enable_vq(net, vq); break; } @@ -714,9 +715,11 @@ static void handle_rx(struct vhost_net *net) mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF); while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk))) { + struct vhost_used_elem *used = vq->heads + nheads; + sock_len += sock_hlen; vhost_len = sock_len + vhost_hlen; - err = vhost_get_bufs(vq, vq->heads + nheads, vhost_len, + err = vhost_get_bufs(vq, used, vhost_len, , vq_log, , likely(mergeable) ? UIO_MAXIOV : 1, ); @@ -762,7 +765,7 @@ static void handle_rx(struct vhost_net *net) if (unlikely(err != sock_len)) { pr_debug("Discarded rx packet: " " len %d, expected %zd\n", err, sock_len); - vhost_discard_vq_desc(vq, headcount); + vhost_discard_vq_desc(vq, used, 1); continue; } /* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */ @@ -786,7 +789,7 @@ static void handle_rx(struct vhost_net *net) copy_to_iter(_buffers, sizeof num_buffers, ) != sizeof num_buffers) { vq_err(vq, "Failed num_buffers write"); - vhost_discard_vq_desc(vq, headcount); + vhost_discard_vq_desc(vq, used, 1); goto out; } nheads += headcount; diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 4031a8f..a36e5ad2 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -323,6 +323,9 @@ static void vhost_vq_reset(struct vhost_dev *dev, vhost_reset_is_le(vq); vhost_disable_cross_endian(vq); vq->busyloop_timeout = 0; + vq->used_wrap_counter = true; + vq->last_avail_wrap_counter = true; + vq->avail_wrap_counter = true; vq->umem = NULL; vq->iotlb = NULL; __vhost_vq_meta_reset(vq); @@ -1103,11 +1106,22 @@ static int vhost_iotlb_miss(struct vhost_virtqueue *vq, u64 iova, int access) return 0; } -static bool vq_access_ok(struct vhost_virtqueue *vq, unsigned int num, -struct vring_desc __user *desc, -struct vring_avail __user *avail, -struct vring_used __user *used) +static int vq_access_ok_packed(struct vhost_virtqueue *vq, unsigned int num, + struct vring_desc __user *desc, + struct vring_avail __user *avail, + struct vring_used __user *used) +{ + struct vring_desc_packed *packed = (struct vring_desc_packed *)desc; + + /* FIXME: check device area and driver area */ + return access_ok(VERIFY_READ, packed, num * sizeof(*packed)) && + access_ok(VERIFY_WRITE, packed, num * sizeof(*packed)); +} +static int vq_access_ok_split(struct vhost_virtqueue *vq, unsigned int num, + struct vring_desc __user *desc, + struct vring_avail __user *avail, + struct vring_used __user *used) { size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 
2 : 0; @@ -1118,6 +1132,17 @@ static bool vq_access_ok(struct vhost_virtqueue *vq, unsigned int num, sizeof *used + num * sizeof *used->ring + s); } +static int vq_access_ok(struct vhost_virtqueue *vq, unsigned int num, + struct vring_desc __user *desc, + struct vring_avail __user *avail, + struct
[RFC V5 PATCH 6/8] virtio: introduce packed ring defines
Signed-off-by: Jason Wang
---
 include/uapi/linux/virtio_config.h |  9 +
 include/uapi/linux/virtio_ring.h   | 13 +
 2 files changed, 22 insertions(+)

diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h
index 308e209..5903d51 100644
--- a/include/uapi/linux/virtio_config.h
+++ b/include/uapi/linux/virtio_config.h
@@ -71,4 +71,13 @@
  * this is for compatibility with legacy systems. */
 #define VIRTIO_F_IOMMU_PLATFORM	33
+
+#define VIRTIO_F_RING_PACKED	34
+
+/*
+ * This feature indicates that all buffers are used by the device in
+ * the same order in which they have been made available.
+ */
+#define VIRTIO_F_IN_ORDER	35
+
 #endif /* _UAPI_LINUX_VIRTIO_CONFIG_H */
diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h
index 6d5d5fa..e297580 100644
--- a/include/uapi/linux/virtio_ring.h
+++ b/include/uapi/linux/virtio_ring.h
@@ -43,6 +43,8 @@
 #define VRING_DESC_F_WRITE	2
 /* This means the buffer contains a list of buffer descriptors. */
 #define VRING_DESC_F_INDIRECT	4
+#define VRING_DESC_F_AVAIL	7
+#define VRING_DESC_F_USED	15
 
 /* The Host uses this in used->flags to advise the Guest: don't kick me when
  * you add a buffer.  It's unreliable, so it's simply an optimization.  Guest
@@ -62,6 +64,17 @@
  * at the end of the used ring. Guest should ignore the used->flags field. */
 #define VIRTIO_RING_F_EVENT_IDX	29
 
+struct vring_desc_packed {
+	/* Buffer Address. */
+	__virtio64 addr;
+	/* Buffer Length. */
+	__virtio32 len;
+	/* Buffer ID. */
+	__virtio16 id;
+	/* The flags depending on descriptor type. */
+	__virtio16 flags;
+};
+
 /* Virtio ring descriptors: 16 bytes. These can chain together via "next". */
 struct vring_desc {
 	/* Address (guest-physical). */
-- 
2.7.4
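Note that VRING_DESC_F_AVAIL and VRING_DESC_F_USED above are bit positions,
not masks. A device-side availability test built on them would look roughly
like this sketch (the wrap-counter bookkeeping is assumed to live in the
caller):

	/* Sketch: a packed descriptor is available when its AVAIL bit
	 * differs from its USED bit and AVAIL matches the driver's
	 * current wrap counter. */
	static bool desc_is_avail(struct virtio_device *vdev,
				  const struct vring_desc_packed *desc,
				  bool wrap_counter)
	{
		u16 flags = virtio16_to_cpu(vdev, desc->flags);
		bool avail = !!(flags & (1 << VRING_DESC_F_AVAIL));
		bool used = !!(flags & (1 << VRING_DESC_F_USED));

		return avail != used && avail == wrap_counter;
	}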
[RFC V5 PATCH 5/8] vhost: vhost_put_user() can accept metadata type
In the past we assumed that the used ring update was the only user of
vhost_put_user(). This may not be the case for the incoming packed ring,
which may update the descriptor ring when marking buffers used. So
introduce a new type parameter.

Signed-off-by: Jason Wang
---
 drivers/vhost/vhost.c | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index e0fcfec..4031a8f 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -811,7 +811,7 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
 	return __vhost_get_user_slow(vq, addr, size, type);
 }
 
-#define vhost_put_user(vq, x, ptr)		\
+#define vhost_put_user(vq, x, ptr, type)		\
 ({ \
 	int ret = -EFAULT; \
 	if (!vq->iotlb) { \
@@ -819,7 +819,7 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
 	} else { \
 		__typeof__(ptr) to = \
 			(__typeof__(ptr)) __vhost_get_user(vq, ptr, \
-					  sizeof(*ptr), VHOST_ADDR_USED); \
+					  sizeof(*ptr), type); \
 		if (to != NULL) \
 			ret = __put_user(x, to); \
 		else \
@@ -1683,7 +1683,7 @@ static int vhost_update_used_flags(struct vhost_virtqueue *vq)
 {
 	void __user *used;
 	if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
-			   &vq->used->flags) < 0)
+			   &vq->used->flags, VHOST_ADDR_USED) < 0)
 		return -EFAULT;
 	if (unlikely(vq->log_used)) {
 		/* Make sure the flag is seen before log. */
@@ -1702,7 +1702,7 @@ static int vhost_update_used_flags(struct vhost_virtqueue *vq)
 static int vhost_update_avail_event(struct vhost_virtqueue *vq, u16 avail_event)
 {
 	if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
-			   vhost_avail_event(vq)))
+			   vhost_avail_event(vq), VHOST_ADDR_USED))
 		return -EFAULT;
 	if (unlikely(vq->log_used)) {
 		void __user *used;
@@ -2185,12 +2185,12 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq,
 	used = vq->used->ring + start;
 	for (i = 0; i < count; i++) {
 		if (unlikely(vhost_put_user(vq, heads[i].elem.id,
-					    &used[i].id))) {
+					    &used[i].id, VHOST_ADDR_USED))) {
 			vq_err(vq, "Failed to write used id");
 			return -EFAULT;
 		}
 		if (unlikely(vhost_put_user(vq, heads[i].elem.len,
-					    &used[i].len))) {
+					    &used[i].len, VHOST_ADDR_USED))) {
 			vq_err(vq, "Failed to write used len");
 			return -EFAULT;
 		}
@@ -2236,7 +2236,7 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, struct vhost_used_elem *heads,
 	/* Make sure buffer is written before we update index. */
 	smp_wmb();
 	if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
-			   &vq->used->idx)) {
+			   &vq->used->idx, VHOST_ADDR_USED)) {
 		vq_err(vq, "Failed to increment used idx");
 		return -EFAULT;
 	}
-- 
2.7.4
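The motivation becomes visible in the packed-ring patches later in this
series: used entries are written back into the descriptor area, so the
IOTLB access has to be classified differently. A hypothetical packed call
site (not part of this patch) would look roughly like:

	/* Sketch: a packed ring updates descriptor flags when marking a
	 * buffer used, so the lookup must use VHOST_ADDR_DESC rather
	 * than VHOST_ADDR_USED; 'desc' and 'idx' are illustrative. */
	if (vhost_put_user(vq, cpu_to_vhost16(vq, flags),
			   &desc[idx].flags, VHOST_ADDR_DESC) < 0)
		return -EFAULT;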
[RFC V5 PATCH 4/8] vhost_net: do not explicitly manipulate vhost_used_elem
Two helpers for setting/getting the used len are introduced to avoid
explicitly manipulating vhost_used_elem in the zerocopy code. This will be
used to hide used_elem internals and simplify the packed ring
implementation.

Signed-off-by: Jason Wang
---
 drivers/vhost/net.c   | 11 +--
 drivers/vhost/vhost.c | 12 ++--
 drivers/vhost/vhost.h |  5 +
 3 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 3826f1f..30273ad 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -341,9 +341,10 @@ static void vhost_zerocopy_signal_used(struct vhost_net *net,
 	int j = 0;
 
 	for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
-		if (vq->heads[i].elem.len == VHOST_DMA_FAILED_LEN)
+		if (vhost_get_used_len(vq, &vq->heads[i]) ==
+		    VHOST_DMA_FAILED_LEN)
 			vhost_net_tx_err(net);
-		if (VHOST_DMA_IS_DONE(vq->heads[i].elem.len)) {
+		if (VHOST_DMA_IS_DONE(vhost_get_used_len(vq, &vq->heads[i]))) {
 			vq->heads[i].elem.len = VHOST_DMA_CLEAR_LEN;
 			++j;
 		} else
@@ -542,10 +543,8 @@ static void handle_tx(struct vhost_net *net)
 			struct ubuf_info *ubuf;
 			ubuf = nvq->ubuf_info + nvq->upend_idx;
 
-			vq->heads[nvq->upend_idx].elem.id =
-				cpu_to_vhost32(vq, used.elem.id);
-			vq->heads[nvq->upend_idx].elem.len =
-				VHOST_DMA_IN_PROGRESS;
+			vhost_set_used_len(vq, &used, VHOST_DMA_IN_PROGRESS);
+			vq->heads[nvq->upend_idx] = used;
 			ubuf->callback = vhost_zerocopy_callback;
 			ubuf->ctx = nvq->ubufs;
 			ubuf->desc = nvq->upend_idx;
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 2b2a776..e0fcfec 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2067,11 +2067,19 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 }
 EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
 
-static void vhost_set_used_len(struct vhost_virtqueue *vq,
-			       struct vhost_used_elem *used, int len)
+void vhost_set_used_len(struct vhost_virtqueue *vq,
+			struct vhost_used_elem *used, int len)
 {
 	used->elem.len = cpu_to_vhost32(vq, len);
 }
+EXPORT_SYMBOL_GPL(vhost_set_used_len);
+
+int vhost_get_used_len(struct vhost_virtqueue *vq,
+		       struct vhost_used_elem *used)
+{
+	return vhost32_to_cpu(vq, used->elem.len);
+}
+EXPORT_SYMBOL_GPL(vhost_get_used_len);
 
 /* This is a multi-buffer version of vhost_get_desc, that works if
  * vq has read descriptors only.
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 8dea44b..604821b 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -198,6 +198,11 @@ int vhost_get_bufs(struct vhost_virtqueue *vq,
 		   unsigned *log_num,
 		   unsigned int quota,
 		   s16 *count);
+void vhost_set_used_len(struct vhost_virtqueue *vq,
+			struct vhost_used_elem *used,
+			int len);
+int vhost_get_used_len(struct vhost_virtqueue *vq,
+		       struct vhost_used_elem *used);
 void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
 
 int vhost_vq_init_access(struct vhost_virtqueue *);
-- 
2.7.4
[RFC V5 PATCH 3/8] vhost: do not use vring_used_elem
Instead of depending on the exported vring_used_elem, this patch switches to use a new internal structure vhost_used_elem which embed vring_used_elem in itself. This could be used to let vhost to record extra metadata for the incoming packed ring layout. Signed-off-by: Jason Wang --- drivers/vhost/net.c | 19 +++--- drivers/vhost/scsi.c | 10 drivers/vhost/vhost.c | 68 --- drivers/vhost/vhost.h | 18 -- drivers/vhost/vsock.c | 6 ++--- 5 files changed, 45 insertions(+), 76 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 826489c..3826f1f 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -341,10 +341,10 @@ static void vhost_zerocopy_signal_used(struct vhost_net *net, int j = 0; for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) { - if (vq->heads[i].len == VHOST_DMA_FAILED_LEN) + if (vq->heads[i].elem.len == VHOST_DMA_FAILED_LEN) vhost_net_tx_err(net); - if (VHOST_DMA_IS_DONE(vq->heads[i].len)) { - vq->heads[i].len = VHOST_DMA_CLEAR_LEN; + if (VHOST_DMA_IS_DONE(vq->heads[i].elem.len)) { + vq->heads[i].elem.len = VHOST_DMA_CLEAR_LEN; ++j; } else break; @@ -367,7 +367,7 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success) rcu_read_lock_bh(); /* set len to mark this desc buffers done DMA */ - vq->heads[ubuf->desc].len = success ? + vq->heads[ubuf->desc].elem.len = success ? VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN; cnt = vhost_net_ubuf_put(ubufs); @@ -426,7 +426,7 @@ static int vhost_net_enable_vq(struct vhost_net *n, static int vhost_net_tx_get_vq_desc(struct vhost_net *net, struct vhost_virtqueue *vq, - struct vring_used_elem *used_elem, + struct vhost_used_elem *used_elem, struct iovec iov[], unsigned int iov_size, unsigned int *out_num, unsigned int *in_num) { @@ -477,7 +477,7 @@ static void handle_tx(struct vhost_net *net) size_t hdr_size; struct socket *sock; struct vhost_net_ubuf_ref *uninitialized_var(ubufs); - struct vring_used_elem used; + struct vhost_used_elem used; bool zcopy, zcopy_used; int sent_pkts = 0; @@ -542,9 +542,10 @@ static void handle_tx(struct vhost_net *net) struct ubuf_info *ubuf; ubuf = nvq->ubuf_info + nvq->upend_idx; - vq->heads[nvq->upend_idx].id = - cpu_to_vhost32(vq, used.id); - vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS; + vq->heads[nvq->upend_idx].elem.id = + cpu_to_vhost32(vq, used.elem.id); + vq->heads[nvq->upend_idx].elem.len = + VHOST_DMA_IN_PROGRESS; ubuf->callback = vhost_zerocopy_callback; ubuf->ctx = nvq->ubufs; ubuf->desc = nvq->upend_idx; diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index 654c71f..ac11412 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -67,7 +67,7 @@ struct vhost_scsi_inflight { struct vhost_scsi_cmd { /* Descriptor from vhost_get_vq_desc() for virt_queue segment */ - struct vring_used_elem tvc_vq_used; + struct vhost_used_elem tvc_vq_used; /* virtio-scsi initiator task attribute */ int tvc_task_attr; /* virtio-scsi response incoming iovecs */ @@ -441,7 +441,7 @@ vhost_scsi_do_evt_work(struct vhost_scsi *vs, struct vhost_scsi_evt *evt) struct vhost_virtqueue *vq = >vqs[VHOST_SCSI_VQ_EVT].vq; struct virtio_scsi_event *event = >event; struct virtio_scsi_event __user *eventp; - struct vring_used_elem used; + struct vhost_used_elem used; unsigned out, in; int ret; @@ -785,7 +785,7 @@ static void vhost_scsi_submission_work(struct work_struct *work) static void vhost_scsi_send_bad_target(struct vhost_scsi *vs, struct vhost_virtqueue *vq, - struct vring_used_elem *used, unsigned out) + struct vhost_used_elem *used, unsigned 
out) { struct virtio_scsi_cmd_resp __user *resp; struct virtio_scsi_cmd_resp rsp; @@ -808,7 +808,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) struct virtio_scsi_cmd_req v_req; struct virtio_scsi_cmd_req_pi v_req_pi; struct vhost_scsi_cmd *cmd; - struct vring_used_elem used; + struct
[RFC V5 PATCH 2/8] vhost: hide used ring layout from device
We used to return descriptor head by vhost_get_vq_desc() to device and pass it back to vhost_add_used() and its friends. This exposes the internal used ring layout to device which makes it hard to be extended for e.g packed ring layout. So this patch tries to hide the used ring layout by - letting vhost_get_vq_desc() return pointer to struct vring_used_elem - accepting pointer to struct vring_used_elem in vhost_add_used() and vhost_add_used_and_signal() This could help to hide used ring layout and make it easier to implement packed ring on top. Signed-off-by: Jason Wang --- drivers/vhost/net.c | 46 +- drivers/vhost/scsi.c | 62 +++ drivers/vhost/vhost.c | 52 +- drivers/vhost/vhost.h | 9 +--- drivers/vhost/vsock.c | 42 +- 5 files changed, 112 insertions(+), 99 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 762aa81..826489c 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -426,22 +426,24 @@ static int vhost_net_enable_vq(struct vhost_net *n, static int vhost_net_tx_get_vq_desc(struct vhost_net *net, struct vhost_virtqueue *vq, + struct vring_used_elem *used_elem, struct iovec iov[], unsigned int iov_size, unsigned int *out_num, unsigned int *in_num) { unsigned long uninitialized_var(endtime); - int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov), + int r = vhost_get_vq_desc(vq, used_elem, vq->iov, ARRAY_SIZE(vq->iov), out_num, in_num, NULL, NULL); - if (r == vq->num && vq->busyloop_timeout) { + if (r == -ENOSPC && vq->busyloop_timeout) { preempt_disable(); endtime = busy_clock() + vq->busyloop_timeout; while (vhost_can_busy_poll(vq->dev, endtime) && vhost_vq_avail_empty(vq->dev, vq)) cpu_relax(); preempt_enable(); - r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov), - out_num, in_num, NULL, NULL); + r = vhost_get_vq_desc(vq, used_elem, vq->iov, + ARRAY_SIZE(vq->iov), out_num, in_num, + NULL, NULL); } return r; @@ -463,7 +465,6 @@ static void handle_tx(struct vhost_net *net) struct vhost_net_virtqueue *nvq = >vqs[VHOST_NET_VQ_TX]; struct vhost_virtqueue *vq = >vq; unsigned out, in; - int head; struct msghdr msg = { .msg_name = NULL, .msg_namelen = 0, @@ -476,6 +477,7 @@ static void handle_tx(struct vhost_net *net) size_t hdr_size; struct socket *sock; struct vhost_net_ubuf_ref *uninitialized_var(ubufs); + struct vring_used_elem used; bool zcopy, zcopy_used; int sent_pkts = 0; @@ -499,20 +501,20 @@ static void handle_tx(struct vhost_net *net) vhost_zerocopy_signal_used(net, vq); - head = vhost_net_tx_get_vq_desc(net, vq, vq->iov, - ARRAY_SIZE(vq->iov), - , ); - /* On error, stop handling until the next kick. */ - if (unlikely(head < 0)) - break; + err = vhost_net_tx_get_vq_desc(net, vq, , vq->iov, + ARRAY_SIZE(vq->iov), + , ); /* Nothing new? Wait for eventfd to tell us they refilled. */ - if (head == vq->num) { + if (err == -ENOSPC) { if (unlikely(vhost_enable_notify(>dev, vq))) { vhost_disable_notify(>dev, vq); continue; } break; } + /* On error, stop handling until the next kick. */ + if (unlikely(err < 0)) + break; if (in) { vq_err(vq, "Unexpected descriptor format for TX: " "out %d, int %d\n", out, in); @@ -540,7 +542,8 @@ static void handle_tx(struct vhost_net *net) struct ubuf_info *ubuf; ubuf = nvq->ubuf_info + nvq->upend_idx; - vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head); + vq->heads[nvq->upend_idx].id = + cpu_to_vhost32(vq, used.id); vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS; ubuf->callback =
[RFC V5 PATCH 1/8] vhost: move get_rx_bufs to vhost.c
Move get_rx_bufs() to vhost.c and rename it to vhost_get_bufs(). This helps to hide vring internal layout from specific device implementation. Packed ring implementation will benefit from this. Signed-off-by: Jason Wang --- drivers/vhost/net.c | 83 ++- drivers/vhost/vhost.c | 78 +++ drivers/vhost/vhost.h | 7 + 3 files changed, 88 insertions(+), 80 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 986058a..762aa81 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -664,83 +664,6 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk) return len; } -/* This is a multi-buffer version of vhost_get_desc, that works if - * vq has read descriptors only. - * @vq - the relevant virtqueue - * @datalen- data length we'll be reading - * @iovcount - returned count of io vectors we fill - * @log- vhost log - * @log_num- log offset - * @quota - headcount quota, 1 for big buffer - * returns number of buffer heads allocated, negative on error - */ -static int get_rx_bufs(struct vhost_virtqueue *vq, - struct vring_used_elem *heads, - int datalen, - unsigned *iovcount, - struct vhost_log *log, - unsigned *log_num, - unsigned int quota) -{ - unsigned int out, in; - int seg = 0; - int headcount = 0; - unsigned d; - int r, nlogs = 0; - /* len is always initialized before use since we are always called with -* datalen > 0. -*/ - u32 uninitialized_var(len); - - while (datalen > 0 && headcount < quota) { - if (unlikely(seg >= UIO_MAXIOV)) { - r = -ENOBUFS; - goto err; - } - r = vhost_get_vq_desc(vq, vq->iov + seg, - ARRAY_SIZE(vq->iov) - seg, , - , log, log_num); - if (unlikely(r < 0)) - goto err; - - d = r; - if (d == vq->num) { - r = 0; - goto err; - } - if (unlikely(out || in <= 0)) { - vq_err(vq, "unexpected descriptor format for RX: " - "out %d, in %d\n", out, in); - r = -EINVAL; - goto err; - } - if (unlikely(log)) { - nlogs += *log_num; - log += *log_num; - } - heads[headcount].id = cpu_to_vhost32(vq, d); - len = iov_length(vq->iov + seg, in); - heads[headcount].len = cpu_to_vhost32(vq, len); - datalen -= len; - ++headcount; - seg += in; - } - heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen); - *iovcount = seg; - if (unlikely(log)) - *log_num = nlogs; - - /* Detect overrun */ - if (unlikely(datalen > 0)) { - r = UIO_MAXIOV + 1; - goto err; - } - return headcount; -err: - vhost_discard_vq_desc(vq, headcount); - return r; -} - /* Expects to be always run from workqueue - which acts as * read-size critical section for our kind of RCU. */ static void handle_rx(struct vhost_net *net) @@ -790,9 +713,9 @@ static void handle_rx(struct vhost_net *net) while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk))) { sock_len += sock_hlen; vhost_len = sock_len + vhost_hlen; - headcount = get_rx_bufs(vq, vq->heads + nheads, vhost_len, - , vq_log, , - likely(mergeable) ? UIO_MAXIOV : 1); + headcount = vhost_get_bufs(vq, vq->heads + nheads, vhost_len, + , vq_log, , + likely(mergeable) ? UIO_MAXIOV : 1); /* On error, stop handling until the next kick. */ if (unlikely(headcount < 0)) goto out; diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index f0be5f3..096a688 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -2100,6 +2100,84 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, } EXPORT_SYMBOL_GPL(vhost_get_vq_desc); +/* This is a multi-buffer version of vhost_get_desc, that works if + * vq has read descriptors only. 
+ * @vq - the relevant virtqueue + * @datalen- data length we'll be reading + * @iovcount - returned count of io vectors we fill + * @log- vhost log + * @log_num- log offset + * @quota - headcount
[RFC V5 PATCH 0/8] Packed ring layout for vhost
Hi all:

This RFC implements the packed ring layout. The code was tested with
Tiwei's RFC v5 at https://lkml.org/lkml/2018/5/22/138. Some fixups and
tweaks were needed on top of Tiwei's code to make it run with event index.

Pktgen reports about 20% improvement on TX PPS when doing pktgen from guest
to host. No obvious improvement on RX PPS. We can do lots of optimizations
on top, but for simplicity and correctness first, this version does not do
much.

This version was tested with:

- Zerocopy (Out of Order) support
- vIOMMU support
- mergeable buffer on/off
- busy polling on/off

Notes for testers:

- Starting from this version, vhost needs qemu co-operation to work
  correctly. Or you can comment out the packed-specific code for
  GET/SET_VRING_BASE.

Changes from V4:
- fix signalled_used index recording
- track avail index correctly
- various minor fixes

Changes from V3:
- Fix math on event idx checking
- Sync last avail wrap counter through GET/SET_VRING_BASE
- remove desc_event prefix in the driver/device structure

Changes from V2:
- do not use & in checking desc_event_flags
- off should be most significant bit
- remove the workaround of mergeable buffer for dpdk prototype
- id should be in the last descriptor in the chain
- keep _F_WRITE for write descriptor when adding used
- device flags updating should use ADDR_USED type
- return error on unexpected unavail descriptor in a chain
- return false in vhost_vq_avail_empty if descriptor is available
- track last seen avail_wrap_counter
- correctly examine available descriptor in get_indirect_packed()
- vhost_idx_diff should return u16 instead of bool

Changes from V1:
- Refactor vhost used elem code to avoid open coding on used elem
- Event suppression support (compile test only).
- Indirect descriptor support (compile test only).
- Zerocopy support.
- vIOMMU support.
- SCSI/VSOCK support (compile test only).
- Fix several bugs

Jason Wang (8):
  vhost: move get_rx_bufs to vhost.c
  vhost: hide used ring layout from device
  vhost: do not use vring_used_elem
  vhost_net: do not explicitly manipulate vhost_used_elem
  vhost: vhost_put_user() can accept metadata type
  virtio: introduce packed ring defines
  vhost: packed ring support
  vhost: event suppression for packed ring

 drivers/vhost/net.c                | 144 ++++---
 drivers/vhost/scsi.c               |  62 +--
 drivers/vhost/vhost.c              | 926 +++++++++++++++++++++++++++++-----
 drivers/vhost/vhost.h              |  52 ++-
 drivers/vhost/vsock.c              |  42 +-
 include/uapi/linux/virtio_config.h |   9 +
 include/uapi/linux/virtio_ring.h   |  32 ++
 7 files changed, 1000 insertions(+), 267 deletions(-)

-- 
2.7.4
[PATCH v4] virtio_blk: add DISCARD and WRITE ZEROES commands support
Existing virtio-blk protocol doesn't have DISCARD/WRITE ZEROES command
support, which impacts performance when using an SSD backend over file
systems. Commit 88c85538 "virtio-blk: add discard and write zeroes features
to specification" (see https://github.com/oasis-tcs/virtio-spec) extended
the existing virtio-blk protocol with extra DISCARD and WRITE ZEROES
commands.

A 16-byte descriptor is used to describe one segment of a DISCARD or WRITE
ZEROES command; each command may contain one or more descriptors. The
following data structure shows the definition of one descriptor:

struct virtio_blk_discard_write_zeroes {
	le64 sector;
	le32 num_sectors;
	le32 unmap;
};

Field 'sector' is the start sector for DISCARD and WRITE ZEROES, and field
'num_sectors' is the number of sectors for DISCARD and WRITE ZEROES. If
both DISCARD and WRITE ZEROES are supported, field 'unmap' may be used for
a WRITE ZEROES command with DISCARD enabled.

We also extended the virtio-blk configuration space so the backend device
can report its DISCARD and WRITE ZEROES configuration parameters:

struct virtio_blk_config {
	[...]
	le32 max_discard_sectors;
	le32 max_discard_seg;
	le32 discard_sector_alignment;
	le32 max_write_zeroes_sectors;
	le32 max_write_zeroes_seg;
	u8 write_zeroes_may_unmap;
}

New feature bit [VIRTIO_BLK_F_DISCARD (13)]: the device supports the
discard command; the maximum discard sectors size is in field
'max_discard_sectors' and the maximum discard segment number is in field
'max_discard_seg'.

New feature bit [VIRTIO_BLK_F_WRITE_ZEROES (14)]: the device supports the
write zeroes command; the maximum write zeroes sectors size is in field
'max_write_zeroes_sectors' and the maximum write zeroes segment number is
in field 'max_write_zeroes_seg'.

The configuration space fields 'max_discard_sectors' and
'discard_sector_alignment' are expressed in 512-byte units if the
VIRTIO_BLK_F_DISCARD feature bit is negotiated. The field
'max_write_zeroes_sectors' is expressed in 512-byte units if the
VIRTIO_BLK_F_WRITE_ZEROES feature bit is negotiated.

Signed-off-by: Changpeng Liu
---
CHANGELOG:
v4: several optimizations based on MST's comments, remove bit field usage
    for command descriptor.
v3: define the virtio-blk protocol to add discard and write zeroes
    support, first version implementation based on proposed specification.
v2: add write zeroes command support.
v1: initial proposal implementation for discard command.
---
 drivers/block/virtio_blk.c      | 92 +++--
 include/uapi/linux/virtio_blk.h | 43 +++
 2 files changed, 132 insertions(+), 3 deletions(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 4a07593c..a250fcc 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -172,10 +172,45 @@ static int virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr,
 	return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
 }
 
+static inline int virtblk_setup_discard_write_zeroes(struct request *req,
+						     bool unmap)
+{
+	unsigned short segments = blk_rq_nr_discard_segments(req);
+	unsigned short n = 0;
+	struct virtio_blk_discard_write_zeroes *range;
+	struct bio *bio;
+
+	range = kmalloc_array(segments, sizeof(*range), GFP_KERNEL);
+	if (!range)
+		return -ENOMEM;
+
+	__rq_for_each_bio(bio, req) {
+		u64 sector = bio->bi_iter.bi_sector;
+		u32 num_sectors = bio->bi_iter.bi_size >> 9;
+
+		range[n].unmap = cpu_to_le32(unmap);
+		range[n].num_sectors = cpu_to_le32(num_sectors);
+		range[n].sector = cpu_to_le64(sector);
+		n++;
+	}
+
+	req->special_vec.bv_page = virt_to_page(range);
+	req->special_vec.bv_offset = offset_in_page(range);
+	req->special_vec.bv_len = sizeof(*range) * segments;
+	req->rq_flags |= RQF_SPECIAL_PAYLOAD;
+
+	return 0;
+}
+
 static inline void virtblk_request_done(struct request *req)
 {
 	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
 
+	if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
+		kfree(page_address(req->special_vec.bv_page) +
+		      req->special_vec.bv_offset);
+	}
+
 	switch (req_op(req)) {
 	case REQ_OP_SCSI_IN:
 	case REQ_OP_SCSI_OUT:
@@ -225,6 +260,7 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
 	int qid = hctx->queue_num;
 	int err;
 	bool notify = false;
+	bool unmap = false;
 	u32 type;
 
 	BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems);
@@ -237,6 +273,13 @@ static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
 	case REQ_OP_FLUSH:
 		type = VIRTIO_BLK_T_FLUSH;
 		break;
+	case REQ_OP_DISCARD:
+
Re: [RFC V2] virtio: Add platform specific DMA API translation for virtio devices
On Tue, 2018-05-29 at 09:48 +1000, Benjamin Herrenschmidt wrote:
> > Well it's not supposed to be much slower for the static case.
> >
> > vhost has a cache so should be fine.
> >
> > A while ago Paolo implemented a translation cache which should be
> > perfect for this case - most of the code got merged but
> > never enabled because of stability issues.
> >
> > If all else fails, we could teach QEMU to handle the no-iommu case
> > as if VIRTIO_F_IOMMU_PLATFORM was off.
>
> Any serious reason not to just take that 2-line patch allowing our
> arch code to force virtio to use the DMA API?
>
> It's not particularly invasive and solves our problem rather nicely
> without adding overhead or additional knowledge to qemu/libvirt/management
> tools etc. that they don't need.
>
> The guest knows it's going secure, so the guest arch code can do the
> right thing rather trivially.
>
> Long term, we should probably make virtio always use the DMA API anyway,
> and interpose "1:1" dma_ops for the traditional virtio case; that would
> reduce code clutter significantly. In that case, it would become just a
> matter of having a platform hook to override the dma_ops used.

To elaborate a bit:

What we are trying to solve here is entirely a guest problem; I don't think involving qemu in the solution is the right thing to do.

The guest can only allow external parties (qemu, potentially PCI devices, etc.) access to some restricted portions of memory (insecure memory). Thus the guest needs to do some bounce buffering/swiotlb type tricks.

This is completely orthogonal to whether there is an actual iommu between the guest and the device (or emulated device/virtio). This is why I think the solution should reside in the guest kernel, through proper manipulation (by the arch code) of the dma ops.

I don't think forcing the addition of an emulated iommu in the middle, just to work around the fact that virtio "cheats" and doesn't use the DMA API unless there is one, is the right "fix".

The right long term fix is to always use the DMA API, reducing code paths etc., and just have a single point where virtio can "choose" alternate DMA ops (via an arch hook to deal with our case). In the meantime, having the hook we propose gets us going, but if you agree with the approach, we should also work on the long term approach.

Cheers,
Ben.
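[Editorial illustration of the "1:1 dma_ops" idea discussed above; this is a sketch with 2018-era struct dma_map_ops signatures, not code from any posted patch, and the virtio_direct_* names are hypothetical:]

#include <linux/dma-mapping.h>
#include <linux/mm.h>

/* Identity mapping: bus address == guest physical address. */
static dma_addr_t virtio_direct_map_page(struct device *dev,
		struct page *page, unsigned long offset, size_t size,
		enum dma_data_direction dir, unsigned long attrs)
{
	return page_to_phys(page) + offset;
}

/* Nothing to undo for an identity mapping. */
static void virtio_direct_unmap_page(struct device *dev, dma_addr_t addr,
		size_t size, enum dma_data_direction dir, unsigned long attrs)
{
}

static const struct dma_map_ops virtio_direct_dma_ops = {
	.map_page	= virtio_direct_map_page,
	.unmap_page	= virtio_direct_unmap_page,
	/* A complete version would also provide map_sg, alloc, free, etc. */
};

With something like this in place, the legacy no-VIRTIO_F_IOMMU_PLATFORM behaviour reduces to identity dma_ops, and a platform hook (e.g. for secure guests) only has to swap in different ops, such as swiotlb-backed ones that bounce through insecure memory, instead of special-casing virtio in the core.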
Re: [RFC V2] virtio: Add platform specific DMA API translation for virtio devices
On Fri, 2018-05-25 at 20:45 +0300, Michael S. Tsirkin wrote:
> On Thu, May 24, 2018 at 08:27:04AM +1000, Benjamin Herrenschmidt wrote:
> > On Wed, 2018-05-23 at 21:50 +0300, Michael S. Tsirkin wrote:
> > >
> > > I re-read that discussion and I'm still unclear on the
> > > original question, since I got several apparently
> > > conflicting answers.
> > >
> > > I asked:
> > >
> > > 	Why isn't setting VIRTIO_F_IOMMU_PLATFORM on the
> > > 	hypervisor side sufficient?
> >
> > I thought I had replied to this...
> >
> > There are a couple of reasons:
> >
> > - First, qemu doesn't know that the guest will switch to "secure mode"
> > in advance. There is no difference between a normal and a secure
> > partition until the partition does the magic UV call to "enter secure
> > mode", and qemu doesn't see any of it. So who can set the flag here?
>
> Not sure I understand. Just set the flag e.g. on the qemu command line.
> I might be wrong, but these secure mode things usually
> a. require hypervisor side tricks anyway

The way our secure mode architecture is designed, there doesn't need to be any knowledge at the qemu level whatsoever at this point. Well, at least until we do migration, but that's a different kettle of fish.

In any case, the guest starts normally (which means as a non-secure guest, and thus expects normal virtio; our FW today doesn't handle VIRTIO_F_IOMMU_PLATFORM, though granted, we can fix this), and later that guest issues some special Ultravisor call that turns it into a secure guest.

There is some involvement of the hypervisor, but not qemu, at this stage. We would very much like to avoid that, as it would be a hassle for users to have to use different libvirt options etc. because the guest might turn itself into a secure VM.

> > - Second, when using VIRTIO_F_IOMMU_PLATFORM, we also make qemu (or
> > vhost) go through the emulated MMIO for every access to the guest,
> > which adds additional overhead.
>
> Well it's not supposed to be much slower for the static case.
>
> vhost has a cache so should be fine.
>
> A while ago Paolo implemented a translation cache which should be
> perfect for this case - most of the code got merged but
> never enabled because of stability issues.
>
> If all else fails, we could teach QEMU to handle the no-iommu case
> as if VIRTIO_F_IOMMU_PLATFORM was off.

Any serious reason not to just take that 2-line patch allowing our arch code to force virtio to use the DMA API?

It's not particularly invasive and solves our problem rather nicely without adding overhead or additional knowledge to qemu/libvirt/management tools etc. that they don't need.

The guest knows it's going secure, so the guest arch code can do the right thing rather trivially.

Long term, we should probably make virtio always use the DMA API anyway, and interpose "1:1" dma_ops for the traditional virtio case; that would reduce code clutter significantly. In that case, it would become just a matter of having a platform hook to override the dma_ops used.

Cheers,
Ben.
> > > >
> > > >  arch/powerpc/include/asm/dma-mapping.h | 6 ++
> > > >  arch/powerpc/platforms/pseries/iommu.c | 11 +++
> > > >  drivers/virtio/virtio_ring.c           | 10 ++
> > > >  3 files changed, 27 insertions(+)
> > > >
> > > > diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h
> > > > index 8fa3945..056e578 100644
> > > > --- a/arch/powerpc/include/asm/dma-mapping.h
> > > > +++ b/arch/powerpc/include/asm/dma-mapping.h
> > > > @@ -115,4 +115,10 @@ extern u64 __dma_get_required_mask(struct device *dev);
> > > >  #define ARCH_HAS_DMA_MMAP_COHERENT
> > > >
> > > >  #endif /* __KERNEL__ */
> > > > +
> > > > +#define platform_forces_virtio_dma platform_forces_virtio_dma
> > > > +
> > > > +struct virtio_device;
> > > > +
> > > > +extern bool platform_forces_virtio_dma(struct virtio_device *vdev);
> > > >  #endif /* _ASM_DMA_MAPPING_H */
> > > > diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> > > > index 06f0296..a2ec15a 100644
> > > > --- a/arch/powerpc/platforms/pseries/iommu.c
> > > > +++ b/arch/powerpc/platforms/pseries/iommu.c
> > > > @@ -38,6 +38,7 @@
> > > >  #include
> > > >  #include
> > > >  #include
> > > > +#include
> > > >  #include
> > > >  #include
> > > >  #include
> > > > @@ -1396,3 +1397,13 @@ static int __init disable_multitce(char *str)
> > > >  __setup("multitce=", disable_multitce);
> > > >
> > > >  machine_subsys_initcall_sync(pseries, tce_iommu_bus_notifier_init);
> > > > +
> > > > +bool platform_forces_virtio_dma(struct virtio_device *vdev)
> > > > +{
> > > > +	/*
> > > > +	 * On protected guest platforms, force virtio core to use DMA
> > > > +	 * MAP API for all virtio devices. But there can also be some
> > > > +	 * exceptions for individual devices like virtio balloon.
> > > > +	 */
> > > > +
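[Editorial note: the drivers/virtio/virtio_ring.c hunk (10 added lines per the diffstat) is not quoted above. Presumably it gives the hook a weak default and consults it from vring_use_dma_api(); a reconstruction under that assumption, not the actual hunk, might look like:]

/* Fallback when the arch header doesn't define the hook; the
 * "#define platform_forces_virtio_dma platform_forces_virtio_dma"
 * in the powerpc header above exists to suppress this default. */
#ifndef platform_forces_virtio_dma
static inline bool platform_forces_virtio_dma(struct virtio_device *vdev)
{
	return false;
}
#endif

static bool vring_use_dma_api(struct virtio_device *vdev)
{
	if (!virtio_has_iommu_quirk(vdev))
		return true;

	/* Let the platform force the DMA API, e.g. for protected
	 * guests that must bounce via insecure memory. */
	if (platform_forces_virtio_dma(vdev))
		return true;

	/* Other checks (e.g. Xen) omitted in this sketch. */
	return false;
}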