On Sat, Dec 27, 2025 at 6:09 PM Michael S. Tsirkin <[email protected]> wrote: > > On Fri, Dec 26, 2025 at 02:57:03PM -0500, Michael S. Tsirkin wrote: > > On Thu, Dec 25, 2025 at 12:26:08PM +0800, Jason Wang wrote: > > > This patch implements in order support for both split virtqueue and > > > packed virtqueue. Performance could be gained for the device where the > > > memory access could be expensive (e.g vhost-net or a real PCI device): > > > > > > Benchmark with KVM guest: > > > > > > Vhost-net on the host: (pktgen + XDP_DROP): > > > > > > in_order=off | in_order=on | +% > > > TX: 4.51Mpps | 5.30Mpps | +17% > > > RX: 3.47Mpps | 3.61Mpps | + 4% > > > > > > Vhost-user(testpmd) on the host: (pktgen/XDP_DROP): > > > > > > For split virtqueue: > > > > > > in_order=off | in_order=on | +% > > > TX: 5.60Mpps | 5.60Mpps | +0.0% > > > RX: 9.16Mpps | 9.61Mpps | +4.9% > > > > > > For packed virtqueue: > > > > > > in_order=off | in_order=on | +% > > > TX: 5.60Mpps | 5.70Mpps | +1.7% > > > RX: 10.6Mpps | 10.8Mpps | +1.8% > > > > > > Benchmark also shows no performance impact for in_order=off for queue > > > size with 256 and 1024. > > > > > > Reviewed-by: Eugenio Pérez <[email protected]> > > > Signed-off-by: Jason Wang <[email protected]> > > > --- > > > drivers/virtio/virtio_ring.c | 455 +++++++++++++++++++++++++++++++++-- > > > 1 file changed, 432 insertions(+), 23 deletions(-) > > > > > > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c > > > index 61884e031b94..d1bcd1d8c66b 100644 > > > --- a/drivers/virtio/virtio_ring.c > > > +++ b/drivers/virtio/virtio_ring.c > > > @@ -70,6 +70,8 @@ > > > enum vq_layout { > > > SPLIT = 0, > > > PACKED, > > > + SPLIT_IN_ORDER, > > > + PACKED_IN_ORDER, > > > VQ_TYPE_MAX, > > > }; > > > > > > @@ -80,6 +82,7 @@ struct vring_desc_state_split { > > > * allocated together. So we won't stress more to the memory > > > allocator. 
> > > */ > > > struct vring_desc *indir_desc; > > > + u32 total_in_len; > > > }; > > > > > > struct vring_desc_state_packed { > > > @@ -91,6 +94,7 @@ struct vring_desc_state_packed { > > > struct vring_packed_desc *indir_desc; > > > u16 num; /* Descriptor list length. */ > > > u16 last; /* The last desc state in a list. */ > > > + u32 total_in_len; > > > }; > > > > > > > So let's be clear. Is this the device-written length or the driver supplied > > length? > > > > > > > struct vring_desc_extra { > > > @@ -205,8 +209,24 @@ struct vring_virtqueue { > > > > > > enum vq_layout layout; > > > > > > - /* Head of free buffer list. */ > > > + /* > > > + * Without IN_ORDER it's the head of free buffer list. With > > > + * IN_ORDER and SPLIT, it's the next available buffer > > > + * index. With IN_ORDER and PACKED, it's unused. > > > + */ > > > unsigned int free_head; > > > + > > > + /* > > > + * With IN_ORDER, once we see an in-order batch, this stores > > > + * this last entry, and until we return the last buffer. > > > + * After this, id is set to UINT_MAX to mark it invalid. > > > + * Unused without IN_ORDER. > > > + */ > > > + struct used_entry { > > > + u32 id; > > > + u32 len; > > > + } batch_last; > > > + > > > /* Number we've added since last sync. */ > > > unsigned int num_added; > > > > > > @@ -218,6 +238,11 @@ struct vring_virtqueue { > > > */ > > > u16 last_used_idx; > > > > > > + /* With IN_ORDER and SPLIT, last descriptor id we used to > > > + * detach buffer. > > > + */ > > > + u16 last_used; > > > + > > > /* Hint for event idx: already triggered no need to disable. 
*/ > > > bool event_triggered; > > > > > > @@ -259,7 +284,12 @@ static void vring_free(struct virtqueue *_vq); > > > > > > static inline bool virtqueue_is_packed(const struct vring_virtqueue *vq) > > > { > > > - return vq->layout == PACKED; > > > + return vq->layout == PACKED || vq->layout == PACKED_IN_ORDER; > > > +} > > > + > > > +static inline bool virtqueue_is_in_order(const struct vring_virtqueue > > > *vq) > > > +{ > > > + return vq->layout == SPLIT_IN_ORDER || vq->layout == PACKED_IN_ORDER; > > > } > > > > > > static bool virtqueue_use_indirect(const struct vring_virtqueue *vq, > > > @@ -469,6 +499,8 @@ static void virtqueue_init(struct vring_virtqueue > > > *vq, u32 num) > > > else > > > vq->last_used_idx = 0; > > > > > > + vq->last_used = 0; > > > + > > > vq->event_triggered = false; > > > vq->num_added = 0; > > > > > > @@ -576,6 +608,8 @@ static inline int virtqueue_add_split(struct > > > vring_virtqueue *vq, > > > struct scatterlist *sg; > > > struct vring_desc *desc; > > > unsigned int i, n, avail, descs_used, err_idx, sg_count = 0; > > > + /* Total length for in-order */ > > > + unsigned int total_in_len = 0; > > > int head; > > > bool indirect; > > > > > > @@ -667,6 +701,7 @@ static inline int virtqueue_add_split(struct > > > vring_virtqueue *vq, > > > */ > > > i = virtqueue_add_desc_split(vq, desc, extra, i, addr, > > > len, flags, premapped); > > > + total_in_len += len; > > > } > > > } > > > > > > @@ -689,7 +724,12 @@ static inline int virtqueue_add_split(struct > > > vring_virtqueue *vq, > > > vq->vq.num_free -= descs_used; > > > > > > /* Update free pointer */ > > > - if (indirect) > > > + if (virtqueue_is_in_order(vq)) { > > > + vq->free_head += descs_used; > > > + if (vq->free_head >= vq->split.vring.num) > > > + vq->free_head -= vq->split.vring.num; > > > + vq->split.desc_state[head].total_in_len = total_in_len; > > > > > > Looks like total_in_len is the driver supplied length? 
> > > > > + } else if (indirect) > > > vq->free_head = vq->split.desc_extra[head].next; > > > else > > > vq->free_head = i; > > > @@ -862,6 +902,14 @@ static bool more_used_split(const struct > > > vring_virtqueue *vq) > > > return virtqueue_poll_split(vq, vq->last_used_idx); > > > } > > > > > > +static bool more_used_split_in_order(const struct vring_virtqueue *vq) > > > +{ > > > + if (vq->batch_last.id != UINT_MAX) > > > + return true; > > > + > > > + return virtqueue_poll_split(vq, vq->last_used_idx); > > > +} > > > + > > > static void *virtqueue_get_buf_ctx_split(struct vring_virtqueue *vq, > > > unsigned int *len, > > > void **ctx) > > > @@ -919,6 +967,76 @@ static void *virtqueue_get_buf_ctx_split(struct > > > vring_virtqueue *vq, > > > return ret; > > > } > > > > > > +static void *virtqueue_get_buf_ctx_split_in_order(struct vring_virtqueue > > > *vq, > > > + unsigned int *len, > > > + void **ctx) > > > +{ > > > + void *ret; > > > + unsigned int num = vq->split.vring.num; > > > + unsigned int num_free = vq->vq.num_free; > > > + u16 last_used, last_used_idx; > > > + > > > + START_USE(vq); > > > + > > > + if (unlikely(vq->broken)) { > > > + END_USE(vq); > > > + return NULL; > > > + } > > > + > > > + last_used = vq->last_used & (num - 1); > > > + last_used_idx = vq->last_used_idx & (num - 1); > > > + > > > + if (vq->batch_last.id == UINT_MAX) { > > > + if (!more_used_split_in_order(vq)) { > > > + pr_debug("No more buffers in queue\n"); > > > + END_USE(vq); > > > + return NULL; > > > + } > > > + > > > + /* > > > + * Only get used array entries after they have been > > > + * exposed by host. 
> > > + */ > > > + virtio_rmb(vq->weak_barriers); > > > + > > > + vq->batch_last.id = virtio32_to_cpu(vq->vq.vdev, > > > + > > > vq->split.vring.used->ring[last_used_idx].id); > > > + vq->batch_last.len = virtio32_to_cpu(vq->vq.vdev, > > > + > > > vq->split.vring.used->ring[last_used_idx].len); > > > + } > > > + > > > + if (vq->batch_last.id == last_used) { > > > + vq->batch_last.id = UINT_MAX; > > > + *len = vq->batch_last.len; > > > + } else { > > > + *len = vq->split.desc_state[last_used].total_in_len; > > > + } > > > > > > but now we return this as buffer length? I think the expected value > > here is the used length, not the driver supplied one? > > > > > > > > Same questions apply to packed. > > > > Ah, I got it. These are the skipped buffers: > > The skipped buffers (for which no used descriptor was written) > are assumed to have been used (read or written) by the > device completely. > > maybe a comment won't hurt here. Can be a patch on top. >
I've fixed this in the new version. Thanks.

