On Tue, Dec 16, 2025 at 2:59 AM Wafer Xie <[email protected]> wrote:
>
> From: wafer Xie <[email protected]>
>
> Retrieve the target buffer from the indirect buffer array by index,
> write the elements sent by the guest into the buffer's indirect
> descriptor table, and update freed_head and freed_descs. If freed_descs
> is zero, or the current buffer's freed_descs is less than the number of
> descriptors the element needs, move the buffer to the
> SVQ_INDIRECT_BUF_FREEING state.
>
> Signed-off-by: wafer Xie <[email protected]>
> ---
> hw/virtio/vhost-shadow-virtqueue.c | 257 ++++++++++++++++++++++++++---
> 1 file changed, 235 insertions(+), 22 deletions(-)
>
> diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> index adee52f50b..c6064fb839 100644
> --- a/hw/virtio/vhost-shadow-virtqueue.c
> +++ b/hw/virtio/vhost-shadow-virtqueue.c
> @@ -189,36 +189,246 @@ static bool vhost_svq_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg,
> return true;
> }
>
> -static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
> +/**
> + * Write descriptors to indirect descriptor table
> + *
> + * @svq: The shadow virtqueue
> + * @sg: Cache for hwaddr
> + * @iovec: The iovec from the guest
> + * @num: iovec length
> + * @addr: Descriptors' GPAs, if backed by guest memory
> + * @buf: The indirect descriptor buffer
> + * @offset_idx: Offset for write position
> + * and next field (0 for out, out_num for in)
> + * @more_descs: True if more descriptors come in the chain
> + * @write: True if they are writeable descriptors
> + *
> + * Return true if success, false otherwise and print error.
> + */
> +static bool vhost_svq_vring_write_indirect_descs(VhostShadowVirtqueue *svq,
> + hwaddr *sg,
> + const struct iovec *iovec,
> + size_t num,
> + const hwaddr *addr,
> + SVQIndirectDescBuf *buf,
> + size_t offset_idx,
> + bool more_descs,
> + bool write)
> +{
> + bool ok;
> + uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0;
> + vring_desc_t *descs = buf->desc;
> + uint16_t i = buf->freed_head + offset_idx;
> +
> + if (num == 0) {
> + return true;
> + }
> +
> + ok = vhost_svq_translate_addr(svq, sg, iovec, num, addr);
> + if (unlikely(!ok)) {
> + return false;
> + }
> +
> + for (size_t n = 0; n < num; n++) {
> + descs[i].addr = cpu_to_le64(sg[n]);
> + descs[i].len = cpu_to_le32(iovec[n].iov_len);
> + if (more_descs || (n + 1 < num)) {
> + descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT);
> + descs[i].next = cpu_to_le16(offset_idx + n + 1);
> + } else {
> + descs[i].flags = flags;
> + }
> + i++;
> + }
> +
> + return true;
Most of this function is the same as the current vhost_svq_vring_write_descs
in master. Why not factor out a common helper that takes the vring_desc_t *
table as a parameter and have the two of them call it?
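
Something like this untested sketch (the name and the "sequential" parameter
are mine; the only real differences between the two paths are the descriptor
table and how "next" is computed):

static bool vhost_svq_vring_write_descs_common(VhostShadowVirtqueue *svq,
                                               vring_desc_t *descs,
                                               uint16_t head, bool sequential,
                                               hwaddr *sg,
                                               const struct iovec *iovec,
                                               size_t num, const hwaddr *addr,
                                               bool more_descs, bool write)
{
    uint16_t i = head;
    uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0;
    bool ok;

    if (num == 0) {
        return true;
    }

    ok = vhost_svq_translate_addr(svq, sg, iovec, num, addr);
    if (unlikely(!ok)) {
        return false;
    }

    for (size_t n = 0; n < num; n++) {
        /* Indirect tables chain sequentially, the main ring via desc_next */
        uint16_t next = sequential ? i + 1 : le16_to_cpu(svq->desc_next[i]);

        descs[i].addr = cpu_to_le64(sg[n]);
        descs[i].len = cpu_to_le32(iovec[n].iov_len);
        if (more_descs || (n + 1 < num)) {
            descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT);
            descs[i].next = cpu_to_le16(next);
        } else {
            descs[i].flags = flags;
        }
        i = next;
    }

    return true;
}

The indirect caller would pass &buf->desc[buf->freed_head] with head ==
offset_idx, so the relative next values stay exactly as in this patch; the
direct caller passes svq->vring.desc with head == svq->free_head. The loop's
final i is then the new free_head for the direct case, so it could also be
returned through an out parameter instead of being recomputed.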
> +}
> +
> +/**
> + * Add descriptors to SVQ vring using indirect descriptors (dual-buffer)
> + *
> + * @svq: The shadow virtqueue
> + * @out_sg: The out iovec from the guest
> + * @out_num: The out iovec length
> + * @out_addr: The out descriptors' GPAs
> + * @in_sg: The in iovec from the guest
> + * @in_num: The in iovec length
> + * @in_addr: The in descriptors' GPAs
> + * @sgs: Cache for hwaddr
> + * @buf_idx: Index of the indirect buffer to use
> + *
> + * Return true if success, false otherwise and print error.
> + */
> +static bool vhost_svq_add_split_indirect(VhostShadowVirtqueue *svq,
> + const struct iovec *out_sg,
> + size_t out_num,
> + const hwaddr *out_addr,
> + const struct iovec *in_sg,
> + size_t in_num,
> + const hwaddr *in_addr,
> + hwaddr *sgs, int buf_idx)
> +{
> + SVQIndirectDescBuf *buf = &svq->indirect_bufs[buf_idx];
> + uint16_t start_idx = buf->freed_head;
> + size_t total_descs = out_num + in_num;
> + hwaddr indirect_iova;
> + bool ok;
> +
> + /* Populate indirect descriptor table for out descriptors */
> + ok = vhost_svq_vring_write_indirect_descs(svq, sgs, out_sg, out_num,
> + out_addr, buf, 0,
> + in_num > 0, false);
> + if (unlikely(!ok)) {
> + return false;
> + }
> +
> + /* Populate indirect descriptor table for in descriptors */
> + ok = vhost_svq_vring_write_indirect_descs(svq, sgs, in_sg, in_num,
> + in_addr, buf, out_num,
> + false, true);
> + if (unlikely(!ok)) {
> + return false;
> + }
> +
> + /* Calculate IOVA for this indirect descriptor range */
> + indirect_iova = buf->iova + start_idx * sizeof(vring_desc_t);
> +
> + /* Add a single descriptor pointing to the indirect table */
> + svq->vring.desc[svq->free_head].addr = cpu_to_le64(indirect_iova);
> + svq->vring.desc[svq->free_head].len =
> + cpu_to_le32(total_descs * sizeof(vring_desc_t));
> +    svq->vring.desc[svq->free_head].flags =
> +        cpu_to_le16(VRING_DESC_F_INDIRECT);
> +
> + /* Store indirect descriptor info in desc_state */
> + svq->desc_state[svq->free_head].indirect_buf_idx = buf_idx;
> +
> + /* Update buffer state */
> + buf->freed_head = start_idx + total_descs;
> + buf->freed_descs -= total_descs;
> +
> + /* Move free_head forward */
> +    svq->free_head = le16_to_cpu(svq->desc_next[svq->free_head]);
> +
> + return true;
Same here: most of this logic is duplicated in vhost_svq_add_split, even
though vhost_svq_add_split is the one calling vhost_svq_add_split_indirect.
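
For instance, if the indirect helper only filled the table and returned the
iova/len pair, the main-ring descriptor and free_head handling would live in
exactly one place. Untested sketch of the indirect branch of
vhost_svq_add_split, where vhost_svq_fill_indirect_table() is a made-up name:

    hwaddr table_iova;
    uint32_t table_len;

    if (indirect_buf_idx >= 0) {
        ok = vhost_svq_fill_indirect_table(svq, out_sg, out_num, out_addr,
                                           in_sg, in_num, in_addr, sgs,
                                           indirect_buf_idx, &table_iova,
                                           &table_len);
        if (unlikely(!ok)) {
            return -EINVAL;
        }
        /* Main-ring bookkeeping is now shared with the chain path */
        svq->vring.desc[svq->free_head].addr = cpu_to_le64(table_iova);
        svq->vring.desc[svq->free_head].len = cpu_to_le32(table_len);
        svq->vring.desc[svq->free_head].flags =
            cpu_to_le16(VRING_DESC_F_INDIRECT);
        svq->free_head = le16_to_cpu(svq->desc_next[svq->free_head]);
        *used_indirect = true;
    }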
> +}
> +
> +/**
> + * Try to get a freed indirect buffer for use
> + *
> + * @svq: The shadow virtqueue
> + * @total_descs: Number of descriptors needed
> + *
> + * Returns buffer index (0 to SVQ_NUM_INDIRECT_BUFS-1)
> + * if available, -1 if none available.
> + */
> +static int vhost_svq_get_indirect_buf(VhostShadowVirtqueue *svq,
> + size_t total_descs)
> +{
> + int cur = svq->current_indirect_buf;
> + SVQIndirectDescBuf *buf;
> +
> + if (!svq->indirect_enabled) {
> + return -1;
> + }
> +
> + /* Try current buffer first if it's in FREED state */
> + if (cur >= 0) {
> + buf = &svq->indirect_bufs[cur];
> + if (buf->state == SVQ_INDIRECT_BUF_FREED) {
> + /* Check if we have enough free descriptors */
> + if (buf->freed_descs >= total_descs) {
> + return cur;
> + }
> + /* Not enough space, switch to FREEING and try next buffer */
> + buf->state = SVQ_INDIRECT_BUF_FREEING;
> + }
> + }
> +
> + /* Try all other buffers */
> + for (int i = 0; i < SVQ_NUM_INDIRECT_BUFS; i++) {
> + if (i == cur) {
> + continue;
> + }
> + buf = &svq->indirect_bufs[i];
> + if (buf->state == SVQ_INDIRECT_BUF_FREED &&
> + buf->freed_descs >= total_descs) {
> + svq->current_indirect_buf = i;
> + return i;
> + }
> + }
> +
Why not store the free indirect descriptors as a linked list instead of a
circular buffer, the same way regular descriptors are handled? That way we
don't need to traverse the whole circular buffer. It also reduces the
complexity, as there is no need to keep state in each buffer or to have the
SVQ_INDIRECT_BUF_FREEING state at all.
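
With a free list the lookup is O(1) and the state machine goes away. Rough,
untested sketch; free_indirect_head and indirect_next are invented fields,
and it glosses over partially used buffers (the current buffer could simply
stay at the head of the list until it fills up):

static int vhost_svq_get_indirect_buf(VhostShadowVirtqueue *svq,
                                      size_t total_descs)
{
    int i = svq->free_indirect_head;        /* -1 when the list is empty */

    if (!svq->indirect_enabled || i < 0 ||
        svq->indirect_bufs[i].freed_descs < total_descs) {
        return -1;                          /* fall back to chain mode */
    }

    /* Pop the buffer from the free list */
    svq->free_indirect_head = svq->indirect_next[i];
    return i;
}

/* Called when the buffer's last in-flight request completes */
static void vhost_svq_put_indirect_buf(VhostShadowVirtqueue *svq, int i)
{
    svq->indirect_next[i] = svq->free_indirect_head;
    svq->free_indirect_head = i;
}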
> + /* All buffers unavailable, fallback to chain mode */
> + return -1;
> +}
> +
> +static int vhost_svq_add_split(VhostShadowVirtqueue *svq,
> const struct iovec *out_sg, size_t out_num,
> const hwaddr *out_addr,
> const struct iovec *in_sg, size_t in_num,
> - const hwaddr *in_addr, unsigned *head)
> + const hwaddr *in_addr, unsigned *head,
> + bool *used_indirect)
> {
> unsigned avail_idx;
> vring_avail_t *avail = svq->vring.avail;
> bool ok;
> g_autofree hwaddr *sgs = g_new(hwaddr, MAX(out_num, in_num));
> + size_t total_descs = out_num + in_num;
> + int indirect_buf_idx = -1;
>
> *head = svq->free_head;
> + *used_indirect = false;
>
> /* We need some descriptors here */
> if (unlikely(!out_num && !in_num)) {
> qemu_log_mask(LOG_GUEST_ERROR,
> "Guest provided element with no descriptors");
> - return false;
> + return -EINVAL;
> }
>
> - ok = vhost_svq_vring_write_descs(svq, sgs, out_sg, out_num, out_addr,
> - in_num > 0, false);
> - if (unlikely(!ok)) {
> - return false;
> +    /* Try to use indirect descriptors if feature is negotiated and total > 1 */
> + if (virtio_vdev_has_feature(svq->vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
> + total_descs > 1) {
> + indirect_buf_idx = vhost_svq_get_indirect_buf(svq, total_descs);
> }
>
> - ok = vhost_svq_vring_write_descs(svq, sgs, in_sg, in_num, in_addr, false,
> - true);
> - if (unlikely(!ok)) {
> - return false;
> + if (indirect_buf_idx >= 0) {
> + /* Indirect mode: only need 1 main descriptor slot */
> + if (unlikely(vhost_svq_available_slots(svq) < 1)) {
> + return -ENOSPC;
> + }
> +
> + /* Use indirect mode */
> + ok = vhost_svq_add_split_indirect(svq, out_sg, out_num, out_addr,
> + in_sg, in_num, in_addr,
> + sgs, indirect_buf_idx);
> + if (unlikely(!ok)) {
> + error_report("indirect error, out_num %zu in_num %zu "
> + "avail index %u head %u",
> + out_num, in_num, svq->shadow_avail_idx, *head);
> + return -EINVAL;
> + }
> + *used_indirect = true;
> + } else {
> + /* Chain mode: need total_descs descriptor slots */
> + if (unlikely(vhost_svq_available_slots(svq) < total_descs)) {
> + return -ENOSPC;
> + }
> +
> + /* Use direct (chain) mode */
> + svq->desc_state[svq->free_head].indirect_buf_idx = -1;
> +
> + ok = vhost_svq_vring_write_descs(svq, sgs, out_sg, out_num, out_addr,
> + in_num > 0, false);
> + if (unlikely(!ok)) {
> + return -EINVAL;
> + }
> +
> +        ok = vhost_svq_vring_write_descs(svq, sgs, in_sg, in_num, in_addr,
> +                                         false, true);
> + if (unlikely(!ok)) {
> + return -EINVAL;
> + }
> }
>
> /*
> @@ -233,7 +443,7 @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
> smp_wmb();
> avail->idx = cpu_to_le16(svq->shadow_avail_idx);
>
> - return true;
> + return 0;
> }
>
> static void vhost_svq_kick(VhostShadowVirtqueue *svq)
> @@ -249,7 +459,8 @@ static void vhost_svq_kick(VhostShadowVirtqueue *svq)
> if (virtio_vdev_has_feature(svq->vdev, VIRTIO_RING_F_EVENT_IDX)) {
> uint16_t avail_event = le16_to_cpu(
> *(uint16_t *)(&svq->vring.used->ring[svq->vring.num]));
> -        needs_kick = vring_need_event(avail_event, svq->shadow_avail_idx, svq->shadow_avail_idx - 1);
> + needs_kick = vring_need_event(avail_event, svq->shadow_avail_idx,
> + svq->shadow_avail_idx - 1);
> } else {
>          needs_kick =
>                  !(svq->vring.used->flags & cpu_to_le16(VRING_USED_F_NO_NOTIFY));
> @@ -274,19 +485,21 @@ int vhost_svq_add(VhostShadowVirtqueue *svq, const struct iovec *out_sg,
> {
> unsigned qemu_head;
> unsigned ndescs = in_num + out_num;
> - bool ok;
> + int r;
> + bool used_indirect = false;
>
> - if (unlikely(ndescs > vhost_svq_available_slots(svq))) {
> - return -ENOSPC;
> + r = vhost_svq_add_split(svq, out_sg, out_num, out_addr, in_sg, in_num,
> + in_addr, &qemu_head, &used_indirect);
> + if (unlikely(r != 0)) {
> + return r;
> }
>
> - ok = vhost_svq_add_split(svq, out_sg, out_num, out_addr, in_sg, in_num,
> - in_addr, &qemu_head);
> - if (unlikely(!ok)) {
> - return -EINVAL;
> + /* If using indirect, only 1 main descriptor is used; otherwise ndescs */
> + if (used_indirect) {
> + svq->num_free -= 1;
> + } else {
> + svq->num_free -= ndescs;
> }
> -
> - svq->num_free -= ndescs;
> svq->desc_state[qemu_head].elem = elem;
> svq->desc_state[qemu_head].ndescs = ndescs;
> vhost_svq_kick(svq);
> --
> 2.48.1
>