> -----Original Message-----
> From: Eugenio Perez Martin <[email protected]>
> Sent: December 23, 2025 2:20
> To: Wafer <[email protected]>
> Cc: [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; Angus Chen
> <[email protected]>
> Subject: Re: [PATCH v3 2/4] vdpa: implement a statically allocated buffer for
> SVQ
> 
> On Mon, Dec 22, 2025 at 6:10 AM Wafer <[email protected]> wrote:
> >
> >
> >
> > > -----Original Message-----
> > > From: Eugenio Perez Martin <[email protected]>
> > > Sent: December 19, 2025 16:12
> > > To: Wafer <[email protected]>
> > > Cc: [email protected]; [email protected]; [email protected];
> > > [email protected]; [email protected]; Angus Chen
> > > <[email protected]>
> > > Subject: Re: [PATCH v3 2/4] vdpa: implement a statically allocated
> > > buffer for SVQ
> > >
> > > On Tue, Dec 16, 2025 at 2:55 AM Wafer Xie <[email protected]> wrote:
> > > >
> > > > From: wafer Xie <[email protected]>
> > > >
> > > > The indirect buffer is allocated and initialized when creating the
> > > > vhost-vdpa device, and released when vhost-vdpa is stopped.
> > > >
> > > > Signed-off-by: wafer Xie <[email protected]>
> > > > ---
> > > >  hw/virtio/vhost-shadow-virtqueue.c |  25 +++++
> > > >  hw/virtio/vhost-vdpa.c             | 163 ++++++++++++++++++++++++++++-
> > > >  2 files changed, 187 insertions(+), 1 deletion(-)
> > > >
> > > > diff --git a/hw/virtio/vhost-shadow-virtqueue.c
> > > > b/hw/virtio/vhost-shadow-virtqueue.c
> > > > index 2481d49345..85eff1d841 100644
> > > > --- a/hw/virtio/vhost-shadow-virtqueue.c
> > > > +++ b/hw/virtio/vhost-shadow-virtqueue.c
> > > > @@ -708,6 +708,25 @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
> > > >      for (unsigned i = 0; i < svq->vring.num - 1; i++) {
> > > >          svq->desc_next[i] = i + 1;
> > > >      }
> > > > +
> > > > +    /* Initialize indirect descriptor state */
> > > > +    svq->indirect_enabled = false;
> > > > +    svq->current_indirect_buf = -1;
> > > > +    for (int i = 0; i < SVQ_NUM_INDIRECT_BUFS; i++) {
> > > > +        svq->indirect_bufs[i].desc = NULL;
> > > > +        svq->indirect_bufs[i].iova = 0;
> > > > +        svq->indirect_bufs[i].size = 0;
> > > > +        svq->indirect_bufs[i].state = SVQ_INDIRECT_BUF_FREED;
> > > > +        svq->indirect_bufs[i].num_descs = 0;
> > > > +        svq->indirect_bufs[i].freed_descs = 0;
> > > > +        svq->indirect_bufs[i].freeing_descs = 0;
> > > > +        svq->indirect_bufs[i].freed_head = 0;
> > > > +    }
> > > > +
> > > > +    /* Initialize desc_state indirect_buf_idx to -1 */
> > > > +    for (unsigned i = 0; i < svq->vring.num; i++) {
> > > > +        svq->desc_state[i].indirect_buf_idx = -1;
> > > > +    }
> > > >  }
> > > >
> > > >  /**
> > > > @@ -748,6 +767,10 @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
> > > >      munmap(svq->vring.desc, vhost_svq_driver_area_size(svq));
> > > >      munmap(svq->vring.used, vhost_svq_device_area_size(svq));
> > > >      event_notifier_set_handler(&svq->hdev_call, NULL);
> > > > +
> > > > +    /* Reset indirect descriptor state (memory is freed by vhost-vdpa) */
> > > > +    svq->indirect_enabled = false;
> > > > +    svq->current_indirect_buf = -1;
> > > >  }
> > > >
> > > >  /**
> > > > @@ -765,6 +788,8 @@ VhostShadowVirtqueue *vhost_svq_new(const VhostShadowVirtqueueOps *ops,
> > > >      event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND);
> > > >      svq->ops = ops;
> > > >      svq->ops_opaque = ops_opaque;
> > > > +    svq->indirect_enabled = false;
> > > > +    svq->current_indirect_buf = -1;
> > > >      return svq;
> > > >  }
> > > >
> > > > diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> > > > index 7061b6e1a3..816868d153 100644
> > > > --- a/hw/virtio/vhost-vdpa.c
> > > > +++ b/hw/virtio/vhost-vdpa.c
> > > > @@ -584,6 +584,155 @@ static int vhost_vdpa_get_dev_features(struct vhost_dev *dev,
> > > >      return ret;
> > > >  }
> > > >
> > > > +/**
> > > > + * Allocate a single indirect descriptor buffer for SVQ
> > > > + *
> > > > + * @v: vhost_vdpa instance
> > > > + * @num_descs: Number of descriptors (should be vring.num)
> > > > + * @buf: Output buffer structure to fill
> > > > + *
> > > > + * Returns true on success, false on failure.
> > > > + */
> > > > +static bool vhost_vdpa_alloc_indirect_buf(struct vhost_vdpa *v,
> > > > +                                           uint16_t num_descs,
> > > > +                                           SVQIndirectDescBuf *buf)
> > > > +{
> > > > +    size_t desc_size = sizeof(vring_desc_t) * num_descs;
> > > > +    size_t alloc_size = ROUND_UP(desc_size, qemu_real_host_page_size());
> > > > +    DMAMap needle = {
> > > > +        .size = alloc_size - 1,
> > > > +        .perm = IOMMU_RO,
> > > > +    };
> > > > +    vring_desc_t *indirect_desc;
> > > > +    int r;
> > > > +
> > > > +    /* Allocate memory for indirect descriptors */
> > > > +    indirect_desc = mmap(NULL, alloc_size, PROT_READ | PROT_WRITE,
> > > > +                         MAP_SHARED | MAP_ANONYMOUS, -1, 0);
> > > > +    if (indirect_desc == MAP_FAILED) {
> > > > +        error_report("Cannot allocate indirect descriptor buffer");
> > > > +        return false;
> > > > +    }
> > > > +
> > > > +    /* Allocate IOVA for the indirect descriptor buffer */
> > > > +    r = vhost_iova_tree_map_alloc(v->shared->iova_tree, &needle,
> > > > +                                  (hwaddr)(uintptr_t)indirect_desc);
> > > > +    if (unlikely(r != IOVA_OK)) {
> > > > +        error_report("Cannot allocate iova for indirect descriptors 
> > > > (%d)",
> r);
> > > > +        munmap(indirect_desc, alloc_size);
> > > > +        return false;
> > > > +    }
> > > > +
> > > > +    /* Map to device */
> > > > +    r = vhost_vdpa_dma_map(v->shared, v->address_space_id, needle.iova,
> > > > +                           alloc_size, indirect_desc, true);
> > > > +    if (unlikely(r != 0)) {
> > > > +        error_report("Cannot map indirect descriptors to device: %s 
> > > > (%d)",
> > > > +                     g_strerror(-r), -r);
> > > > +        vhost_iova_tree_remove(v->shared->iova_tree, needle);
> > > > +        munmap(indirect_desc, alloc_size);
> > > > +        return false;
> > > > +    }
> > > > +
> > > > +    buf->desc = indirect_desc;
> > > > +    buf->iova = needle.iova;
> > > > +    buf->size = alloc_size;
> > > > +    buf->state = SVQ_INDIRECT_BUF_FREED;
> > > > +    buf->num_descs = num_descs;
> > > > +    buf->freed_descs = num_descs;  /* All descriptors are free initially */
> > > > +    buf->freeing_descs = 0;
> > > > +    buf->freed_head = 0;
> > > > +
> > > > +    return true;
> > > > +}
> > > > +
> > > > +/**
> > > > + * Free a single indirect descriptor buffer
> > > > + *
> > > > + * @v: vhost_vdpa instance
> > > > + * @buf: Buffer to free
> > > > + */
> > > > +static void vhost_vdpa_free_indirect_buf(struct vhost_vdpa *v,
> > > > +                                          SVQIndirectDescBuf *buf)
> > > > +{
> > > > +    DMAMap needle;
> > > > +    const DMAMap *result;
> > > > +    int r;
> > > > +
> > > > +    if (!buf->desc) {
> > > > +        return;
> > > > +    }
> > > > +
> > > > +    needle = (DMAMap) {
> > > > +        .translated_addr = (uint64_t)(uintptr_t)buf->desc,
> > > > +    };
> > > > +    result = vhost_iova_tree_find_iova(v->shared->iova_tree, &needle);
> > > > +
> > > > +    if (result) {
> > > > +        r = vhost_vdpa_dma_unmap(v->shared, v->address_space_id,
> > > > +                                 result->iova, buf->size);
> > > > +        if (unlikely(r != 0)) {
> > > > +            error_report("Cannot unmap indirect descriptors: %s (%d)",
> > > > +                         g_strerror(-r), -r);
> > > > +        }
> > > > +
> > > > +        vhost_iova_tree_remove(v->shared->iova_tree, *result);
> > > > +    }
> > > > +
> > > > +    munmap(buf->desc, buf->size);
> > > > +    buf->desc = NULL;
> > > > +    buf->iova = 0;
> > > > +    buf->size = 0;
> > > > +}
> > > > +
> > > > +/**
> > > > + * Initialize indirect descriptor buffers for a single SVQ
> > > > + *
> > > > + * @v: vhost_vdpa instance
> > > > + * @svq: Shadow virtqueue to initialize
> > > > + *
> > > > + * Returns true on success, false on failure.
> > > > + */
> > > > +static bool vhost_vdpa_svq_init_indirect(struct vhost_vdpa *v,
> > > > +                                          VhostShadowVirtqueue *svq)
> > > > +{
> > > > +    if (!svq->vring.num) {
> > > > +        return true;
> > > > +    }
> > > > +
> > > > +    /* Allocate indirect descriptor buffers for this SVQ */
> > > > +    for (int j = 0; j < SVQ_NUM_INDIRECT_BUFS; j++) {
> > > > +        if (!vhost_vdpa_alloc_indirect_buf(v, svq->vring.num * 4,
> > > > +                                           &svq->indirect_bufs[j])) {
> > >
> > > Why not allocate a single array of size "SVQ_NUM_INDIRECT_BUFS *
> > > svq->vring.num * 4"? It should help management and maybe even
> > > performance as memory accesses are more predictable.
> > >
> >
> > We cannot assume or rely on the device to populate the used ring in the
> > same order as the available ring.
> > With a single large array, this may introduce holes, which makes managing
> > freed descriptors difficult.
> > In addition, the virtio specification requires indirect descriptor addresses
> > to be contiguous, which further complicates management with a single array.
> >
> 
> Actually you made me review QEMU's virtqueue_pop and vhost code
> and it does not agree with the standard, so thanks for that :). As you 
> correctly
> point out, the standard says:
> 
> The first descriptor is located at the start of the indirect descriptor table,
> additional indirect descriptors come immediately afterwards. The
> VIRTQ_DESC_F_WRITE flags bit is the only valid flag for descriptors in the
> indirect table. Others are reserved and are ignored by the device. Buffer ID 
> is
> also reserved and is ignored by the device.
> 

This is the packed indirect descriptors specification.

The split indirect descriptors specification is slightly different, as quoted below:

The first indirect descriptor is located at start of the indirect descriptor 
table (index 0), additional indirect descriptors are chained by next. 
An indirect descriptor without a valid next (with flags&VIRTQ_DESC_F_NEXT off) 
signals the end of the descriptor. 
A single indirect descriptor table can include both device-readable and 
device-writable descriptors.

If VIRTIO_F_IN_ORDER has been negotiated, indirect descriptors use sequential 
indices, in-order: index 0 followed by index 1 followed by index 2, etc.

https://docs.oasis-open.org/virtio/virtio/v1.3/csd01/virtio-v1.3-csd01.html#x1-430005
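
To make the difference concrete, here is a small self-contained sketch (not
QEMU code; the struct and helper names are made up for illustration) of how a
device walks the two kinds of indirect tables: the split table is chained by
next until VIRTQ_DESC_F_NEXT is clear, while the packed table is read
sequentially from index 0 because next and Buffer ID are reserved there.

#include <stdint.h>
#include <stddef.h>

#define VIRTQ_DESC_F_NEXT  1
#define VIRTQ_DESC_F_WRITE 2

/* Split-ring descriptor layout (endianness handling omitted) */
struct split_desc {
    uint64_t addr;
    uint32_t len;
    uint16_t flags;
    uint16_t next;
};

/* Packed-ring descriptor layout */
struct packed_desc {
    uint64_t addr;
    uint32_t len;
    uint16_t id;
    uint16_t flags;
};

/*
 * Split ring: the entries of an indirect table are chained by 'next'; a
 * descriptor without VIRTQ_DESC_F_NEXT ends the chain.  Returns the chain
 * length, or 0 for a malformed table.
 */
static size_t walk_split_indirect(const struct split_desc *table,
                                  size_t table_len)
{
    size_t count = 0;
    uint16_t i = 0;

    while (count < table_len) {
        count++;
        if (!(table[i].flags & VIRTQ_DESC_F_NEXT)) {
            return count;              /* end of chain */
        }
        if (table[i].next >= table_len) {
            return 0;                  /* next points outside the table */
        }
        i = table[i].next;             /* follow the chain inside the table */
    }
    return 0;                          /* loop detected */
}

/*
 * Packed ring: 'next' does not exist and Buffer ID is reserved inside an
 * indirect table, so the device simply reads the table front to back.
 */
static size_t walk_packed_indirect(const struct packed_desc *table,
                                   size_t table_len)
{
    for (size_t i = 0; i < table_len; i++) {
        /* only VIRTQ_DESC_F_WRITE is meaningful for these entries */
        (void)(table[i].flags & VIRTQ_DESC_F_WRITE);
    }
    return table_len;
}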

> But in the do {} while after the "/* Collect all the descriptors */"
> comment in virtqueue_pop, it follows the _F_NEXT field and
> desc->next, as it is traversing a descriptor chain, and it should
> ignore them if it is processing the indirect table. I think that should be 
> fixed,
> but maybe we can break some driver? Not Linux's one for sure. Jason, MST,
> what do you think? If we should fix the device's code, would you like to
> submit a patch, Wafer?
> 
> Now regarding the patch. We need the indirect descriptors to be contiguous
> in memory, but my point is that having many different tables makes it harder
> to manage. A good first step would be to just allocate a buffer of that size
> and then divide it into
> svq->indirect_bufs[i].
> 
> Instead of:
> svq->indirect_buf[0].iova = mmap(size) && vhost_dma_map(...) ...
> svq->indirect_buf[1].iova = mmap(size) && vhost_dma_map(...) ...
> 
> At least:
> buf = mmap(size*2) && vhost_dma_map(...) ...
> svq->indirect_buf[0].iova = buf
> svq->indirect_buf[1].iova = buf + size
> 
> So the translation tree etc. is smaller, QEMU needs to do fewer ioctls, fewer
> memory transactions, etc.
> 
> Even with that, we're losing opportunities of finding contiguous free
> descriptors with many SVQIndirectDescBuf instead of a bigger single one able
> to reuse the free holes. Let's say the SVQIndirectDescBuf are
> 4 descriptors long for simplicity. Now the driver makes available one buffer
> that spans three descriptors (step 1), another one that spans three descriptors
> (step 2), and another buffer that spans two descriptors (step 3).
> 
> In this series the SVQ places the first indirect table in the first
> SVQIndirectDescBuf, the second table at the second SVQIndirectDescBuf,
> and then the third one must resort to chained descriptors. If we make just
> one big buffer of size 8, all of them fit in the indirect table; we just need
> to mark it as free.
> 
> Actually, as long as the device does not mark one descriptor as used, we lose
> all the SVQIndirectDescBuf from the time buf->freed_head reaches the end.
> 

Thank you for the suggestion!
This is a good approach and allows the array resources to be utilized more 
efficiently.
I understand your point and will improve this in the next version.
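
For discussion, here is a rough sketch of the direction I have in mind for the
next version (error handling trimmed; the per-table sizing and the exact field
updates are only illustrative, while the helper and field names are the ones
this series already uses): one mmap() and one DMA map for the whole region,
then every SVQIndirectDescBuf becomes an offset into it.

static bool vhost_vdpa_svq_init_indirect(struct vhost_vdpa *v,
                                         VhostShadowVirtqueue *svq)
{
    size_t table_size = sizeof(vring_desc_t) * svq->vring.num;
    size_t region_size = ROUND_UP(table_size * SVQ_NUM_INDIRECT_BUFS,
                                  qemu_real_host_page_size());
    DMAMap needle = {
        .size = region_size - 1,
        .perm = IOMMU_RO,
    };
    vring_desc_t *region;
    int r;

    /* One allocation for all the indirect tables of this SVQ */
    region = mmap(NULL, region_size, PROT_READ | PROT_WRITE,
                  MAP_SHARED | MAP_ANONYMOUS, -1, 0);
    if (region == MAP_FAILED) {
        return false;
    }

    /* One IOVA allocation and one map ioctl for the whole region */
    r = vhost_iova_tree_map_alloc(v->shared->iova_tree, &needle,
                                  (hwaddr)(uintptr_t)region);
    if (unlikely(r != IOVA_OK)) {
        munmap(region, region_size);
        return false;
    }
    r = vhost_vdpa_dma_map(v->shared, v->address_space_id, needle.iova,
                           region_size, region, true);
    if (unlikely(r != 0)) {
        vhost_iova_tree_remove(v->shared->iova_tree, needle);
        munmap(region, region_size);
        return false;
    }

    /* Slice the region: every buffer is just an offset into the same map */
    for (int i = 0; i < SVQ_NUM_INDIRECT_BUFS; i++) {
        svq->indirect_bufs[i].desc = region + i * svq->vring.num;
        svq->indirect_bufs[i].iova = needle.iova + i * table_size;
        svq->indirect_bufs[i].size = table_size;
        svq->indirect_bufs[i].state = SVQ_INDIRECT_BUF_FREED;
        svq->indirect_bufs[i].num_descs = svq->vring.num;
        svq->indirect_bufs[i].freed_descs = svq->vring.num;
        svq->indirect_bufs[i].freeing_descs = 0;
        svq->indirect_bufs[i].freed_head = 0;
    }

    svq->indirect_enabled = true;
    svq->current_indirect_buf = 0;
    return true;
}

The cleanup side would then do the DMA unmap, the IOVA tree removal and the
munmap() once for the whole region instead of once per buffer.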

> Some ideas for that:
> * Reuse IOVATree to handle the indirect table. It already does the linear search
> on it, but the allocation and free may hurt performance.
> Still, it should be able to find contiguous buffers if any.
> * Use it in a circular way but store some data in desc_state so we can
> coalesce the free entries. I can expand on that if needed.
> 
> If we find that this approach still works better, I think we should have
> many SVQIndirectDescBuf of queue length, which is the maximum indirect
> table length. We can handle them as a free / used stack, so looking for a free
> entry would be cheaper. What do you think about that?
> 
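
If it helps, this is roughly how I read the free / used stack idea (purely
illustrative, none of these names exist in the series): keep the indices of
the currently free SVQIndirectDescBuf entries on a small stack, so taking and
returning a whole table is O(1) instead of scanning for a free entry.

/* Illustrative only: a stack of free indirect-table indices */
typedef struct SVQIndirectFreeStack {
    uint16_t free_idx[SVQ_NUM_INDIRECT_BUFS];   /* indices of free buffers */
    uint16_t top;                               /* number of free entries */
} SVQIndirectFreeStack;

static void indirect_stack_init(SVQIndirectFreeStack *s)
{
    for (uint16_t i = 0; i < SVQ_NUM_INDIRECT_BUFS; i++) {
        s->free_idx[i] = i;
    }
    s->top = SVQ_NUM_INDIRECT_BUFS;
}

/* Take a free table; -1 means fall back to chained descriptors */
static int indirect_stack_pop(SVQIndirectFreeStack *s)
{
    return s->top ? s->free_idx[--s->top] : -1;
}

/* Return a table once the device has used all of its descriptors */
static void indirect_stack_push(SVQIndirectFreeStack *s, uint16_t idx)
{
    s->free_idx[s->top++] = idx;
}
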
> > > > +            /* Cleanup already allocated buffers */
> > > > +            for (int k = 0; k < j; k++) {
> > > > +                vhost_vdpa_free_indirect_buf(v, &svq->indirect_bufs[k]);
> > > > +            }
> > > > +            return false;
> > > > +        }
> > > > +    }
> > > > +
> > > > +    svq->indirect_enabled = true;
> > > > +    svq->current_indirect_buf = 0;
> > > > +    return true;
> > > > +}
> > > > +
> > > > +/**
> > > > + * Cleanup indirect descriptor buffers for a single SVQ
> > > > + *
> > > > + * @v: vhost_vdpa instance
> > > > + * @svq: Shadow virtqueue to cleanup
> > > > + */
> > > > +static void vhost_vdpa_svq_cleanup_indirect(struct vhost_vdpa *v,
> > > > +                                            VhostShadowVirtqueue *svq)
> > > > +{
> > > > +    for (int j = 0; j < SVQ_NUM_INDIRECT_BUFS; j++) {
> > > > +        vhost_vdpa_free_indirect_buf(v, &svq->indirect_bufs[j]);
> > > > +    }
> > > > +    svq->indirect_enabled = false;
> > > > +    svq->current_indirect_buf = -1;
> > > > +}
> > > > +
> > > >  static void vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v)
> > > >  {
> > > >      g_autoptr(GPtrArray) shadow_vqs = NULL;
> > > > @@ -791,8 +940,11 @@ static void vhost_vdpa_svq_cleanup(struct vhost_dev *dev)
> > > >      size_t idx;
> > > >
> > > >      for (idx = 0; idx < v->shadow_vqs->len; ++idx) {
> > > > -        vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, idx));
> > > > +        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, idx);
> > > > +        vhost_vdpa_svq_cleanup_indirect(v, svq);
> > >
> > > All usages of vhost_vdpa_svq_cleanup_indirect go with vhost_svq_stop.
> > > We can call vhost_vdpa_svq_cleanup_indirect from vhost_svq_stop or
> > > even embed it.
> > >
> >
> > Thank you for the suggestion. I will modify this in the next version of the
> > patch.
> >
> > However, I have one question: in
> > hw/virtio/vhost-vdpa.c:vhost_vdpa_svq_cleanup,
> > why does it only call vhost_svq_stop(svq) without calling
> > vhost_vdpa_svq_unmap_rings(dev, svq)?
> >
> 
> We could call it, but the device is about to be totally destroyed, either
> because it has been deleted from QEMU or because QEMU itself is shutting
> down. The only action QEMU can do afterwards is to close the descriptor, so
> the device should clean the maps anyway. But maybe I'm missing something
> here.
> 
> > >
> > > > +        vhost_svq_stop(svq);
> > > >      }
> > > > +
> > > >      g_ptr_array_free(v->shadow_vqs, true);
> > > >  }
> > > >
> > > > @@ -1299,6 +1451,13 @@ static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
> > > >              error_setg_errno(&err, -r, "Cannot set device address");
> > > >              goto err_set_addr;
> > > >          }
> > > > +
> > > > +        /* Initialize indirect descriptor buffers for this SVQ */
> > > > +        if (!vhost_vdpa_svq_init_indirect(v, svq)) {
> > > > +            /* Non-fatal: will fallback to chain mode */
> > > > +            warn_report("Cannot initialize indirect descriptor for SVQ %u",
> > > > +                virtio_get_queue_index(vq));
> > > > +        }
> > > >      }
> > > >
> > > >      return true;
> > > > @@ -1313,6 +1472,7 @@ err:
> > > >      error_reportf_err(err, "Cannot setup SVQ %u: ", i);
> > > >      for (unsigned j = 0; j < i; ++j) {
> > > >          VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, j);
> > > > +        vhost_vdpa_svq_cleanup_indirect(v, svq);
> > > >          vhost_vdpa_svq_unmap_rings(dev, svq);
> > > >          vhost_svq_stop(svq);
> > > >      }
> > > > @@ -1331,6 +1491,7 @@ static void vhost_vdpa_svqs_stop(struct vhost_dev *dev)
> > > >      for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
> > > >          VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
> > > >
> > > > +        vhost_vdpa_svq_cleanup_indirect(v, svq);
> > > >          vhost_svq_stop(svq);
> > > >          vhost_vdpa_svq_unmap_rings(dev, svq);
> > > >
> > > > --
> > > > 2.48.1
> > > >
> >
