> -----Original Message-----
> From: Eugenio Perez Martin <[email protected]>
> Sent: December 23, 2025 2:20
> To: Wafer <[email protected]>
> Cc: [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; Angus Chen
> <[email protected]>
> Subject: Re: [PATCH v3 2/4] vdpa: implement a statically allocated buffer for SVQ
>
> On Mon, Dec 22, 2025 at 6:10 AM Wafer <[email protected]> wrote:
> >
> > > -----Original Message-----
> > > From: Eugenio Perez Martin <[email protected]>
> > > Sent: December 19, 2025 16:12
> > > To: Wafer <[email protected]>
> > > Cc: [email protected]; [email protected]; [email protected];
> > > [email protected]; [email protected]; Angus Chen
> > > <[email protected]>
> > > Subject: Re: [PATCH v3 2/4] vdpa: implement a statically allocated
> > > buffer for SVQ
> > >
> > > On Tue, Dec 16, 2025 at 2:55 AM Wafer Xie <[email protected]> wrote:
> > > >
> > > > From: wafer Xie <[email protected]>
> > > >
> > > > The indirect descriptor buffers are allocated and initialized when
> > > > creating the vhost-vdpa device, and released when vhost-vdpa is
> > > > stopped.
> > > >
> > > > Signed-off-by: wafer Xie <[email protected]>
> > > > ---
> > > >  hw/virtio/vhost-shadow-virtqueue.c |  25 +++++
> > > >  hw/virtio/vhost-vdpa.c             | 163 ++++++++++++++++++++++++++++-
> > > >  2 files changed, 187 insertions(+), 1 deletion(-)
> > > >
> > > > diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> > > > index 2481d49345..85eff1d841 100644
> > > > --- a/hw/virtio/vhost-shadow-virtqueue.c
> > > > +++ b/hw/virtio/vhost-shadow-virtqueue.c
> > > > @@ -708,6 +708,25 @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
> > > >      for (unsigned i = 0; i < svq->vring.num - 1; i++) {
> > > >          svq->desc_next[i] = i + 1;
> > > >      }
> > > > +
> > > > +    /* Initialize indirect descriptor state */
> > > > +    svq->indirect_enabled = false;
> > > > +    svq->current_indirect_buf = -1;
> > > > +    for (int i = 0; i < SVQ_NUM_INDIRECT_BUFS; i++) {
> > > > +        svq->indirect_bufs[i].desc = NULL;
> > > > +        svq->indirect_bufs[i].iova = 0;
> > > > +        svq->indirect_bufs[i].size = 0;
> > > > +        svq->indirect_bufs[i].state = SVQ_INDIRECT_BUF_FREED;
> > > > +        svq->indirect_bufs[i].num_descs = 0;
> > > > +        svq->indirect_bufs[i].freed_descs = 0;
> > > > +        svq->indirect_bufs[i].freeing_descs = 0;
> > > > +        svq->indirect_bufs[i].freed_head = 0;
> > > > +    }
> > > > +
> > > > +    /* Initialize desc_state indirect_buf_idx to -1 */
> > > > +    for (unsigned i = 0; i < svq->vring.num; i++) {
> > > > +        svq->desc_state[i].indirect_buf_idx = -1;
> > > > +    }
> > > >  }
> > > >
> > > >  /**
> > > > @@ -748,6 +767,10 @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
> > > >      munmap(svq->vring.desc, vhost_svq_driver_area_size(svq));
> > > >      munmap(svq->vring.used, vhost_svq_device_area_size(svq));
> > > >      event_notifier_set_handler(&svq->hdev_call, NULL);
> > > > +
> > > > +    /* Reset indirect descriptor state (memory is freed by vhost-vdpa) */
> > > > +    svq->indirect_enabled = false;
> > > > +    svq->current_indirect_buf = -1;
> > > >  }
> > > >
> > > >  /**
> > > > @@ -765,6 +788,8 @@ VhostShadowVirtqueue *vhost_svq_new(const VhostShadowVirtqueueOps *ops,
> > > >      event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND);
> > > >      svq->ops = ops;
> > > >      svq->ops_opaque = ops_opaque;
> > > > +    svq->indirect_enabled = false;
> > > > +    svq->current_indirect_buf = -1;
> > > >      return svq;
> > > >  }
> > > >
> > > > diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> > > > index 7061b6e1a3..816868d153 100644
> > > > --- a/hw/virtio/vhost-vdpa.c
> > > > +++ b/hw/virtio/vhost-vdpa.c
> > > > @@ -584,6 +584,155 @@ static int vhost_vdpa_get_dev_features(struct vhost_dev *dev,
> > > >      return ret;
> > > >  }
> > > >
> > > > +/**
> > > > + * Allocate a single indirect descriptor buffer for SVQ
> > > > + *
> > > > + * @v: vhost_vdpa instance
> > > > + * @num_descs: Number of descriptors (should be vring.num)
> > > > + * @buf: Output buffer structure to fill
> > > > + *
> > > > + * Returns true on success, false on failure.
> > > > + */
> > > > +static bool vhost_vdpa_alloc_indirect_buf(struct vhost_vdpa *v,
> > > > +                                          uint16_t num_descs,
> > > > +                                          SVQIndirectDescBuf *buf)
> > > > +{
> > > > +    size_t desc_size = sizeof(vring_desc_t) * num_descs;
> > > > +    size_t alloc_size = ROUND_UP(desc_size, qemu_real_host_page_size());
> > > > +    DMAMap needle = {
> > > > +        .size = alloc_size - 1,
> > > > +        .perm = IOMMU_RO,
> > > > +    };
> > > > +    vring_desc_t *indirect_desc;
> > > > +    int r;
> > > > +
> > > > +    /* Allocate memory for indirect descriptors */
> > > > +    indirect_desc = mmap(NULL, alloc_size, PROT_READ | PROT_WRITE,
> > > > +                         MAP_SHARED | MAP_ANONYMOUS, -1, 0);
> > > > +    if (indirect_desc == MAP_FAILED) {
> > > > +        error_report("Cannot allocate indirect descriptor buffer");
> > > > +        return false;
> > > > +    }
> > > > +
> > > > +    /* Allocate IOVA for the indirect descriptor buffer */
> > > > +    r = vhost_iova_tree_map_alloc(v->shared->iova_tree, &needle,
> > > > +                                  (hwaddr)(uintptr_t)indirect_desc);
> > > > +    if (unlikely(r != IOVA_OK)) {
> > > > +        error_report("Cannot allocate iova for indirect descriptors (%d)", r);
> > > > +        munmap(indirect_desc, alloc_size);
> > > > +        return false;
> > > > +    }
> > > > +
> > > > +    /* Map to device */
> > > > +    r = vhost_vdpa_dma_map(v->shared, v->address_space_id, needle.iova,
> > > > +                           alloc_size, indirect_desc, true);
> > > > +    if (unlikely(r != 0)) {
> > > > +        error_report("Cannot map indirect descriptors to device: %s (%d)",
> > > > +                     g_strerror(-r), -r);
> > > > +        vhost_iova_tree_remove(v->shared->iova_tree, needle);
> > > > +        munmap(indirect_desc, alloc_size);
> > > > +        return false;
> > > > +    }
> > > > +
> > > > +    buf->desc = indirect_desc;
> > > > +    buf->iova = needle.iova;
> > > > +    buf->size = alloc_size;
> > > > +    buf->state = SVQ_INDIRECT_BUF_FREED;
> > > > +    buf->num_descs = num_descs;
> > > > +    buf->freed_descs = num_descs; /* All descriptors are free initially */
> > > > +    buf->freeing_descs = 0;
> > > > +    buf->freed_head = 0;
> > > > +
> > > > +    return true;
> > > > +}
> > > > +
> > > > +/**
> > > > + * Free a single indirect descriptor buffer
> > > > + *
> > > > + * @v: vhost_vdpa instance
> > > > + * @buf: Buffer to free
> > > > + */
> > > > +static void vhost_vdpa_free_indirect_buf(struct vhost_vdpa *v,
> > > > +                                         SVQIndirectDescBuf *buf)
> > > > +{
> > > > +    DMAMap needle;
> > > > +    const DMAMap *result;
> > > > +    int r;
> > > > +
> > > > +    if (!buf->desc) {
> > > > +        return;
> > > > +    }
> > > > +
> > > > +    needle = (DMAMap) {
> > > > +        .translated_addr = (uint64_t)(uintptr_t)buf->desc,
> > > > +    };
> > > > +    result = vhost_iova_tree_find_iova(v->shared->iova_tree, &needle);
> > > > +
> > > > +    if (result) {
> > > > +        r = vhost_vdpa_dma_unmap(v->shared, v->address_space_id,
> > > > +                                 result->iova, buf->size);
> > > > +        if (unlikely(r != 0)) {
> > > > +            error_report("Cannot unmap indirect descriptors: %s (%d)",
> > > > +                         g_strerror(-r), -r);
> > > > +        }
> > > > +
> > > > +        vhost_iova_tree_remove(v->shared->iova_tree, *result);
> > > > +    }
> > > > +
> > > > +    munmap(buf->desc, buf->size);
> > > > +    buf->desc = NULL;
> > > > +    buf->iova = 0;
> > > > +    buf->size = 0;
> > > > +}
> > > > +
> > > > +/**
> > > > + * Initialize indirect descriptor buffers for a single SVQ
> > > > + *
> > > > + * @v: vhost_vdpa instance
> > > > + * @svq: Shadow virtqueue to initialize
> > > > + *
> > > > + * Returns true on success, false on failure.
> > > > + */
> > > > +static bool vhost_vdpa_svq_init_indirect(struct vhost_vdpa *v,
> > > > +                                         VhostShadowVirtqueue *svq)
> > > > +{
> > > > +    if (!svq->vring.num) {
> > > > +        return true;
> > > > +    }
> > > > +
> > > > +    /* Allocate indirect descriptor buffers for this SVQ */
> > > > +    for (int j = 0; j < SVQ_NUM_INDIRECT_BUFS; j++) {
> > > > +        if (!vhost_vdpa_alloc_indirect_buf(v, svq->vring.num * 4,
> > > > +                                           &svq->indirect_bufs[j])) {
> > >
> > > Why not allocate a single array of size "SVQ_NUM_INDIRECT_BUFS *
> > > svq->vring.num * 4"? It should help management and maybe even
> > > performance as memory accesses are more predictable.
> > >
> >
> > We cannot assume or rely on the device to populate the used ring in the
> > same order as the available ring. With a single large array, this may
> > introduce holes, which makes managing freed descriptors difficult.
> > In addition, the virtio specification requires indirect descriptor
> > addresses to be contiguous, which further complicates management with a
> > single array.
> >
>
> Actually you made me review QEMU's virtqueue_pop and vhost code, and it
> does not agree with the standard, so thanks for that :). As you correctly
> point out, the standard says:
>
> The first descriptor is located at the start of the indirect descriptor
> table, additional indirect descriptors come immediately afterwards. The
> VIRTQ_DESC_F_WRITE flags bit is the only valid flag for descriptors in the
> indirect table. Others are reserved and are ignored by the device. Buffer
> ID is also reserved and is ignored by the device.
>

This is the packed indirect descriptor specification. The split indirect
descriptor layout is slightly different, as described below:

The first indirect descriptor is located at the start of the indirect
descriptor table (index 0), additional indirect descriptors are chained by
next. An indirect descriptor without a valid next (with
flags & VIRTQ_DESC_F_NEXT off) signals the end of the descriptor. A single
indirect descriptor table can include both device-readable and
device-writable descriptors.

If VIRTIO_F_IN_ORDER has been negotiated, indirect descriptors use
sequential indices, in-order: index 0 followed by index 1 followed by
index 2, etc.

https://docs.oasis-open.org/virtio/virtio/v1.3/csd01/virtio-v1.3-csd01.html#x1-430005
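To make the difference concrete, filling a split-ring indirect table per the
quoted text looks roughly like the sketch below. This is only an
illustration, not code from this series: the helper name is made up, the
iovec addresses are assumed to be already IOVA-translated, and vring_desc_t
and the VRING_DESC_F_* flags are the usual virtio definitions.

/* Sketch: fill a split-ring indirect table; entries are chained by next
 * and the last entry clears VRING_DESC_F_NEXT to end the chain. */
static void svq_fill_split_indirect_table(vring_desc_t *table,
                                          const struct iovec *iov,
                                          size_t n, size_t n_readable)
{
    for (size_t i = 0; i < n; i++) {
        uint16_t flags = (i + 1 < n) ? VRING_DESC_F_NEXT : 0;

        if (i >= n_readable) {
            flags |= VRING_DESC_F_WRITE; /* device-writable tail */
        }
        /* iov_base is assumed to already hold the IOVA of the buffer */
        table[i].addr = cpu_to_le64((uintptr_t)iov[i].iov_base);
        table[i].len = cpu_to_le32(iov[i].iov_len);
        table[i].flags = cpu_to_le16(flags);
        table[i].next = cpu_to_le16(i + 1); /* ignored when _F_NEXT is off */
    }
}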
> But in the do {} while after the "/* Collect all the descriptors */"
> comment in virtqueue_pop it follows the _F_NEXT flag and desc->next, as
> if it were traversing a descriptor chain, and it should ignore them if it
> is processing the indirect table. I think that should be fixed, but maybe
> we would break some driver? Not Linux's one for sure. Jason, MST, what do
> you think? If we should fix the device's code, would you like to submit a
> patch, Wafer?
>
> Now regarding the patch. We need the indirect descriptors to be
> contiguous in memory, but my point is that having many different tables
> makes it harder to manage. A good first step would be to just allocate a
> buffer of that size and then divide it into svq->indirect_bufs[i].
>
> Instead of:
> svq->indirect_buf[0].iova = mmap(size) && vhost_dma_map(...) ...
> svq->indirect_buf[1].iova = mmap(size) && vhost_dma_map(...) ...
>
> At least:
> buf = mmap(size*2) && vhost_dma_map(...) ...
> svq->indirect_buf[0].iova = buf
> svq->indirect_buf[1].iova = buf + size
>
> So the translation tree etc. is smaller, QEMU needs to do fewer ioctls,
> fewer memory transactions, etc.
>
> Even with that, we're losing opportunities to find contiguous free
> descriptors with many SVQIndirectDescBuf instead of a single bigger one
> that can reuse the free holes. Let's say the SVQIndirectDescBuf are four
> descriptors long for simplicity. Now the driver makes available one
> buffer that spans three descriptors (step 1), another one that spans
> three descriptors (step 2), and another buffer that spans two descriptors
> (step 3).
>
> In this series the SVQ places the first indirect table in the first
> SVQIndirectDescBuf, the second table in the second SVQIndirectDescBuf,
> and then the third one must resort to chained descriptors. If we make
> just one big buffer of size 8, all of them fit in the indirect table; we
> just need to mark it as free.
>
> Actually, as long as the device does not mark one descriptor as used, we
> lose all the SVQIndirectDescBuf from the time buf->freed_head reaches the
> end.
>

Thank you for the suggestion! This is a good approach and allows the array
resources to be utilized more efficiently. I understand your point and will
improve this in the next version.
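For the next version I understand the allocation would look roughly like
the sketch below: one mmap(), one IOVA allocation, and one DMA map for the
whole region, with the per-buffer views just being offsets into it. The
helper name is hypothetical and error reporting is elided; the calls are
the same vhost_iova_tree_map_alloc() / vhost_vdpa_dma_map() already used in
this patch, and the remaining SVQIndirectDescBuf bookkeeping fields would
be initialized as vhost_vdpa_alloc_indirect_buf() does today.

static bool vhost_vdpa_svq_init_indirect_region(struct vhost_vdpa *v,
                                                VhostShadowVirtqueue *svq)
{
    size_t buf_size = ROUND_UP(sizeof(vring_desc_t) * svq->vring.num * 4,
                               qemu_real_host_page_size());
    size_t total = buf_size * SVQ_NUM_INDIRECT_BUFS;
    DMAMap needle = {
        .size = total - 1,
        .perm = IOMMU_RO,
    };
    vring_desc_t *region;
    int r;

    /* One mmap() for the whole region instead of one per buffer */
    region = mmap(NULL, total, PROT_READ | PROT_WRITE,
                  MAP_SHARED | MAP_ANONYMOUS, -1, 0);
    if (region == MAP_FAILED) {
        return false;
    }

    /* One IOVA allocation and one map ioctl cover every sub-buffer */
    r = vhost_iova_tree_map_alloc(v->shared->iova_tree, &needle,
                                  (hwaddr)(uintptr_t)region);
    if (unlikely(r != IOVA_OK)) {
        munmap(region, total);
        return false;
    }
    r = vhost_vdpa_dma_map(v->shared, v->address_space_id, needle.iova,
                           total, region, true);
    if (unlikely(r != 0)) {
        vhost_iova_tree_remove(v->shared->iova_tree, needle);
        munmap(region, total);
        return false;
    }

    /* Divide the region into svq->indirect_bufs[i], as suggested */
    for (int i = 0; i < SVQ_NUM_INDIRECT_BUFS; i++) {
        svq->indirect_bufs[i].desc =
            (vring_desc_t *)((char *)region + i * buf_size);
        svq->indirect_bufs[i].iova = needle.iova + i * buf_size;
        svq->indirect_bufs[i].size = buf_size;
    }
    return true;
}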
> Some ideas for that:
> * Reuse IOVATree to handle the indirect table. It already does the linear
>   search on it, but the allocation and free may hurt performance.
>   Still, it should be able to find contiguous buffers if any.
> * Use it in a circular way but store some data in desc_state so we can
>   coalesce the free entries. I can expand on that if needed.
>
> If we find that this approach still works better, I think we should have
> many SVQIndirectDescBuf of queue length, which is the maximum indirect
> table length. We can handle them as a free / used stack, so looking for a
> free entry would be cheaper. What do you think about that?
>
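The free / used stack sounds reasonable to me. If I follow the idea
correctly, it would be something like the sketch below (all names are
hypothetical): allocating and releasing a whole indirect table become O(1)
pushes and pops of buffer indices.

/* Sketch of a free stack over the indirect tables (hypothetical names):
 * free_stack[] holds indices into svq->indirect_bufs and free_top is the
 * number of entries currently free. */
typedef struct SVQIndirectStack {
    uint16_t free_stack[SVQ_NUM_INDIRECT_BUFS];
    uint16_t free_top;
} SVQIndirectStack;

static void svq_indirect_stack_init(SVQIndirectStack *s)
{
    for (uint16_t i = 0; i < SVQ_NUM_INDIRECT_BUFS; i++) {
        s->free_stack[i] = i;
    }
    s->free_top = SVQ_NUM_INDIRECT_BUFS;
}

/* Pop a free table index, or return -1 to fall back to chained mode */
static int svq_indirect_stack_get(SVQIndirectStack *s)
{
    return s->free_top ? s->free_stack[--s->free_top] : -1;
}

/* Push the table back once the device marks its descriptor as used */
static void svq_indirect_stack_put(SVQIndirectStack *s, uint16_t idx)
{
    s->free_stack[s->free_top++] = idx;
}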
> > > > +            /* Cleanup already allocated buffers */
> > > > +            for (int k = 0; k < j; k++) {
> > > > +                vhost_vdpa_free_indirect_buf(v, &svq->indirect_bufs[k]);
> > > > +            }
> > > > +            return false;
> > > > +        }
> > > > +    }
> > > > +
> > > > +    svq->indirect_enabled = true;
> > > > +    svq->current_indirect_buf = 0;
> > > > +    return true;
> > > > +}
> > > > +
> > > > +/**
> > > > + * Cleanup indirect descriptor buffers for a single SVQ
> > > > + *
> > > > + * @v: vhost_vdpa instance
> > > > + * @svq: Shadow virtqueue to cleanup
> > > > + */
> > > > +static void vhost_vdpa_svq_cleanup_indirect(struct vhost_vdpa *v,
> > > > +                                            VhostShadowVirtqueue *svq)
> > > > +{
> > > > +    for (int j = 0; j < SVQ_NUM_INDIRECT_BUFS; j++) {
> > > > +        vhost_vdpa_free_indirect_buf(v, &svq->indirect_bufs[j]);
> > > > +    }
> > > > +    svq->indirect_enabled = false;
> > > > +    svq->current_indirect_buf = -1;
> > > > +}
> > > > +
> > > >  static void vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v)
> > > >  {
> > > >      g_autoptr(GPtrArray) shadow_vqs = NULL;
> > > > @@ -791,8 +940,11 @@ static void vhost_vdpa_svq_cleanup(struct vhost_dev *dev)
> > > >      size_t idx;
> > > >
> > > >      for (idx = 0; idx < v->shadow_vqs->len; ++idx) {
> > > > -        vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, idx));
> > > > +        VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, idx);
> > > > +        vhost_vdpa_svq_cleanup_indirect(v, svq);
> > >
> > > All usages of vhost_vdpa_svq_cleanup_indirect go with vhost_svq_stop.
> > > We can call vhost_vdpa_svq_cleanup_indirect from vhost_svq_stop or
> > > even embed it.
> > >
> >
> > Thank you for the suggestion. I will modify this in the next version of
> > the patch.
> >
> > However, I have one question: in
> > hw/virtio/vhost-vdpa.c:vhost_vdpa_svq_cleanup, why does it only call
> > vhost_svq_stop(svq) without calling vhost_vdpa_svq_unmap_rings(dev, svq)?
> >
>
> We could call it, but the device is about to be totally destroyed, either
> because it has been deleted from QEMU or because QEMU itself is shutting
> down. The only action QEMU can do afterwards is to close the descriptor,
> so the device should clean the maps anyway. But maybe I'm missing
> something here.
>
> > > > +        vhost_svq_stop(svq);
> > > >      }
> > > > +
> > > >      g_ptr_array_free(v->shadow_vqs, true);
> > > >  }
> > > >
> > > > @@ -1299,6 +1451,13 @@ static bool vhost_vdpa_svqs_start(struct vhost_dev *dev)
> > > >              error_setg_errno(&err, -r, "Cannot set device address");
> > > >              goto err_set_addr;
> > > >          }
> > > > +
> > > > +        /* Initialize indirect descriptor buffers for this SVQ */
> > > > +        if (!vhost_vdpa_svq_init_indirect(v, svq)) {
> > > > +            /* Non-fatal: will fall back to chain mode */
> > > > +            warn_report("Cannot initialize indirect descriptor for SVQ %u",
> > > > +                        virtio_get_queue_index(vq));
> > > > +        }
> > > >      }
> > > >
> > > >      return true;
> > > > @@ -1313,6 +1472,7 @@ err:
> > > >      error_reportf_err(err, "Cannot setup SVQ %u: ", i);
> > > >      for (unsigned j = 0; j < i; ++j) {
> > > >          VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, j);
> > > > +        vhost_vdpa_svq_cleanup_indirect(v, svq);
> > > >          vhost_vdpa_svq_unmap_rings(dev, svq);
> > > >          vhost_svq_stop(svq);
> > > >      }
> > > > @@ -1331,6 +1491,7 @@ static void vhost_vdpa_svqs_stop(struct vhost_dev *dev)
> > > >      for (unsigned i = 0; i < v->shadow_vqs->len; ++i) {
> > > >          VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i);
> > > >
> > > > +        vhost_vdpa_svq_cleanup_indirect(v, svq);
> > > >          vhost_svq_stop(svq);
> > > >          vhost_vdpa_svq_unmap_rings(dev, svq);
> > > >
> > > > --
> > > > 2.48.1
> > > >
