On 2/3/2025 12:48 PM, Cédric Le Goater wrote:
On 1/29/25 15:43, Steve Sistare wrote:
At vfio creation time, save the value of vfio container, group, and device
descriptors in CPR state. On qemu restart, vfio_realize() finds and uses
the saved descriptors, and remembers the reused status for subsequent
patches. The reused status is cleared when vmstate load finishes.
During reuse, device and iommu state is already configured, so operations
in vfio_realize that would modify the configuration, such as vfio ioctls,
are skipped. The result is that vfio_realize constructs qemu data
structures that reflect the current state of the device.
Signed-off-by: Steve Sistare <steven.sist...@oracle.com>
---
hw/vfio/container.c | 105 ++++++++++++++++++++++++++++++++++--------
hw/vfio/cpr-legacy.c | 17 +++++++
include/hw/vfio/vfio-common.h | 2 +
3 files changed, 105 insertions(+), 19 deletions(-)
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index a90ce6c..81d0ccc 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -31,6 +31,7 @@
#include "system/reset.h"
#include "trace.h"
#include "qapi/error.h"
+#include "migration/cpr.h"
#include "pci.h"
VFIOGroupList vfio_group_list =
@@ -415,12 +416,28 @@ static bool vfio_set_iommu(int container_fd, int group_fd,
}
static VFIOContainer *vfio_create_container(int fd, VFIOGroup *group,
- Error **errp)
+ bool reused, Error **errp)
Please rename 'reused' to 'cpr_reused'. We should know what this parameter
is for and I don't see any other use than CPR.
Hi Cedric, glad to virtually meet you, and thanks for reviewing this.
There is no other notion of "reused" in qemu -- CPR is the first to introduce
it. Thus "reused" is unambiguous, it always refers to CPR. IMO shorter names
without underscores make the code more readable, as long as they are
unambiguous.
Also, the "reused" identifier already appears in the initial series for
cpr-transfer, and to switch now to a different identifier leaves us with two
names for the same functionality. Right now I can cscope "reused" and find
everything.
For those reasons, I prefer reused, but if you feel strongly, I will rename it.
{
int iommu_type;
const char *vioc_name;
VFIOContainer *container;
+ /*
+ * If container is reused, just set its type and skip the ioctls, as the
+ * container and group are already configured in the kernel.
+ * VFIO_TYPE1v2_IOMMU is the only type that supports reuse/cpr.
+ */
+ if (reused) {
+ if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU)) {
+ iommu_type = VFIO_TYPE1v2_IOMMU;
+ goto skip_iommu;
+ } else {
+ error_setg(errp, "container was reused but VFIO_TYPE1v2_IOMMU "
+ "is not supported");
+ return NULL;
+ }
+ }
+
Can we use 'iommu_type' below instead and avoid VFIO_CHECK_EXTENSION
ioctl ? and then set the iommu unless CPR reused is set.
Sure, I'll make that change.
iommu_type = vfio_get_iommu_type(fd, errp);
if (iommu_type < 0) {
return NULL;
@@ -430,10 +447,12 @@ static VFIOContainer *vfio_create_container(int fd,
VFIOGroup *group,
return NULL;
}
+skip_iommu:
I think we can avoid this 'skip_iommu' label with some minor refactoring.
vioc_name = vfio_get_iommu_class_name(iommu_type);
container = VFIO_IOMMU_LEGACY(object_new(vioc_name));
container->fd = fd;
+ container->reused = reused;
container->iommu_type = iommu_type;
return container;
}
@@ -543,10 +562,13 @@ static bool vfio_connect_container(VFIOGroup *group,
AddressSpace *as,
VFIOContainer *container;
VFIOContainerBase *bcontainer;
int ret, fd;
+ bool reused;
cpr_reused.
VFIOAddressSpace *space;
VFIOIOMMUClass *vioc;
space = vfio_get_address_space(as);
+ fd = cpr_find_fd("vfio_container_for_group", group->groupid);
+ reused = (fd > 0);
hmm, so we are deducing from the existence of a CprFd state element
that we are doing a live update of the VM. This seems to me to be a
somewhat quick heuristic.
Isn't there a global helper ? Isn't the VM aware that it's being
restarted after a live update ? I am not familiar with the CPR
sequence.
There is a global mode that can be checked, but we would still need to
fetch the fd. Checking the fd alone yields tighter code. It also seems
perfectly logical to me when reading the code. Can't find the cpr fd?
Then we are not doing cpr. BTW, it is not a heuristic. The cpr fd exists
at creation time iff we are doing cpr.
/*
* VFIO is currently incompatible with discarding of RAM insofar as the
@@ -579,28 +601,52 @@ static bool vfio_connect_container(VFIOGroup *group,
AddressSpace *as,
* details once we know which type of IOMMU we are using.
*/
+ /*
+ * If the container is reused, then the group is already attached in the
+ * kernel. If a container with matching fd is found, then update the
+ * userland group list and return. If not, then after the loop, create
+ * the container struct and group list.
+ */
+
QLIST_FOREACH(bcontainer, &space->containers, next) {
container = container_of(bcontainer, VFIOContainer, bcontainer);
- if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
- ret = vfio_ram_block_discard_disable(container, true);
- if (ret) {
- error_setg_errno(errp, -ret,
- "Cannot set discarding of RAM broken");
- if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER,
- &container->fd)) {
- error_report("vfio: error disconnecting group %d from"
- " container", group->groupid);
- }
- return false;
+
+ if (reused) {
+ if (container->fd != fd) {
+ continue;
}
- group->container = container;
- QLIST_INSERT_HEAD(&container->group_list, group, container_next);
+ } else if (ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd))
{
+ continue;
+ }
+
+ /* Container is a match for the group */
+ ret = vfio_ram_block_discard_disable(container, true);
+ if (ret) {
+ error_setg_errno(errp, -ret,
+ "Cannot set discarding of RAM broken");
+ if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER,
+ &container->fd)) {
+ error_report("vfio: error disconnecting group %d from"
+ " container", group->groupid);
+
+ }
+ goto delete_fd_exit;
+ }
+ group->container = container;
+ QLIST_INSERT_HEAD(&container->group_list, group, container_next);
+ if (!reused) {
vfio_kvm_device_add_group(group);
- return true;
+ cpr_save_fd("vfio_container_for_group", group->groupid,
+ container->fd);
}
+ return true;
+ }
The above changes are difficult to understand
Agreed, the above diffs are indeed hard to grok. Please apply the changes
and review the resulting code and let me know if it still needs helpers.
I could move all of the code after "Container is a match for the group" to
a helper, or just the code after "group->container = container", but IMO
neither choice helps one understand the slightly tricky logic in the loop.
and I really don't like
these 'if (reused)' code sequences scattered all over the place. It
would make reading and long term maintenance easier if we could
introduce helpers to hide the "CPR reuse" aspect of the machine
initialization phase.
I'll look into refactoring and helpers, but I'm not convinced the resulting
code will be more readable, because there are many separate steps that must
be performed in order, and the lines to be skipped for cpr are interleaved
throughout.
Again, I hope you get a chance to read the patched code, and not just the
diffs. Reading a patched function from top to bottom, it is easy to see
what is skipped for cpr.
+ /* No matching container found, create one */
+ if (!reused) {
+ fd = qemu_open("/dev/vfio/vfio", O_RDWR, errp);
}
- fd = qemu_open("/dev/vfio/vfio", O_RDWR, errp);
if (fd < 0) {
goto put_space_exit;
}
@@ -612,11 +658,12 @@ static bool vfio_connect_container(VFIOGroup *group,
AddressSpace *as,
goto close_fd_exit;
}
- container = vfio_create_container(fd, group, errp);
+ container = vfio_create_container(fd, group, reused, errp);
if (!container) {
goto close_fd_exit;
}
bcontainer = &container->bcontainer;
+ container->reused = reused;
that's done already in vfio_create_container()
Thanks, I will delete the redundant assignment.
if (!vfio_legacy_cpr_register_container(container, errp)) {
goto free_container_exit;
@@ -652,6 +699,7 @@ static bool vfio_connect_container(VFIOGroup *group,
AddressSpace *as,
}
bcontainer->initialized = true;
+ cpr_resave_fd("vfio_container_for_group", group->groupid, fd);
can't we have a helper routine to open/reuse/resave the fd ? Same
comment for vfio_get_device() and vfio_get_group()
Yes, for some cases where the descriptor is opened using qemu_open, I could
define a helper. It would work well for vfio_get_group, which was:
group->fd = cpr_find_fd("vfio_group", groupid);
if (group->fd < 0) {
group->fd = qemu_open(path, O_RDWR, errp);
}
...
cpr_resave_fd("vfio_group", groupid, group->fd);
and now becomes:
group->fd = cpr_open_or_find_fd(path, O_RDWR, "vfio_group", groupid, errp);
but now we need an additional call to delete the fd on failure, so the helper
provides only a modest improvement in lines of code:
free_group_exit:
cpr_delete_fd("vfio_group", group->groupid);
Also, the helper cannot be used for vfio_get_device, because it creates the
descriptor via VFIO_GROUP_GET_DEVICE_FD.
And it cannot be used for vfio_connect_container, because the reused fd must be
known early, during the search of containers, before qemu_open("/dev/vfio/vfio")
is called.
return true;
listener_release_exit:
@@ -677,6 +725,8 @@ close_fd_exit:
put_space_exit:
vfio_put_address_space(space);
+delete_fd_exit:
+ cpr_delete_fd("vfio_container_for_group", group->groupid);
Another exit label. That's the 7th in vfio_connect_container() ...
This is becoming too complex, we need to refactor first.
I don't see any obvious subroutine candidates that would reduce the
goto count.
But, if we set and clear variables appropriately, we can check them while
unwinding, and rely on some cleanup functions being safe to call even when
not needed, and delete all intermediate labels:
fail:
if (group_was_added) { // new local variable
QLIST_REMOVE(group, container_next);
vfio_kvm_device_del_group(group);
}
memory_listener_unregister(&bcontainer->listener); // safe
if (vioc && vioc->release) {
vioc->release(bcontainer);
}
if (discard_disabled) { // new local variable
vfio_ram_block_discard_disable(container, false);
}
vfio_legacy_cpr_unregister_container(container); // safe
if (container) {
object_unref(container);
}
if (fd >= 0) {
close(fd);
}
if (space) {
vfio_put_address_space(space);
}
cpr_delete_fd("vfio_container_for_group", group->groupid); // safe
return false;
Sound good?
- Steve
return false;
}
@@ -688,6 +738,7 @@ static void vfio_disconnect_container(VFIOGroup *group)
QLIST_REMOVE(group, container_next);
group->container = NULL;
+ cpr_delete_fd("vfio_container_for_group", group->groupid);
/*
* Explicitly release the listener first before unset container,
@@ -741,7 +792,12 @@ static VFIOGroup *vfio_get_group(int groupid, AddressSpace
*as, Error **errp)
group = g_malloc0(sizeof(*group));
snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
- group->fd = qemu_open(path, O_RDWR, errp);
+
+ group->fd = cpr_find_fd("vfio_group", groupid);
+ if (group->fd < 0) {
+ group->fd = qemu_open(path, O_RDWR, errp);
+ }
+
if (group->fd < 0) {
goto free_group_exit;
}
@@ -769,6 +825,7 @@ static VFIOGroup *vfio_get_group(int groupid, AddressSpace
*as, Error **errp)
}
QLIST_INSERT_HEAD(&vfio_group_list, group, next);
+ cpr_resave_fd("vfio_group", groupid, group->fd);
return group;
@@ -794,6 +851,7 @@ static void vfio_put_group(VFIOGroup *group)
vfio_disconnect_container(group);
QLIST_REMOVE(group, next);
trace_vfio_put_group(group->fd);
+ cpr_delete_fd("vfio_group", group->groupid);
close(group->fd);
g_free(group);
}
@@ -803,8 +861,14 @@ static bool vfio_get_device(VFIOGroup *group, const char
*name,
{
g_autofree struct vfio_device_info *info = NULL;
int fd;
+ bool reused;
+
+ fd = cpr_find_fd(name, 0);
+ reused = (fd >= 0);
+ if (!reused) {
+ fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
+ }
- fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
if (fd < 0) {
error_setg_errno(errp, errno, "error getting device from group %d",
group->groupid);
@@ -849,6 +913,8 @@ static bool vfio_get_device(VFIOGroup *group, const char
*name,
vbasedev->num_irqs = info->num_irqs;
vbasedev->num_regions = info->num_regions;
vbasedev->flags = info->flags;
+ vbasedev->reused = reused;
+ cpr_resave_fd(name, 0, fd);
trace_vfio_get_device(name, info->flags, info->num_regions,
info->num_irqs);
@@ -865,6 +931,7 @@ static void vfio_put_base_device(VFIODevice *vbasedev)
QLIST_REMOVE(vbasedev, next);
vbasedev->group = NULL;
trace_vfio_put_base_device(vbasedev->fd);
+ cpr_delete_fd(vbasedev->name, 0);
close(vbasedev->fd);
}
diff --git a/hw/vfio/cpr-legacy.c b/hw/vfio/cpr-legacy.c
index d3bbc05..ce6f14e 100644
--- a/hw/vfio/cpr-legacy.c
+++ b/hw/vfio/cpr-legacy.c
@@ -29,10 +29,27 @@ static bool vfio_cpr_supported(VFIOContainer *container,
Error **errp)
}
}
+static int vfio_container_post_load(void *opaque, int version_id)
+{
+ VFIOContainer *container = opaque;
+ VFIOGroup *group;
+ VFIODevice *vbasedev;
+
+ container->reused = false;
+
+ QLIST_FOREACH(group, &container->group_list, container_next) {
+ QLIST_FOREACH(vbasedev, &group->device_list, next) {
+ vbasedev->reused = false;
+ }
+ }
+ return 0;
+}
+
static const VMStateDescription vfio_container_vmstate = {
.name = "vfio-container",
.version_id = 0,
.minimum_version_id = 0,
+ .post_load = vfio_container_post_load,
.needed = cpr_needed_for_reuse,
.fields = (VMStateField[]) {
VMSTATE_END_OF_LIST()
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 53e554f..a435a90 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -85,6 +85,7 @@ typedef struct VFIOContainer {
int fd; /* /dev/vfio/vfio, empowered by the attached groups */
unsigned iommu_type;
Error *cpr_blocker;
+ bool reused;
QLIST_HEAD(, VFIOGroup) group_list;
} VFIOContainer;
@@ -135,6 +136,7 @@ typedef struct VFIODevice {
bool ram_block_discard_allowed;
OnOffAuto enable_migration;
bool migration_events;
+ bool reused;
VFIODeviceOps *ops;
unsigned int num_irqs;
unsigned int num_regions;