On 5/15/25 21:08, Steven Sistare wrote:
On 5/15/2025 8:59 AM, Cédric Le Goater wrote:
On 5/12/25 17:32, Steve Sistare wrote:
At vfio creation time, save the value of vfio container, group, and device
descriptors in CPR state. On qemu restart, vfio_realize() finds and uses
the saved descriptors, and remembers the reused status for subsequent
patches. The reused status is cleared when vmstate load finishes.
During reuse, device and iommu state is already configured, so operations
in vfio_realize that would modify the configuration, such as vfio ioctl's,
are skipped. The result is that vfio_realize constructs qemu data
structures that reflect the current state of the device.
Signed-off-by: Steve Sistare <steven.sist...@oracle.com>
---
hw/vfio/container.c | 65 ++++++++++++++++++++++++++++++++++++-------
hw/vfio/cpr-legacy.c | 46 ++++++++++++++++++++++++++++++
include/hw/vfio/vfio-cpr.h | 9 ++++++
include/hw/vfio/vfio-device.h | 2 ++
4 files changed, 112 insertions(+), 10 deletions(-)
diff --git a/hw/vfio/container.c b/hw/vfio/container.c
index 85c76da..278a220 100644
--- a/hw/vfio/container.c
+++ b/hw/vfio/container.c
@@ -31,6 +31,8 @@
#include "system/reset.h"
#include "trace.h"
#include "qapi/error.h"
+#include "migration/cpr.h"
+#include "migration/blocker.h"
#include "pci.h"
#include "hw/vfio/vfio-container.h"
#include "hw/vfio/vfio-cpr.h"
@@ -414,7 +416,7 @@ static bool vfio_set_iommu(int container_fd, int group_fd,
}
static VFIOContainer *vfio_create_container(int fd, VFIOGroup *group,
- Error **errp)
+ bool cpr_reused, Error **errp)
{
int iommu_type;
const char *vioc_name;
@@ -425,7 +427,11 @@ static VFIOContainer *vfio_create_container(int fd,
VFIOGroup *group,
return NULL;
}
- if (!vfio_set_iommu(fd, group->fd, &iommu_type, errp)) {
+ /*
+ * If container is reused, just set its type and skip the ioctls, as the
+ * container and group are already configured in the kernel.
+ */
+ if (!cpr_reused && !vfio_set_iommu(fd, group->fd, &iommu_type, errp)) {
return NULL;
}
@@ -433,6 +439,7 @@ static VFIOContainer *vfio_create_container(int fd,
VFIOGroup *group,
container = VFIO_IOMMU_LEGACY(object_new(vioc_name));
container->fd = fd;
+ container->cpr.reused = cpr_reused;
container->iommu_type = iommu_type;
return container;
}
@@ -584,7 +591,7 @@ static bool
vfio_container_attach_discard_disable(VFIOContainer *container,
}
static bool vfio_container_group_add(VFIOContainer *container, VFIOGroup
*group,
- Error **errp)
+ bool cpr_reused, Error **errp)
{
if (!vfio_container_attach_discard_disable(container, group, errp)) {
return false;
@@ -592,6 +599,9 @@ static bool vfio_container_group_add(VFIOContainer
*container, VFIOGroup *group,
group->container = container;
QLIST_INSERT_HEAD(&container->group_list, group, container_next);
vfio_group_add_kvm_device(group);
+ if (!cpr_reused) {
+ cpr_save_fd("vfio_container_for_group", group->groupid, container->fd);
+ }
Could we avoid the test on cpr_reused always call cpr_save_fd() ?
No. If cpr_reused is true, then the fd is already on cpr's save list.
We don't want to save duplicates of the same entry.
Can't we call cpr_find_fd() like in cpr_open_fd() ?
return true;
}
@@ -601,6 +611,7 @@ static void vfio_container_group_del(VFIOContainer
*container, VFIOGroup *group)
group->container = NULL;
vfio_group_del_kvm_device(group);
vfio_ram_block_discard_disable(container, false);
+ cpr_delete_fd("vfio_container_for_group", group->groupid);
}
static bool vfio_container_connect(VFIOGroup *group, AddressSpace *as,
@@ -613,17 +624,37 @@ static bool vfio_container_connect(VFIOGroup *group,
AddressSpace *as,
VFIOIOMMUClass *vioc = NULL;
bool new_container = false;
bool group_was_added = false;
+ bool cpr_reused;
space = vfio_address_space_get(as);
+ fd = cpr_find_fd("vfio_container_for_group", group->groupid);
+ cpr_reused = (fd > 0);
The code above is doing 2 things : it grabs a restored fd and
deduces from the fd value that the VM is doing are doing a CPR
reboot.
Instead of adding this cpr_reused flag, I would prefer to duplicate
the code into something like:
if (!cpr_reboot) {
QLIST_FOREACH(bcontainer, &space->containers, next) {
container = container_of(bcontainer, VFIOContainer, bcontainer);
if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
return vfio_container_group_add(container, group, errp);
}
}
fd = qemu_open("/dev/vfio/vfio", O_RDWR, errp);
if (fd < 0) {
goto fail;
}
ret = ioctl(fd, VFIO_GET_API_VERSION);
if (ret != VFIO_API_VERSION) {
error_setg(errp, "supported vfio version: %d, "
"reported version: %d", VFIO_API_VERSION, ret);
goto fail;
}
container = vfio_create_container(fd, group, errp);
} else {
/* ... */
}
OK, but there is no sense in duplicating the identical code for
VFIO_GET_API_VERSION and vfio_create_container. If you want me to
simplify the loop, I suggest:
if (!cpr_reused) {
QLIST_FOREACH(bcontainer, &space->containers, next) {
container = container_of(bcontainer, VFIOContainer, bcontainer);
if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
return vfio_container_group_add(container, group, false, errp);
}
}
fd = qemu_open("/dev/vfio/vfio", O_RDWR, errp);
if (fd < 0) {
goto fail;
}
} else {
QLIST_FOREACH(bcontainer, &space->containers, next) {
container = container_of(bcontainer, VFIOContainer, bcontainer);
if (vfio_cpr_container_match(container, group, &fd)) {
return vfio_container_group_add(container, group, true, errp);
}
}
}
ret = ioctl(fd, VFIO_GET_API_VERSION);
...
OK. Let's do that. I find it easier to read.
+ /*
+ * If the container is reused, then the group is already attached in the
+ * kernel. If a container with matching fd is found, then update the
+ * userland group list and return. If not, then after the loop, create
+ * the container struct and group list.
+ */
QLIST_FOREACH(bcontainer, &space->containers, next) {
container = container_of(bcontainer, VFIOContainer, bcontainer);
- if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
- return vfio_container_group_add(container, group, errp);
+
+ if (cpr_reused) {
+ if (!vfio_cpr_container_match(container, group, &fd)) {
why do we need to modify fd ?
That is explained by the comments inside vfio_cpr_container_match, where the
explanation is more easily understood.
I haven't been able to see what a modified fd was useful for before because
we test cpr_reused and in other places !cpr_reused :
if (cpr_reused) {
if (!vfio_cpr_container_match(container, group, &fd)) {
continue;
}
and later
if (!cpr_reused) {
fd = qemu_open("/dev/vfio/vfio", O_RDWR, errp);
}
I think I got it now. This was a bit confusing.
+ continue;
+ }
+ } else if (ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd))
{
+ continue;
}
+ return vfio_container_group_add(container, group, cpr_reused, errp);
+ }
+
+ if (!cpr_reused) {
+ fd = qemu_open("/dev/vfio/vfio", O_RDWR, errp);
}
- fd = qemu_open("/dev/vfio/vfio", O_RDWR, errp);
if (fd < 0) {> goto fail;
}
@@ -635,7 +666,7 @@ static bool vfio_container_connect(VFIOGroup *group,
AddressSpace *as,
goto fail;
}
- container = vfio_create_container(fd, group, errp);
+ container = vfio_create_container(fd, group, cpr_reused, errp);
if (!container) {
goto fail;
}
@@ -655,7 +686,7 @@ static bool vfio_container_connect(VFIOGroup *group,
AddressSpace *as,
vfio_address_space_insert(space, bcontainer);
- if (!vfio_container_group_add(container, group, errp)) {
+ if (!vfio_container_group_add(container, group, cpr_reused, errp)) {
goto fail;
}
group_was_added = true;
@@ -697,6 +728,7 @@ static void vfio_container_disconnect(VFIOGroup *group)
QLIST_REMOVE(group, container_next);
group->container = NULL;
+ cpr_delete_fd("vfio_container_for_group", group->groupid);
/*
* Explicitly release the listener first before unset container,
@@ -750,7 +782,7 @@ static VFIOGroup *vfio_group_get(int groupid, AddressSpace
*as, Error **errp)
group = g_malloc0(sizeof(*group));
snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
- group->fd = qemu_open(path, O_RDWR, errp);
+ group->fd = cpr_open_fd(path, O_RDWR, "vfio_group", groupid, NULL, errp);
if (group->fd < 0) {
goto free_group_exit;
}
@@ -782,6 +814,7 @@ static VFIOGroup *vfio_group_get(int groupid, AddressSpace
*as, Error **errp)
return group;
close_fd_exit:
+ cpr_delete_fd("vfio_group", groupid);
close(group->fd);
free_group_exit:
@@ -803,6 +836,7 @@ static void vfio_group_put(VFIOGroup *group)
vfio_container_disconnect(group);
QLIST_REMOVE(group, next);
trace_vfio_group_put(group->fd);
+ cpr_delete_fd("vfio_group", group->groupid);
close(group->fd);
g_free(group);
}
@@ -812,8 +846,14 @@ static bool vfio_device_get(VFIOGroup *group, const char
*name,
{
g_autofree struct vfio_device_info *info = NULL;
int fd;
+ bool cpr_reused;
+
+ fd = cpr_find_fd(name, 0);
+ cpr_reused = (fd >= 0);
+ if (!cpr_reused) {
+ fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
+ }
Could we introduce an helper routine to open this file, like we have
cpr_open_fd() ?
OK, but this would be the only use of the helper, and it would bury
generic vfio functionality -- VFIO_GROUP_GET_DEVICE_FD -- inside a cpr
flavored helper. IMO not an improvement.
VFIO_GROUP_GET_DEVICE_FD would still be passed as a parameter and
so it won't be buried IMO. I don't dislike it that much.
However, I don't like the "if (cpr_reused)" statements scattered
throughout the code, so I'm looking for ways to bury them.
Thanks,
C.
- fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
if (fd < 0) {
error_setg_errno(errp, errno, "error getting device from group %d",
group->groupid);
@@ -857,6 +897,10 @@ static bool vfio_device_get(VFIOGroup *group, const char
*name,
vbasedev->group = group;
QLIST_INSERT_HEAD(&group->device_list, vbasedev, next);
+ vbasedev->cpr.reused = cpr_reused;
+ if (!cpr_reused) {
+ cpr_save_fd(name, 0, fd);
Could we avoid the test on cpr_reused always call cpr_save_fd() ?
No. Must avoid adding duplicate entries.
+ }
trace_vfio_device_get(name, info->flags, info->num_regions,
info->num_irqs);
return true;
@@ -870,6 +914,7 @@ static void vfio_device_put(VFIODevice *vbasedev)
QLIST_REMOVE(vbasedev, next);
vbasedev->group = NULL;
trace_vfio_device_put(vbasedev->fd);
+ cpr_delete_fd(vbasedev->name, 0);
close(vbasedev->fd);
}
diff --git a/hw/vfio/cpr-legacy.c b/hw/vfio/cpr-legacy.c
index fac323c..638a8e0 100644
--- a/hw/vfio/cpr-legacy.c
+++ b/hw/vfio/cpr-legacy.c
@@ -10,6 +10,7 @@
#include "qemu/osdep.h"
#include "hw/vfio/vfio-container.h"
#include "hw/vfio/vfio-cpr.h"
+#include "hw/vfio/vfio-device.h"
#include "migration/blocker.h"
#include "migration/cpr.h"
#include "migration/migration.h"
@@ -31,10 +32,27 @@ static bool vfio_cpr_supported(VFIOContainer *container,
Error **errp)
}
}
+static int vfio_container_post_load(void *opaque, int version_id)
+{
+ VFIOContainer *container = opaque;
+ VFIOGroup *group;
+ VFIODevice *vbasedev;
+
+ container->cpr.reused = false;
+
+ QLIST_FOREACH(group, &container->group_list, container_next) {
+ QLIST_FOREACH(vbasedev, &group->device_list, next) {
+ vbasedev->cpr.reused = false;
+ }
+ }
+ return 0;
+}
+
static const VMStateDescription vfio_container_vmstate = {
.name = "vfio-container",
.version_id = 0,
.minimum_version_id = 0,
+ .post_load = vfio_container_post_load,
.needed = cpr_needed_for_reuse,
.fields = (VMStateField[]) {
VMSTATE_END_OF_LIST()
@@ -68,3 +86,31 @@ void vfio_legacy_cpr_unregister_container(VFIOContainer
*container)
migrate_del_blocker(&container->cpr.blocker);
vmstate_unregister(NULL, &vfio_container_vmstate, container);
}
+
+static bool same_device(int fd1, int fd2)
+{
+ struct stat st1, st2;
+
+ return !fstat(fd1, &st1) && !fstat(fd2, &st2) && st1.st_dev == st2.st_dev;
+}
+
+bool vfio_cpr_container_match(VFIOContainer *container, VFIOGroup *group,
+ int *pfd)
+{
+ if (container->fd == *pfd) {
+ return true;
+ }
+ if (!same_device(container->fd, *pfd)) {
+ return false;
+ }
+ /*
+ * Same device, different fd. This occurs when the container fd is
+ * cpr_save'd multiple times, once for each groupid, so SCM_RIGHTS
+ * produces duplicates. De-dup it.
+ */
+ cpr_delete_fd("vfio_container_for_group", group->groupid);
+ close(*pfd);
+ cpr_save_fd("vfio_container_for_group", group->groupid, container->fd);
+ *pfd = container->fd;
I am not sure 'pfd' is used afterwards. Is it ?
True, good eye. I will change it to "int fd" and stop returning the new value.
- Steve
+ return true;
+}
diff --git a/include/hw/vfio/vfio-cpr.h b/include/hw/vfio/vfio-cpr.h
index f864547..1c4f070 100644
--- a/include/hw/vfio/vfio-cpr.h
+++ b/include/hw/vfio/vfio-cpr.h
@@ -13,10 +13,16 @@
typedef struct VFIOContainerCPR {
Error *blocker;
+ bool reused;
} VFIOContainerCPR;
+typedef struct VFIODeviceCPR {
+ bool reused;
+} VFIODeviceCPR;
+
struct VFIOContainer;
struct VFIOContainerBase;
+struct VFIOGroup;
bool vfio_legacy_cpr_register_container(struct VFIOContainer *container,
Error **errp);
@@ -29,4 +35,7 @@ bool vfio_cpr_register_container(struct VFIOContainerBase
*bcontainer,
Error **errp);
void vfio_cpr_unregister_container(struct VFIOContainerBase *bcontainer);
+bool vfio_cpr_container_match(struct VFIOContainer *container,
+ struct VFIOGroup *group, int *fd);
+
#endif /* HW_VFIO_VFIO_CPR_H */
diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h
index 8bcb3c1..4e4d0b6 100644
--- a/include/hw/vfio/vfio-device.h
+++ b/include/hw/vfio/vfio-device.h
@@ -28,6 +28,7 @@
#endif
#include "system/system.h"
#include "hw/vfio/vfio-container-base.h"
+#include "hw/vfio/vfio-cpr.h"
#include "system/host_iommu_device.h"
#include "system/iommufd.h"
@@ -84,6 +85,7 @@ typedef struct VFIODevice {
VFIOIOASHwpt *hwpt;
QLIST_ENTRY(VFIODevice) hwpt_next;
struct vfio_region_info **reginfo;
+ VFIODeviceCPR cpr;
} VFIODevice;
struct VFIODeviceOps {
Thanks,
C.