On 7/2/2025 12:02 PM, Peter Xu wrote:
On Tue, Jul 01, 2025 at 11:25:23AM -0400, Steven Sistare wrote:
Hi Paolo, Peter, Fabiano,
This patch needs review. CPR for vfio is broken without it.
Soft feature freeze July 15.
Sorry to not have tried looking at this more even if this is marked
"migration".. obviously I still almost see it as a KVM change..
Questions inline below:
- Steve
On 6/10/2025 11:39 AM, Steve Sistare wrote:
cpr-transfer breaks vfio network connectivity to and from the guest, and
the host system log shows:
irq bypass consumer (token 00000000a03c32e5) registration fails: -16
which is EBUSY. This occurs because KVM descriptors are still open in
the old QEMU process. Close them.
Cc: Paolo Bonzini <pbonz...@redhat.com>
Signed-off-by: Steve Sistare <steven.sist...@oracle.com>
---
include/hw/vfio/vfio-device.h | 2 ++
include/migration/cpr.h | 2 ++
include/system/kvm.h | 1 +
accel/kvm/kvm-all.c | 32 ++++++++++++++++++++++++++++++++
accel/stubs/kvm-stub.c | 5 +++++
hw/vfio/helpers.c | 10 ++++++++++
hw/vfio/vfio-stubs.c | 13 +++++++++++++
migration/cpr-transfer.c | 18 ++++++++++++++++++
migration/cpr.c | 8 ++++++++
migration/migration.c | 1 +
hw/vfio/meson.build | 2 ++
11 files changed, 94 insertions(+)
create mode 100644 hw/vfio/vfio-stubs.c
diff --git a/include/hw/vfio/vfio-device.h b/include/hw/vfio/vfio-device.h
index 4e4d0b6..6eb6f21 100644
--- a/include/hw/vfio/vfio-device.h
+++ b/include/hw/vfio/vfio-device.h
@@ -231,4 +231,6 @@ void vfio_device_set_fd(VFIODevice *vbasedev, const char
*str, Error **errp);
void vfio_device_init(VFIODevice *vbasedev, int type, VFIODeviceOps *ops,
DeviceState *dev, bool ram_discard);
int vfio_device_get_aw_bits(VFIODevice *vdev);
+
+void vfio_kvm_device_close(void);
#endif /* HW_VFIO_VFIO_COMMON_H */
diff --git a/include/migration/cpr.h b/include/migration/cpr.h
index 07858e9..d09b657 100644
--- a/include/migration/cpr.h
+++ b/include/migration/cpr.h
@@ -32,7 +32,9 @@ void cpr_state_close(void);
struct QIOChannel *cpr_state_ioc(void);
bool cpr_incoming_needed(void *opaque);
+void cpr_kvm_close(void);
+void cpr_transfer_init(void);
QEMUFile *cpr_transfer_output(MigrationChannel *channel, Error **errp);
QEMUFile *cpr_transfer_input(MigrationChannel *channel, Error **errp);
diff --git a/include/system/kvm.h b/include/system/kvm.h
index 7cc60d2..4896a3c 100644
--- a/include/system/kvm.h
+++ b/include/system/kvm.h
@@ -195,6 +195,7 @@ bool kvm_has_sync_mmu(void);
int kvm_has_vcpu_events(void);
int kvm_max_nested_state_length(void);
int kvm_has_gsi_routing(void);
+void kvm_close(void);
/**
* kvm_arm_supports_user_irq
diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index a317783..3d3a557 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -515,16 +515,23 @@ static int do_kvm_destroy_vcpu(CPUState *cpu)
goto err;
}
+ /* If I am the CPU that created coalesced_mmio_ring, then discard it */
+ if (s->coalesced_mmio_ring == (void *)cpu->kvm_run + PAGE_SIZE) {
+ s->coalesced_mmio_ring = NULL;
+ }
+
ret = munmap(cpu->kvm_run, mmap_size);
if (ret < 0) {
goto err;
}
+ cpu->kvm_run = NULL;
if (cpu->kvm_dirty_gfns) {
ret = munmap(cpu->kvm_dirty_gfns, s->kvm_dirty_ring_bytes);
if (ret < 0) {
goto err;
}
+ cpu->kvm_dirty_gfns = NULL;
}
kvm_park_vcpu(cpu);
@@ -608,6 +615,31 @@ err:
return ret;
}
+void kvm_close(void)
+{
+ CPUState *cpu;
+
+ if (!kvm_state || kvm_state->fd == -1) {
+ return;
+ }
+
+ CPU_FOREACH(cpu) {
+ cpu_remove_sync(cpu);
+ close(cpu->kvm_fd);
+ cpu->kvm_fd = -1;
+ close(cpu->kvm_vcpu_stats_fd);
+ cpu->kvm_vcpu_stats_fd = -1;
+ }
+
+ if (kvm_state && kvm_state->fd != -1) {
+ close(kvm_state->vmfd);
+ kvm_state->vmfd = -1;
+ close(kvm_state->fd);
+ kvm_state->fd = -1;
+ }
+ kvm_state = NULL;
+}
+
/*
* dirty pages logging control
*/
diff --git a/accel/stubs/kvm-stub.c b/accel/stubs/kvm-stub.c
index ecfd763..97dacb3 100644
--- a/accel/stubs/kvm-stub.c
+++ b/accel/stubs/kvm-stub.c
@@ -134,3 +134,8 @@ int kvm_create_guest_memfd(uint64_t size, uint64_t flags,
Error **errp)
{
return -ENOSYS;
}
+
+void kvm_close(void)
+{
+ return;
+}
diff --git a/hw/vfio/helpers.c b/hw/vfio/helpers.c
index d0dbab1..af1db2f 100644
--- a/hw/vfio/helpers.c
+++ b/hw/vfio/helpers.c
@@ -117,6 +117,16 @@ bool vfio_get_info_dma_avail(struct vfio_iommu_type1_info
*info,
int vfio_kvm_device_fd = -1;
#endif
+void vfio_kvm_device_close(void)
+{
+#ifdef CONFIG_KVM
+ if (vfio_kvm_device_fd != -1) {
+ close(vfio_kvm_device_fd);
+ vfio_kvm_device_fd = -1;
+ }
+#endif
+}
+
int vfio_kvm_device_add_fd(int fd, Error **errp)
{
#ifdef CONFIG_KVM
diff --git a/hw/vfio/vfio-stubs.c b/hw/vfio/vfio-stubs.c
new file mode 100644
index 0000000..a4c8b56
--- /dev/null
+++ b/hw/vfio/vfio-stubs.c
@@ -0,0 +1,13 @@
+/*
+ * Copyright (c) 2025 Oracle and/or its affiliates.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "hw/vfio/vfio-device.h"
+
+void vfio_kvm_device_close(void)
+{
+ return;
+}
Do we really need this stub, and the "include VFIO" headers in CPR as
below? I thought it would be doable the other way round, that VFIO or KVM
can register migration notifiers for CPR mode only. After all, the
registration (migration_add_notifier_mode) is in misc.h so it should be
available to QEMU all across.
OK. I have reworked the code to register the notifier from vfio and
eliminate the stub.
Besides that, a high level question: what this patch does is trying to
close early the relevant kvm/vfio fds that are used to attach to irq
consumer / providers. In the meantime, AFAICT, CPR as a whole feature when
used against VFIO available, works only if VFIO can do whatever it wants
(DMA, irq injections) during the whole process of CPR live upgrade,
assuming that all the states are persisted in the fds. Then, if here we
need to (a) unregister on src QEMU and (b) re-attach on dest QEMU, what
happens if the irqs are generated exactly between (a) and (b)? Could they
get lost?
The irq producer is not closed, but it is detached from the kvm consumer.
Its eventfd is preserved in new QEMU, and interrupts that arrive during
transition are pended there.
- Steve
diff --git a/migration/cpr-transfer.c b/migration/cpr-transfer.c
index e1f1403..396558f 100644
--- a/migration/cpr-transfer.c
+++ b/migration/cpr-transfer.c
@@ -17,6 +17,24 @@
#include "migration/vmstate.h"
#include "trace.h"
+static int cpr_transfer_notifier(NotifierWithReturn *notifier,
+ MigrationEvent *e,
+ Error **errp)
+{
+ if (e->type == MIG_EVENT_PRECOPY_DONE) {
+ cpr_kvm_close();
+ }
+ return 0;
+}
+
+void cpr_transfer_init(void)
+{
+ static NotifierWithReturn notifier;
+
+ migration_add_notifier_mode(&notifier, cpr_transfer_notifier,
+ MIG_MODE_CPR_TRANSFER);
+}
+
QEMUFile *cpr_transfer_output(MigrationChannel *channel, Error **errp)
{
MigrationAddress *addr = channel->addr;
diff --git a/migration/cpr.c b/migration/cpr.c
index a50a57e..49fb0a5 100644
--- a/migration/cpr.c
+++ b/migration/cpr.c
@@ -7,12 +7,14 @@
#include "qemu/osdep.h"
#include "qapi/error.h"
+#include "hw/vfio/vfio-device.h"
[1]
#include "migration/cpr.h"
#include "migration/misc.h"
#include "migration/options.h"
#include "migration/qemu-file.h"
#include "migration/savevm.h"
#include "migration/vmstate.h"
+#include "system/kvm.h"
#include "system/runstate.h"
#include "trace.h"
@@ -264,3 +266,9 @@ bool cpr_incoming_needed(void *opaque)
MigMode mode = migrate_mode();
return mode == MIG_MODE_CPR_TRANSFER;
}
+
+void cpr_kvm_close(void)
+{
+ kvm_close();
+ vfio_kvm_device_close();
+}
diff --git a/migration/migration.c b/migration/migration.c
index 4098870..8f23cff 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -337,6 +337,7 @@ void migration_object_init(void)
ram_mig_init();
dirty_bitmap_mig_init();
+ cpr_transfer_init();
/* Initialize cpu throttle timers */
cpu_throttle_init();
diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build
index 73d29f9..98134a7 100644
--- a/hw/vfio/meson.build
+++ b/hw/vfio/meson.build
@@ -17,6 +17,8 @@ vfio_ss.add(when: 'CONFIG_VFIO_IGD', if_true: files('igd.c'))
specific_ss.add_all(when: 'CONFIG_VFIO', if_true: vfio_ss)
+system_ss.add(when: 'CONFIG_VFIO', if_false: files('vfio-stubs.c'))
+
system_ss.add(when: 'CONFIG_VFIO_XGMAC', if_true: files('calxeda-xgmac.c'))
system_ss.add(when: 'CONFIG_VFIO_AMD_XGBE', if_true: files('amd-xgbe.c'))
system_ss.add(when: 'CONFIG_VFIO', if_true: files(