On 11/6/2025 11:19 AM, Ján Tomko wrote:
Open iommufd FDs from libvirt backend without
exposing these FDs to XML users, i.e. one per
domain for /dev/iommu and one per iommufd
hostdev for /dev/vfio/devices/vfioX, and pass
the FD to qemu command line.
The part formatting the object and the part formatting the device
should be split.
Sounds good, I will split it into two commits. >> Signed-off-by: Nathan
Chen <[email protected]>
---
src/qemu/qemu_command.c | 43 +++++++-
src/qemu/qemu_command.h | 3 +-
src/qemu/qemu_domain.c | 8 ++
src/qemu/qemu_domain.h | 7 ++
src/qemu/qemu_hotplug.c | 2 +-
src/qemu/qemu_process.c | 232 ++++++++++++++++++++++++++++++++++++++++
6 files changed, 289 insertions(+), 6 deletions(-)
diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c
index 8fd7527645..740a6970f2 100644
--- a/src/qemu/qemu_command.c
+++ b/src/qemu/qemu_command.c
@@ -4730,7 +4730,8 @@ qemuBuildVideoCommandLine(virCommand *cmd,
virJSONValue *
qemuBuildPCIHostdevDevProps(const virDomainDef *def,
- virDomainHostdevDef *dev)
+ virDomainHostdevDef *dev,
+ virDomainObj *vm)
Hmm, perhaps exposing the iommufd object in the XML would save us from
having to pass this.
We are passing virDomainObj to this function in order to retrieve the FD
number, would you mind clarifying how we could avoid passing this by
exposing the iommufd object in the XML? It is my understanding that
exposing the iommufd object ID would still mean we need pass the
virDomainObj.
{
g_autoptr(virJSONValue) props = NULL;
virDomainHostdevSubsysPCI *pcisrc = &dev->source.subsys.u.pci;
@@ -4741,6 +4742,13 @@ qemuBuildPCIHostdevDevProps(const virDomainDef
*def,
const char *iommufdId = NULL;
/* 'ramfb' property must be omitted unless it's to be enabled */
bool ramfb = pcisrc->ramfb == VIR_TRISTATE_SWITCH_ON;
+ bool useIommufd = false;
+ qemuDomainObjPrivate *priv = vm ? vm->privateData : NULL;
+
+ if (pcisrc->driver.name ==
VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_VFIO &&
+ pcisrc->driver.iommufd) {
+ useIommufd = true;
+ }
/* caller has to assign proper passthrough driver name */
switch (pcisrc->driver.name) {
@@ -4787,6 +4795,18 @@ qemuBuildPCIHostdevDevProps(const virDomainDef
*def,
NULL) < 0)
return NULL;
+ if (useIommufd && priv) {
+ g_autofree char *vfioFdName = g_strdup_printf("vfio-%04x:
%02x:%02x.%d",
+ pcisrc-
>addr.domain, pcisrc->addr.bus,
+ pcisrc-
>addr.slot, pcisrc->addr.function);
+
There's no need to duplicate the list of hostdevs which use iommufd in a
per-domain hash table.
For storing per-device file descriptors, we have per-device private
data.
+ int vfiofd = GPOINTER_TO_INT(g_hash_table_lookup(priv-
>vfioDeviceFds, vfioFdName));
+ if (virJSONValueObjectAdd(&props,
+ "S:fd", g_strdup_printf("%d", vfiofd),
+ NULL) < 0)
+ return NULL;
+ }
+
if (qemuBuildDeviceAddressProps(props, def, dev->info) < 0)
return NULL;
@@ -5197,12 +5217,14 @@ qemuBuildAcpiNodesetProps(virCommand *cmd,
static int
qemuBuildHostdevCommandLine(virCommand *cmd,
const virDomainDef *def,
- virQEMUCaps *qemuCaps)
+ virQEMUCaps *qemuCaps,
+ virDomainObj *vm)
{
size_t i;
g_autoptr(virJSONValue) props = NULL;
int iommufd = 0;
const char * iommufdId = "iommufd0";
+ qemuDomainObjPrivate *priv = vm->privateData;
for (i = 0; i < def->nhostdevs; i++) {
virDomainHostdevDef *hostdev = def->hostdevs[i];
@@ -5233,8 +5255,10 @@ qemuBuildHostdevCommandLine(virCommand *cmd,
if (subsys->u.pci.driver.iommufd && iommufd == 0) {
iommufd = 1;
+ virCommandPassFD(cmd, priv->iommufd,
VIR_COMMAND_PASS_FD_CLOSE_PARENT);
if (qemuMonitorCreateObjectProps(&props, "iommufd",
iommufdId,
+ "S:fd",
g_strdup_printf("%d", priv->iommufd),
NULL) < 0)
return -1;
@@ -5245,7 +5269,18 @@ qemuBuildHostdevCommandLine(virCommand *cmd,
if (qemuCommandAddExtDevice(cmd, hostdev->info, def,
qemuCaps) < 0)
return -1;
- if (!(devprops = qemuBuildPCIHostdevDevProps(def, hostdev)))
+ if (subsys->u.pci.driver.iommufd) {
+ virDomainHostdevSubsysPCI *pcisrc = &hostdev-
>source.subsys.u.pci;
+ g_autofree char *vfioFdName = g_strdup_printf("vfio-
%04x:%02x:%02x.%d",
+ pcisrc-
>addr.domain, pcisrc->addr.bus,
+ pcisrc-
>addr.slot, pcisrc->addr.function);
+
+ int vfiofd =
GPOINTER_TO_INT(g_hash_table_lookup(priv->vfioDeviceFds, vfioFdName));
+
+ virCommandPassFD(cmd, vfiofd,
VIR_COMMAND_PASS_FD_CLOSE_PARENT);
This would become just:
qemuDomainHostdevPrivate *priv = (qemuDomainHostdevPrivate *)vsock-
>privateData;
virCommandPassFD(cmd, priv->vfiofd, VIR_COMMAND_PASS_FD_CLOSE_PARENT);
I will proceed with implementing a qemuDomainHostdevPrivate struct and
look into the existing implementation for qemuDomainDiskPrivate for
reference. I was not able to find a private data attribute in the
_virDomainHostdevDef struct definition, but I do see "virObject
*privateData;" under the _virDomainDiskDef struct definition - are you
aware of any reason behind this, or has it just never been needed for
the hostdev struct?
+ }
+
+ if (!(devprops = qemuBuildPCIHostdevDevProps(def,
hostdev, vm)))
return -1;
if (qemuBuildDeviceCommandlineFromJSON(cmd, devprops, def,
qemuCaps) < 0)
@@ -10893,7 +10928,7 @@ qemuBuildCommandLine(virDomainObj *vm,
if (qemuBuildRedirdevCommandLine(cmd, def, qemuCaps) < 0)
return NULL;
- if (qemuBuildHostdevCommandLine(cmd, def, qemuCaps) < 0)
+ if (qemuBuildHostdevCommandLine(cmd, def, qemuCaps, vm) < 0)
return NULL;
if (migrateURI)
diff --git a/src/qemu/qemu_command.h b/src/qemu/qemu_command.h
index ad068f1f16..380aac261f 100644
--- a/src/qemu/qemu_command.h
+++ b/src/qemu/qemu_command.h
@@ -180,7 +180,8 @@ qemuBuildThreadContextProps(virJSONValue **tcProps,
/* Current, best practice */
virJSONValue *
qemuBuildPCIHostdevDevProps(const virDomainDef *def,
- virDomainHostdevDef *dev);
+ virDomainHostdevDef *dev,
+ virDomainObj *vm);
virJSONValue *
qemuBuildRNGDevProps(const virDomainDef *def,
diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index a42721efad..86640aa3e3 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -1953,6 +1953,11 @@ qemuDomainObjPrivateFree(void *data)
virChrdevFree(priv->devs);
+ if (priv->iommufd >= 0) {
+ virEventRemoveHandle(priv->iommufd);
There is no handle to remove (and none is needed). So no need for the
condition either.
Thanks for catching this, agreed that the other logic to close iommufd
file descriptor is sufficient and this is not needed.
+ priv->iommufd = -1;
+ }
+
if (priv->pidMonitored >= 0) {
virEventRemoveHandle(priv->pidMonitored);
priv->pidMonitored = -1;
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
index 45fc32a663..cecfed94a7 100644
--- a/src/qemu/qemu_process.c
+++ b/src/qemu/qemu_process.c
@@ -25,6 +25,7 @@
#include <unistd.h>
#include <signal.h>
#include <sys/stat.h>
+#include <dirent.h>
We should not need this in qemu_process.
This must have been left over from a previous implementation attempt. I
will remove this in the next revision. Thanks
#if WITH_SYS_SYSCALL_H
# include <sys/syscall.h>
#endif
@@ -8091,6 +8092,9 @@ qemuProcessLaunch(virConnectPtr conn,
if (qemuExtDevicesStart(driver, vm, incomingMigrationExtDevices) < 0)
goto cleanup;
+ if (qemuProcessOpenVfioFds(vm) < 0)
+ goto cleanup;
+
if (!(cmd = qemuBuildCommandLine(vm,
incoming ? "defer" : NULL,
vmop,
@@ -10267,3 +10271,231 @@
qemuProcessHandleNbdkitExit(qemuNbdkitProcess *nbdkit,
qemuProcessEventSubmit(vm, QEMU_PROCESS_EVENT_NBDKIT_EXITED, 0, 0,
nbdkit);
virObjectUnlock(vm);
}
+
+/**
+ * qemuProcessOpenIommuFd:
+ * @vm: domain object
+ * @iommuFd: returned file descriptor
+ *
+ * Opens /dev/iommu file descriptor for the VM.
+ *
+ * Returns: 0 on success, -1 on failure
+ */
+static int
+qemuProcessOpenIommuFd(virDomainObj *vm, int *iommuFd)
+{
+ int fd = -1;
+
+ VIR_DEBUG("Opening IOMMU FD for domain %s", vm->def->name);
+
+ if ((fd = open("/dev/iommu", O_RDWR | O_CLOEXEC)) < 0) {
+ if (errno == ENOENT) {
+ virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+ _("IOMMU FD support requires /dev/iommu
device"));
+ } else {
+ virReportSystemError(errno, "%s",
+ _("cannot open /dev/iommu"));
+ }
+ return -1;
+ }
+
+ *iommuFd = fd;
+ VIR_DEBUG("Opened IOMMU FD %d for domain %s", fd, vm->def->name);
+ return 0;
+}
+
+/**
+ * qemuProcessGetVfioDevicePath:
+ * @hostdev: host device definition
+ * @vfioPath: returned VFIO device path
+ *
+ * Constructs the VFIO device path for a PCI hostdev.
+ *
+ * Returns: 0 on success, -1 on failure
+ */
+static int
+qemuProcessGetVfioDevicePath(virDomainHostdevDef *hostdev,
No need to pass the whole hostdev here. Then this function can live in
virpci.c
Sounds good, I will move this to virpci.c and just pass in the device
address.
-Nathan