On a Monday in 2025, Nathan Chen via Devel wrote:
Open iommufd FDs from libvirt backend without
exposing these FDs to XML users, i.e. one per
domain for /dev/iommu and one per iommufd
hostdev for /dev/vfio/devices/vfioX, and pass
the FD to qemu command line.


The part formatting the object and the part formatting the device
should be split.

Signed-off-by: Nathan Chen <[email protected]>
---
src/qemu/qemu_command.c |  43 +++++++-
src/qemu/qemu_command.h |   3 +-
src/qemu/qemu_domain.c  |   8 ++
src/qemu/qemu_domain.h  |   7 ++
src/qemu/qemu_hotplug.c |   2 +-
src/qemu/qemu_process.c | 232 ++++++++++++++++++++++++++++++++++++++++
6 files changed, 289 insertions(+), 6 deletions(-)

diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c
index 8fd7527645..740a6970f2 100644
--- a/src/qemu/qemu_command.c
+++ b/src/qemu/qemu_command.c
@@ -4730,7 +4730,8 @@ qemuBuildVideoCommandLine(virCommand *cmd,

virJSONValue *
qemuBuildPCIHostdevDevProps(const virDomainDef *def,
-                            virDomainHostdevDef *dev)
+                            virDomainHostdevDef *dev,
+                            virDomainObj *vm)

Hmm, perhaps exposing the iommufd object in the XML would save us from
having to pass this.

{
    g_autoptr(virJSONValue) props = NULL;
    virDomainHostdevSubsysPCI *pcisrc = &dev->source.subsys.u.pci;
@@ -4741,6 +4742,13 @@ qemuBuildPCIHostdevDevProps(const virDomainDef *def,
    const char *iommufdId = NULL;
    /* 'ramfb' property must be omitted unless it's to be enabled */
    bool ramfb = pcisrc->ramfb == VIR_TRISTATE_SWITCH_ON;
+    bool useIommufd = false;
+    qemuDomainObjPrivate *priv = vm ? vm->privateData : NULL;
+
+    if (pcisrc->driver.name == VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_VFIO &&
+        pcisrc->driver.iommufd) {
+        useIommufd = true;
+    }

    /* caller has to assign proper passthrough driver name */
    switch (pcisrc->driver.name) {
@@ -4787,6 +4795,18 @@ qemuBuildPCIHostdevDevProps(const virDomainDef *def,
                              NULL) < 0)
        return NULL;

+    if (useIommufd && priv) {
+        g_autofree char *vfioFdName = g_strdup_printf("vfio-%04x:%02x:%02x.%d",
+                                                      pcisrc->addr.domain, 
pcisrc->addr.bus,
+                                                      pcisrc->addr.slot, 
pcisrc->addr.function);
+

There's no need to duplicate the list of hostdevs which use iommufd in a
per-domain hash table.

For storing per-device file descriptors, we have per-device private
data.

+        int vfiofd = GPOINTER_TO_INT(g_hash_table_lookup(priv->vfioDeviceFds, 
vfioFdName));
+        if (virJSONValueObjectAdd(&props,
+                                  "S:fd", g_strdup_printf("%d", vfiofd),
+                                  NULL) < 0)
+            return NULL;
+    }
+
    if (qemuBuildDeviceAddressProps(props, def, dev->info) < 0)
        return NULL;

@@ -5197,12 +5217,14 @@ qemuBuildAcpiNodesetProps(virCommand *cmd,
static int
qemuBuildHostdevCommandLine(virCommand *cmd,
                            const virDomainDef *def,
-                            virQEMUCaps *qemuCaps)
+                            virQEMUCaps *qemuCaps,
+                            virDomainObj *vm)
{
    size_t i;
    g_autoptr(virJSONValue) props = NULL;
    int iommufd = 0;
    const char * iommufdId = "iommufd0";
+    qemuDomainObjPrivate *priv = vm->privateData;

    for (i = 0; i < def->nhostdevs; i++) {
        virDomainHostdevDef *hostdev = def->hostdevs[i];
@@ -5233,8 +5255,10 @@ qemuBuildHostdevCommandLine(virCommand *cmd,

            if (subsys->u.pci.driver.iommufd && iommufd == 0) {
                iommufd = 1;
+                virCommandPassFD(cmd, priv->iommufd, 
VIR_COMMAND_PASS_FD_CLOSE_PARENT);
                if (qemuMonitorCreateObjectProps(&props, "iommufd",
                                                 iommufdId,
+                                                 "S:fd", g_strdup_printf("%d", 
priv->iommufd),
                                                 NULL) < 0)
                    return -1;

@@ -5245,7 +5269,18 @@ qemuBuildHostdevCommandLine(virCommand *cmd,
            if (qemuCommandAddExtDevice(cmd, hostdev->info, def, qemuCaps) < 0)
                return -1;

-            if (!(devprops = qemuBuildPCIHostdevDevProps(def, hostdev)))
+            if (subsys->u.pci.driver.iommufd) {
+                virDomainHostdevSubsysPCI *pcisrc = 
&hostdev->source.subsys.u.pci;
+                g_autofree char *vfioFdName = 
g_strdup_printf("vfio-%04x:%02x:%02x.%d",
+                                                              
pcisrc->addr.domain, pcisrc->addr.bus,
+                                                              pcisrc->addr.slot, 
pcisrc->addr.function);
+
+                int vfiofd = 
GPOINTER_TO_INT(g_hash_table_lookup(priv->vfioDeviceFds, vfioFdName));
+
+                virCommandPassFD(cmd, vfiofd, 
VIR_COMMAND_PASS_FD_CLOSE_PARENT);

This would become just:

qemuDomainHostdevPrivate *priv = (qemuDomainHostdevPrivate *)vsock->privateData;

virCommandPassFD(cmd, priv->vfiofd, VIR_COMMAND_PASS_FD_CLOSE_PARENT);

+            }
+
+            if (!(devprops = qemuBuildPCIHostdevDevProps(def, hostdev, vm)))
                return -1;

            if (qemuBuildDeviceCommandlineFromJSON(cmd, devprops, def, qemuCaps) 
< 0)
@@ -10893,7 +10928,7 @@ qemuBuildCommandLine(virDomainObj *vm,
    if (qemuBuildRedirdevCommandLine(cmd, def, qemuCaps) < 0)
        return NULL;

-    if (qemuBuildHostdevCommandLine(cmd, def, qemuCaps) < 0)
+    if (qemuBuildHostdevCommandLine(cmd, def, qemuCaps, vm) < 0)
        return NULL;

    if (migrateURI)
diff --git a/src/qemu/qemu_command.h b/src/qemu/qemu_command.h
index ad068f1f16..380aac261f 100644
--- a/src/qemu/qemu_command.h
+++ b/src/qemu/qemu_command.h
@@ -180,7 +180,8 @@ qemuBuildThreadContextProps(virJSONValue **tcProps,
/* Current, best practice */
virJSONValue *
qemuBuildPCIHostdevDevProps(const virDomainDef *def,
-                            virDomainHostdevDef *dev);
+                            virDomainHostdevDef *dev,
+                            virDomainObj *vm);

virJSONValue *
qemuBuildRNGDevProps(const virDomainDef *def,
diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index a42721efad..86640aa3e3 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -1953,6 +1953,11 @@ qemuDomainObjPrivateFree(void *data)

    virChrdevFree(priv->devs);

+    if (priv->iommufd >= 0) {
+        virEventRemoveHandle(priv->iommufd);

There is no handle to remove (and none is needed). So no need for the
condition either.

+        priv->iommufd = -1;
+    }
+
    if (priv->pidMonitored >= 0) {
        virEventRemoveHandle(priv->pidMonitored);
        priv->pidMonitored = -1;
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
index 45fc32a663..cecfed94a7 100644
--- a/src/qemu/qemu_process.c
+++ b/src/qemu/qemu_process.c
@@ -25,6 +25,7 @@
#include <unistd.h>
#include <signal.h>
#include <sys/stat.h>
+#include <dirent.h>

We should not need this in qemu_process.

#if WITH_SYS_SYSCALL_H
# include <sys/syscall.h>
#endif
@@ -8091,6 +8092,9 @@ qemuProcessLaunch(virConnectPtr conn,
    if (qemuExtDevicesStart(driver, vm, incomingMigrationExtDevices) < 0)
        goto cleanup;

+    if (qemuProcessOpenVfioFds(vm) < 0)
+        goto cleanup;
+
    if (!(cmd = qemuBuildCommandLine(vm,
                                     incoming ? "defer" : NULL,
                                     vmop,
@@ -10267,3 +10271,231 @@ qemuProcessHandleNbdkitExit(qemuNbdkitProcess *nbdkit,
    qemuProcessEventSubmit(vm, QEMU_PROCESS_EVENT_NBDKIT_EXITED, 0, 0, nbdkit);
    virObjectUnlock(vm);
}
+
+/**
+ * qemuProcessOpenIommuFd:
+ * @vm: domain object
+ * @iommuFd: returned file descriptor
+ *
+ * Opens /dev/iommu file descriptor for the VM.
+ *
+ * Returns: 0 on success, -1 on failure
+ */
+static int
+qemuProcessOpenIommuFd(virDomainObj *vm, int *iommuFd)
+{
+    int fd = -1;
+
+    VIR_DEBUG("Opening IOMMU FD for domain %s", vm->def->name);
+
+    if ((fd = open("/dev/iommu", O_RDWR | O_CLOEXEC)) < 0) {
+        if (errno == ENOENT) {
+            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
+                           _("IOMMU FD support requires /dev/iommu device"));
+        } else {
+            virReportSystemError(errno, "%s",
+                                 _("cannot open /dev/iommu"));
+        }
+        return -1;
+    }
+
+    *iommuFd = fd;
+    VIR_DEBUG("Opened IOMMU FD %d for domain %s", fd, vm->def->name);
+    return 0;
+}
+
+/**
+ * qemuProcessGetVfioDevicePath:
+ * @hostdev: host device definition
+ * @vfioPath: returned VFIO device path
+ *
+ * Constructs the VFIO device path for a PCI hostdev.
+ *
+ * Returns: 0 on success, -1 on failure
+ */
+static int
+qemuProcessGetVfioDevicePath(virDomainHostdevDef *hostdev,

No need to pass the whole hostdev here. Then this function can live in
virpci.c

+                             char **vfioPath)
+{
+    virPCIDeviceAddress *addr;
+    g_autofree char *sysfsPath = NULL;
+    DIR *dir = NULL;
+    struct dirent *entry = NULL;
+    int ret = -1;
+

Jano

Attachment: signature.asc
Description: PGP signature

Reply via email to