Open VFIO device FDs inside libvirt's QEMU driver without exposing
them to XML users: one FD is opened per iommufd hostdev from
/dev/vfio/devices/vfioX and passed to QEMU on the command
line.

Signed-off-by: Nathan Chen <[email protected]>
---
 src/conf/domain_conf.h   |   2 +
 src/libvirt_private.syms |   1 +
 src/qemu/qemu_command.c  |  26 ++++++++
 src/qemu/qemu_domain.c   |  39 ++++++++++++
 src/qemu/qemu_domain.h   |  17 +++++
 src/qemu/qemu_process.c  | 130 +++++++++++++++++++++++++++++++++++++++
 src/util/virpci.c        |  69 +++++++++++++++++++++
 src/util/virpci.h        |   2 +
 8 files changed, 286 insertions(+)

diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h
index 4fd8342950..da4ce9fc86 100644
--- a/src/conf/domain_conf.h
+++ b/src/conf/domain_conf.h
@@ -364,6 +364,8 @@ struct _virDomainHostdevDef {
      */
     virDomainNetDef *parentnet;
 
+    virObject *privateData;
+
     virDomainHostdevMode mode;
     virDomainStartupPolicy startupPolicy;
     bool managed;
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
index 4e57e4a8f6..ed2b0d381e 100644
--- a/src/libvirt_private.syms
+++ b/src/libvirt_private.syms
@@ -3159,6 +3159,7 @@ virPCIDeviceGetStubDriverName;
 virPCIDeviceGetStubDriverType;
 virPCIDeviceGetUnbindFromStub;
 virPCIDeviceGetUsedBy;
+virPCIDeviceGetVfioPath;
 virPCIDeviceGetVPD;
 virPCIDeviceHasPCIExpressLink;
 virPCIDeviceIsAssignable;
diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c
index 95d1c2ee98..9b08f66175 100644
--- a/src/qemu/qemu_command.c
+++ b/src/qemu/qemu_command.c
@@ -4756,6 +4756,12 @@ qemuBuildPCIHostdevDevProps(const virDomainDef *def,
     const char *iommufdId = NULL;
     /* 'ramfb' property must be omitted unless it's to be enabled */
     bool ramfb = pcisrc->ramfb == VIR_TRISTATE_SWITCH_ON;
+    bool useIommufd = false;
+
+    if (pcisrc->driver.name == VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_VFIO &&
+        pcisrc->driver.iommufd == VIR_TRISTATE_BOOL_YES) {
+        useIommufd = true;
+    }
 
     /* caller has to assign proper passthrough driver name */
     switch (pcisrc->driver.name) {
@@ -4802,6 +4808,17 @@ qemuBuildPCIHostdevDevProps(const virDomainDef *def,
                               NULL) < 0)
         return NULL;
 
+    /* Pass the pre-opened VFIO device FD to QEMU. "S:" stores a copy of
+     * the string, so the formatted number is freed here, not leaked. */
+    if (useIommufd && dev->privateData &&
+        QEMU_DOMAIN_HOSTDEV_PRIVATE(dev)->vfioDeviceFd >= 0) {
+        g_autofree char *fdstr =
+            g_strdup_printf("%d", QEMU_DOMAIN_HOSTDEV_PRIVATE(dev)->vfioDeviceFd);
+
+        if (virJSONValueObjectAdd(&props, "S:fd", fdstr, NULL) < 0)
+            return NULL;
+    }
+
     if (qemuBuildDeviceAddressProps(props, def, dev->info) < 0)
         return NULL;
 
@@ -5260,6 +5277,15 @@ qemuBuildHostdevCommandLine(virCommand *cmd,
             if (qemuCommandAddExtDevice(cmd, hostdev->info, def, qemuCaps) < 0)
                 return -1;
 
+            if (subsys->u.pci.driver.iommufd == VIR_TRISTATE_BOOL_YES) {
+                qemuDomainHostdevPrivate *hostdevPriv = 
QEMU_DOMAIN_HOSTDEV_PRIVATE(hostdev);
+
+                if (hostdevPriv && hostdevPriv->vfioDeviceFd >= 0) {
+                    virCommandPassFD(cmd, hostdevPriv->vfioDeviceFd,
+                                     VIR_COMMAND_PASS_FD_CLOSE_PARENT);
+                }
+            }
+
             if (!(devprops = qemuBuildPCIHostdevDevProps(def, hostdev)))
                 return -1;
 
diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index ac56fc7cb4..7601bdbb2b 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -1238,6 +1238,45 @@ qemuDomainNetworkPrivateFormat(const virDomainNetDef 
*net,
 }
 
 
+static virClass *qemuDomainHostdevPrivateClass;
+
+/* Dispose function for qemuDomainHostdevPrivate: closes the stored VFIO FD. */
+static void
+qemuDomainHostdevPrivateDispose(void *obj)
+{
+    qemuDomainHostdevPrivate *hostdevPriv = obj;
+    VIR_FORCE_CLOSE(hostdevPriv->vfioDeviceFd);
+}
+
+
+/* Registers the qemuDomainHostdevPrivate class with virClassForObject()
+ * as its parent; hooked up through VIR_ONCE_GLOBAL_INIT() below.
+ * Returns 0 on success, -1 if the class could not be created. */
+static int
+qemuDomainHostdevPrivateOnceInit(void)
+{
+    return VIR_CLASS_NEW(qemuDomainHostdevPrivate, virClassForObject()) ? 0 : -1;
+}
+
+VIR_ONCE_GLOBAL_INIT(qemuDomainHostdevPrivate);
+
+/* Create hostdev private data with no VFIO device FD attached yet.
+ * Returns the new object (as virObject *), or NULL on failure. */
+virObject *
+qemuDomainHostdevPrivateNew(void)
+{
+    qemuDomainHostdevPrivate *hostdevPriv = NULL;
+
+    if (qemuDomainHostdevPrivateInitialize() < 0 ||
+        !(hostdevPriv = virObjectNew(qemuDomainHostdevPrivateClass)))
+        return NULL;
+
+    /* -1 == no FD; consumers test vfioDeviceFd >= 0 before using it */
+    hostdevPriv->vfioDeviceFd = -1;
+    return (virObject *) hostdevPriv;
+}
+
+
 /* qemuDomainSecretInfoSetup:
  * @priv: pointer to domain private object
  * @alias: alias of the secret
diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h
index 3396f929fd..4736f1ede5 100644
--- a/src/qemu/qemu_domain.h
+++ b/src/qemu/qemu_domain.h
@@ -461,6 +461,17 @@ struct _qemuDomainTPMPrivate {
 };
 
 
+#define QEMU_DOMAIN_HOSTDEV_PRIVATE(hostdev) \
+    ((qemuDomainHostdevPrivate *) (hostdev)->privateData)
+
+typedef struct _qemuDomainHostdevPrivate qemuDomainHostdevPrivate;
+struct _qemuDomainHostdevPrivate {
+    virObject parent;
+
+    /* VFIO device FD for iommufd passthrough; created as -1 (no FD held) */
+    int vfioDeviceFd;
+};
+
 void
 qemuDomainNetworkPrivateClearFDs(qemuDomainNetworkPrivate *priv);
 
@@ -1174,3 +1185,9 @@ qemuDomainCheckCPU(virArch arch,
 bool
 qemuDomainMachineSupportsFloppy(const char *machine,
                                 virQEMUCaps *qemuCaps);
+
+virObject *
+qemuDomainHostdevPrivateNew(void);
+
+int qemuProcessOpenVfioFds(virDomainObj *vm);
+void qemuProcessCloseVfioFds(virDomainObj *vm);
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
index 45fc32a663..bf245ee8af 100644
--- a/src/qemu/qemu_process.c
+++ b/src/qemu/qemu_process.c
@@ -106,6 +106,7 @@
 
 #include "logging/log_manager.h"
 #include "logging/log_protocol.h"
+#include "util/virpci.h"
 
 #define VIR_FROM_THIS VIR_FROM_QEMU
 
@@ -8091,6 +8092,9 @@ qemuProcessLaunch(virConnectPtr conn,
     if (qemuExtDevicesStart(driver, vm, incomingMigrationExtDevices) < 0)
         goto cleanup;
 
+    if (qemuProcessOpenVfioFds(vm) < 0)
+        goto cleanup;
+
     if (!(cmd = qemuBuildCommandLine(vm,
                                      incoming ? "defer" : NULL,
                                      vmop,
@@ -10267,3 +10271,129 @@ qemuProcessHandleNbdkitExit(qemuNbdkitProcess *nbdkit,
     qemuProcessEventSubmit(vm, QEMU_PROCESS_EVENT_NBDKIT_EXITED, 0, 0, nbdkit);
     virObjectUnlock(vm);
 }
+
+/**
+ * qemuProcessOpenVfioDeviceFd:
+ * @hostdev: host device definition; must be a PCI subsystem hostdev
+ * @vfioFd: set to the opened file descriptor on success
+ *
+ * Resolves the hostdev's VFIO device node and opens it (O_RDWR | O_CLOEXEC).
+ *
+ * Returns: 0 on success, -1 on failure
+ */
+static int
+qemuProcessOpenVfioDeviceFd(virDomainHostdevDef *hostdev,
+                            int *vfioFd)
+{
+    g_autofree char *devNode = NULL;
+    int devFd;
+
+    if (!(hostdev->mode == VIR_DOMAIN_HOSTDEV_MODE_SUBSYS &&
+          hostdev->source.subsys.type == VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI)) {
+        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+                       _("VFIO FD only supported for PCI hostdevs"));
+        return -1;
+    }
+
+    if (virPCIDeviceGetVfioPath(&hostdev->source.subsys.u.pci.addr,
+                                &devNode) < 0)
+        return -1;
+
+    VIR_DEBUG("Opening VFIO device %s", devNode);
+
+    if ((devFd = open(devNode, O_RDWR | O_CLOEXEC)) >= 0) {
+        *vfioFd = devFd;
+        VIR_DEBUG("Opened VFIO device FD %d for %s", *vfioFd, devNode);
+        return 0;
+    }
+
+    if (errno != ENOENT) {
+        virReportSystemError(errno,
+                             _("cannot open VFIO device %1$s"), devNode);
+    } else {
+        virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
+                       _("VFIO device %1$s not found - ensure device is bound to vfio-pci driver"),
+                       devNode);
+    }
+    return -1;
+}
+
+/**
+ * qemuProcessOpenVfioFds:
+ * @vm: domain object
+ *
+ * For each PCI hostdev that uses the VFIO driver with iommufd enabled,
+ * allocates the hostdev's private data if it is missing and opens the
+ * VFIO device FD into it. On any failure the VFIO FDs stored for the
+ * domain are closed again via qemuProcessCloseVfioFds().
+ *
+ * Returns: 0 on success, -1 on failure
+ */
+int
+qemuProcessOpenVfioFds(virDomainObj *vm)
+{
+    size_t idx;
+
+    for (idx = 0; idx < vm->def->nhostdevs; idx++) {
+        virDomainHostdevDef *hostdev = vm->def->hostdevs[idx];
+        qemuDomainHostdevPrivate *hostdevPriv = NULL;
+
+        /* Skip everything that is not a VFIO PCI hostdev using iommufd */
+        if (hostdev->mode != VIR_DOMAIN_HOSTDEV_MODE_SUBSYS ||
+            hostdev->source.subsys.type != VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI ||
+            hostdev->source.subsys.u.pci.driver.name !=
+                VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_VFIO ||
+            hostdev->source.subsys.u.pci.driver.iommufd != VIR_TRISTATE_BOOL_YES)
+            continue;
+
+        if (!hostdev->privateData &&
+            !(hostdev->privateData = qemuDomainHostdevPrivateNew()))
+            goto error;
+
+        hostdevPriv = QEMU_DOMAIN_HOSTDEV_PRIVATE(hostdev);
+
+        if (qemuProcessOpenVfioDeviceFd(hostdev, &hostdevPriv->vfioDeviceFd) < 0)
+            goto error;
+
+        VIR_DEBUG("Stored VFIO FD %d in hostdev %04x:%02x:%02x.%d private data",
+                  hostdevPriv->vfioDeviceFd,
+                  hostdev->source.subsys.u.pci.addr.domain,
+                  hostdev->source.subsys.u.pci.addr.bus,
+                  hostdev->source.subsys.u.pci.addr.slot,
+                  hostdev->source.subsys.u.pci.addr.function);
+    }
+
+    return 0;
+
+ error:
+    qemuProcessCloseVfioFds(vm);
+    return -1;
+}
+
+/**
+ * qemuProcessCloseVfioFds:
+ * @vm: domain object
+ *
+ * Walks all hostdevs of the domain and closes the VFIO device FD held in
+ * each hostdev's private data.
+ *
+ * Hostdevs that have no private data, or whose stored FD is negative
+ * (nothing open), are skipped.
+ */
+void
+qemuProcessCloseVfioFds(virDomainObj *vm)
+{
+    size_t idx;
+
+    for (idx = 0; idx < vm->def->nhostdevs; idx++) {
+        virDomainHostdevDef *hostdev = vm->def->hostdevs[idx];
+        qemuDomainHostdevPrivate *hostdevPriv = QEMU_DOMAIN_HOSTDEV_PRIVATE(hostdev);
+
+        /* Nothing to do without private data or an open FD */
+        if (!hostdevPriv || hostdevPriv->vfioDeviceFd < 0)
+            continue;
+
+        VIR_DEBUG("Closing VFIO device FD %d", hostdevPriv->vfioDeviceFd);
+        VIR_FORCE_CLOSE(hostdevPriv->vfioDeviceFd);
+    }
+}
diff --git a/src/util/virpci.c b/src/util/virpci.c
index 90617e69c6..da62ece0f6 100644
--- a/src/util/virpci.c
+++ b/src/util/virpci.c
@@ -3320,3 +3320,72 @@ virPCIDeviceAddressFree(virPCIDeviceAddress *address)
 {
     g_free(address);
 }
+
+/**
+ * virPCIDeviceGetVfioPath:
+ * @addr: host device PCI address
+ * @vfioPath: returned VFIO device path (allocated); NULL on failure
+ *
+ * Finds the /dev/vfio/devices/vfioN node that belongs to the PCI device
+ * at @addr. The device's own vfio-dev sysfs directory is consulted first;
+ * if that yields nothing, /sys/class/vfio-dev is scanned for an entry whose
+ * "device" link resolves to exactly this PCI device.
+ *
+ * Returns: 0 on success, -1 on failure
+ */
+int
+virPCIDeviceGetVfioPath(virPCIDeviceAddress *addr,
+                        char **vfioPath)
+{
+    const char *classDir = "/sys/class/vfio-dev";
+    g_autofree char *addrStr = virPCIDeviceAddressAsString(addr);
+    g_autofree char *devDir = NULL;
+    g_autoptr(DIR) devDirp = NULL;
+    g_autoptr(DIR) classDirp = NULL;
+    struct dirent *entry = NULL;
+
+    *vfioPath = NULL;
+    devDir = g_strdup_printf("/sys/bus/pci/devices/%s/vfio-dev/", addrStr);
+
+    /* Open quietly: a missing directory only means "try the next source"
+     * and must not leave a stale error behind when the lookup succeeds. */
+    if (virDirOpenQuiet(&devDirp, devDir) == 1) {
+        while (virDirRead(devDirp, &entry, devDir) > 0) {
+            if (STRPREFIX(entry->d_name, "vfio")) {
+                *vfioPath = g_strdup_printf("/dev/vfio/devices/%s", entry->d_name);
+                return 0;
+            }
+        }
+    }
+
+    if (virDirOpenQuiet(&classDirp, classDir) == 1) {
+        while (virDirRead(classDirp, &entry, classDir) > 0) {
+            g_autofree char *devLink = NULL;
+            g_autofree char *target = NULL;
+            const char *leaf = NULL;
+
+            if (!STRPREFIX(entry->d_name, "vfio"))
+                continue;
+
+            devLink = g_strdup_printf("%s/%s/device", classDir, entry->d_name);
+
+            if (virFileResolveLink(devLink, &target) < 0)
+                continue;
+
+            /* Match the last path component exactly; a substring search
+             * would also hit devices behind a bridge with this address,
+             * since the bridge appears as a parent directory. */
+            leaf = strrchr(target, '/');
+            leaf = leaf ? leaf + 1 : target;
+            if (STREQ(leaf, addrStr)) {
+                *vfioPath = g_strdup_printf("/dev/vfio/devices/%s", entry->d_name);
+                return 0;
+            }
+        }
+    }
+
+    virReportError(VIR_ERR_INTERNAL_ERROR,
+                  _("cannot find VFIO device for PCI device %1$s"),
+                  addrStr);
+    return -1;
+}
diff --git a/src/util/virpci.h b/src/util/virpci.h
index fc538566e1..24ede10755 100644
--- a/src/util/virpci.h
+++ b/src/util/virpci.h
@@ -296,6 +296,8 @@ void virPCIEDeviceInfoFree(virPCIEDeviceInfo *dev);
 
 void virPCIDeviceAddressFree(virPCIDeviceAddress *address);
 
+int virPCIDeviceGetVfioPath(virPCIDeviceAddress *addr, char **vfioPath);
+
 G_DEFINE_AUTOPTR_CLEANUP_FUNC(virPCIDevice, virPCIDeviceFree);
 G_DEFINE_AUTOPTR_CLEANUP_FUNC(virPCIDeviceAddress, virPCIDeviceAddressFree);
 G_DEFINE_AUTOPTR_CLEANUP_FUNC(virPCIEDeviceInfo, virPCIEDeviceInfoFree);
-- 
2.43.0

Reply via email to