On 11/21/2025 8:37 AM, Ján Tomko wrote:
On a Monday in 2025, Nathan Chen via Devel wrote:
Allow access to /dev/iommu and /dev/vfio/devices/vfio* when launching
a qemu
VM with iommufd feature enabled.
Signed-off-by: Nathan Chen <[email protected]>
---
src/qemu/qemu_cgroup.c | 61 ++++++++++++++++++++++++++++
src/qemu/qemu_cgroup.h | 1 +
src/qemu/qemu_namespace.c | 44 +++++++++++++++++++++
src/security/security_apparmor.c | 15 +++++++
src/security/security_dac.c | 34 ++++++++++++++++
src/security/security_selinux.c | 34 ++++++++++++++++
src/security/virt-aa-helper.c | 11 +++++-
src/util/virpci.c | 68 ++++++++++++++++++++++++++++++++
src/util/virpci.h | 1 +
9 files changed, 268 insertions(+), 1 deletion(-)
diff --git a/src/qemu/qemu_cgroup.c b/src/qemu/qemu_cgroup.c
index 46a7dc1d8b..e15ffd2007 100644
--- a/src/qemu/qemu_cgroup.c
+++ b/src/qemu/qemu_cgroup.c
@@ -461,6 +461,54 @@ qemuTeardownInputCgroup(virDomainObj *vm,
}
+int
+qemuSetupIommufdCgroup(virDomainObj *vm)
+{
+ qemuDomainObjPrivate *priv = vm->privateData;
+ g_autoptr(DIR) dir = NULL;
+ struct dirent *dent;
+ g_autofree char *path = NULL;
+ int iommufd = 0;
+ size_t i;
+
+ for (i = 0; i < vm->def->nhostdevs; i++) {
+ if (vm->def->hostdevs[i]->source.subsys.u.pci.driver.iommufd) {
+ iommufd = 1;
+ break;
+ }
+ }
+
+ if (iommufd == 1) {
+ if (!virCgroupHasController(priv->cgroup,
VIR_CGROUP_CONTROLLER_DEVICES))
+ return 0;
+ if (virDirOpen(&dir, "/dev/vfio/devices") < 0) {
+ if (errno == ENOENT)
+ return 0;
+ return -1;
+ }
+ while (virDirRead(dir, &dent, "/dev/vfio/devices") > 0) {
+ if (STRPREFIX(dent->d_name, "vfio")) {
+ path = g_strdup_printf("/dev/vfio/devices/%s", dent-
>d_name);
+ }
+ if (path &&
+ qemuCgroupAllowDevicePath(vm, path,
+ VIR_CGROUP_DEVICE_RW,
false) < 0) {
+ return -1;
This allows all the devices instead of just the ones the VM needs.
Also, this is still a hostdev, so it should be done inside
qemuSetupHostdevCgroup.
Do hostdevs using iommufd also need access to
a) /dev/vfio/vfio and
b) /dev/vfio/<iommugroup>
which were already allowed in qemuSetupHostdevCgroup?
They don't need access to these. I'll add a check to avoid providing
access to these if iommufd is specified.
+ }
+ path = NULL;
+ }
+ if (virFileExists("/dev/iommu"))
+ path = g_strdup("/dev/iommu");
No need to check for the existence of the device. If it does not exist,
the VM won't start anyway.
Also, is it necessary to allow these? libvirt already opened the files
and passed file descriptors.
Thanks for catching this, I had included this cgroup and namespace logic
from earlier patches when we did not pass the file descriptors. I'll
exclude it in the next revision.
+ if (path &&
+ qemuCgroupAllowDevicePath(vm, path,
+ VIR_CGROUP_DEVICE_RW, false) <
0) {
+ return -1;
+ }
+ }
+ return 0;
+}
+
+
/**
* qemuSetupHostdevCgroup:
* vm: domain object
@@ -759,6 +807,7 @@ qemuSetupDevicesCgroup(virDomainObj *vm)
g_autoptr(virQEMUDriverConfig) cfg = virQEMUDriverGetConfig(priv-
>driver);
const char *const *deviceACL = (const char *const *) cfg-
>cgroupDeviceACL;
int rv = -1;
+ int iommufd = 0;
size_t i;
if (!virCgroupHasController(priv->cgroup,
VIR_CGROUP_CONTROLLER_DEVICES))
@@ -836,6 +885,18 @@ qemuSetupDevicesCgroup(virDomainObj *vm)
return -1;
}
+ for (i = 0; i < vm->def->nhostdevs; i++) {
+ if (vm->def->hostdevs[i]->source.subsys.u.pci.driver.iommufd) {
+ iommufd = 1;
+ break;
+ }
+ }
+
No need to check this upfront. If /dev/iommu access is necessary, the
per-hostdev function qemuSetupHostdevCgroup can add it to the list
multiple times, like it already does for /dev/vfio/vfio
That makes sense, I will remove this.
+ if (iommufd == 1) {
+ if (qemuSetupIommufdCgroup(vm) < 0)
+ return -1;
+ }
+
for (i = 0; i < vm->def->nmems; i++) {
if (qemuSetupMemoryDevicesCgroup(vm, vm->def->mems[i]) < 0)
return -1;
diff --git a/src/qemu/qemu_cgroup.h b/src/qemu/qemu_cgroup.h
index 3668034cde..bea677ba3c 100644
--- a/src/qemu/qemu_cgroup.h
+++ b/src/qemu/qemu_cgroup.h
@@ -42,6 +42,7 @@ int qemuSetupHostdevCgroup(virDomainObj *vm,
int qemuTeardownHostdevCgroup(virDomainObj *vm,
virDomainHostdevDef *dev)
G_GNUC_WARN_UNUSED_RESULT;
+int qemuSetupIommufdCgroup(virDomainObj *vm);
int qemuSetupMemoryDevicesCgroup(virDomainObj *vm,
virDomainMemoryDef *mem);
int qemuTeardownMemoryDevicesCgroup(virDomainObj *vm,
diff --git a/src/qemu/qemu_namespace.c b/src/qemu/qemu_namespace.c
index 932777505b..80496f2f0f 100644
--- a/src/qemu/qemu_namespace.c
+++ b/src/qemu/qemu_namespace.c
@@ -683,6 +683,47 @@ qemuDomainSetupLaunchSecurity(virDomainObj *vm,
}
+static int
+qemuDomainSetupIommufd(virDomainObj *vm,
+ GSList **paths)
+{
+ g_autoptr(DIR) dir = NULL;
+ struct dirent *dent;
+ g_autofree char *path = NULL;
+ int iommufd = 0;
+ size_t i;
+
+ for (i = 0; i < vm->def->nhostdevs; i++) {
+ if (vm->def->hostdevs[i]->source.subsys.u.pci.driver.iommufd) {
+ iommufd = 1;
+ break;
+ }
+ }
+
+ /* Check if iommufd is enabled */
+ if (iommufd == 1) {
+ if (virDirOpen(&dir, "/dev/vfio/devices") < 0) {
+ if (errno == ENOENT)
+ return 0;
+ return -1;
+ }
+ while (virDirRead(dir, &dent, "/dev/vfio/devices") > 0) {
+ if (STRPREFIX(dent->d_name, "vfio")) {
+ path = g_strdup_printf("/dev/vfio/devices/%s", dent-
>d_name);
+ *paths = g_slist_prepend(*paths,
g_steal_pointer(&path));
+ }
+ }
+ path = NULL;
+ if (virFileExists("/dev/iommu"))
+ path = g_strdup("/dev/iommu");
+ if (path)
+ *paths = g_slist_prepend(*paths, g_steal_pointer(&path));
Same comments as for cgroups apply here too.
Ok, I will update the namespace logic as well.
+ }
+
+ return 0;
+}
+
+
static int
qemuNamespaceMknodPaths(virDomainObj *vm,
GSList *paths,
@@ -706,6 +747,9 @@ qemuDomainBuildNamespace(virQEMUDriverConfig *cfg,
if (qemuDomainSetupAllDisks(vm, &paths) < 0)
return -1;
+ if (qemuDomainSetupIommufd(vm, &paths) < 0)
+ return -1;
+
if (qemuDomainSetupAllHostdevs(vm, &paths) < 0)
return -1;
diff --git a/src/security/security_apparmor.c b/src/security/
security_apparmor.c
index 68ac39611f..0a878fd205 100644
--- a/src/security/security_apparmor.c
+++ b/src/security/security_apparmor.c
@@ -856,6 +856,21 @@
AppArmorSetSecurityHostdevLabel(virSecurityManager *mgr,
}
ret = AppArmorSetSecurityPCILabel(pci, vfioGroupDev, ptr);
VIR_FREE(vfioGroupDev);
+
+ if (dev->source.subsys.u.pci.driver.iommufd) {
+ g_autofree char *vfiofdDev =
virPCIDeviceGetIOMMUFDDev(pci);
+ const char *iommufdDir = "/dev/iommu";
+ if (vfiofdDev) {
+ int ret2 = AppArmorSetSecurityPCILabel(pci,
vfiofdDev, ptr);
+ if (ret2 < 0)
+ ret = ret2;
+ ret2 = AppArmorSetSecurityPCILabel(pci,
iommufdDir, ptr);
+ if (ret2 < 0)
+ ret = ret2;
+ } else {
+ return -1;
+ }
+ }
} else {
ret = virPCIDeviceFileIterate(pci,
AppArmorSetSecurityPCILabel, ptr);
}
diff --git a/src/util/virpci.c b/src/util/virpci.c
index 90617e69c6..6e6e5e47c0 100644
--- a/src/util/virpci.c
+++ b/src/util/virpci.c
@@ -2478,6 +2478,74 @@ virPCIDeviceGetIOMMUGroupDev(virPCIDevice *dev)
return g_strdup_printf("/dev/vfio/%s", groupFile);
}
+/* virPCIDeviceGetIOMMUFDDev - return the name of the device used
+ * to control this PCI device's group (e.g. "/dev/vfio/devices/vfio15")
+ */
+char *
+virPCIDeviceGetIOMMUFDDev(virPCIDevice *dev)
+{
+ g_autofree char *path = NULL;
+ const char *pci_addr = NULL;
+ g_autoptr(DIR) dir = NULL;
+ struct dirent *entry;
+ char *vfiodev = NULL;
+
+ /* Get PCI device address */
No need for this kind of comment - it's obvious from the variable and
function names.
+ pci_addr = virPCIDeviceGetName(dev);
+ if (!pci_addr)
+ return NULL;
+
+ /* First try: look in PCI device's vfio-dev subdirectory */
+ path = g_strdup_printf("/sys/bus/pci/devices/%s/vfio-dev",
pci_addr);
+
+ if (virDirOpen(&dir, path) == 1) {
+ while (virDirRead(dir, &entry, path) > 0) {
+ if (!g_str_has_prefix(entry->d_name, "vfio"))
+ continue;
+
+ vfiodev = g_strdup_printf("/dev/vfio/devices/%s", entry-
>d_name);
+ break;
+ }
+ /* g_autoptr will automatically close dir when it goes out of
scope */
This comment is also obvious.
Yes agreed, I will exclude the obvious comments.
+ dir = NULL;
That does not make dir go out of scope. That's a memory leak.
We try not to mix g_auto with manual freeing of variables, so either use
two variables or two different scopes.
I see, I will use two variables.
Thanks,
Nathan