On 1/5/2026 8:17 AM, Daniel P. Berrangé wrote:
On Thu, Dec 18, 2025 at 06:19:23PM -0800, Nathan Chen via Devel wrote:
From: Nathan Chen<[email protected]>
Integrate and use the IOMMU_OPTION_RLIMIT_MODE
ioctl to set per-process memory accounting for
iommufd. This prevents ENOMEM errors from the
default per-user memory accounting when multiple
VMs under the libvirt-qemu user have their pinned
memory summed and checked against a per-process
RLIMIT_MEMLOCK limit.
Signed-off-by: Nathan Chen<[email protected]>
---
po/POTFILES | 1 +
src/libvirt_private.syms | 3 ++
src/qemu/qemu_process.c | 7 ++++
src/util/meson.build | 1 +
src/util/viriommufd.c | 89 ++++++++++++++++++++++++++++++++++++++++
src/util/viriommufd.h | 23 +++++++++++
6 files changed, 124 insertions(+)
create mode 100644 src/util/viriommufd.c
create mode 100644 src/util/viriommufd.h
diff --git a/src/util/viriommufd.c b/src/util/viriommufd.c
new file mode 100644
index 0000000000..163ac632ba
--- /dev/null
+++ b/src/util/viriommufd.c
@@ -0,0 +1,89 @@
+#include <config.h>
+
+#include "viriommufd.h"
+#include "virlog.h"
+#include "virerror.h"
+
+#include <sys/ioctl.h>
+#include <linux/types.h>
+
+#define VIR_FROM_THIS VIR_FROM_NONE
+
+#define IOMMUFD_TYPE (';')
+
+#ifndef IOMMUFD_CMD_OPTION
+# define IOMMUFD_CMD_OPTION 0x87
+#endif
+
+#ifndef IOMMU_OPTION
+# define IOMMU_OPTION _IO(IOMMUFD_TYPE, IOMMUFD_CMD_OPTION)
+#endif
+
+VIR_LOG_INIT("util.iommufd");
+
+enum iommufd_option {
+ IOMMU_OPTION_RLIMIT_MODE = 0,
+ IOMMU_OPTION_HUGE_PAGES = 1,
+};
+
+enum iommufd_option_ops {
+ IOMMU_OPTION_OP_SET = 0,
+ IOMMU_OPTION_OP_GET = 1,
+};
+
+struct iommu_option {
+ __u32 size;
+ __u32 option_id;
+ __u16 op;
+ __u16 __reserved;
+ __u32 object_id;
+ __aligned_u64 val64;
+};
These structs and enums duplicate definitions from the kernel uAPI
header linux/iommufd.h - why not use the system headers, or at
least define these conditionally, only if the system header
lacks them, so that we can eventually delete the local re-definitions?
Ok, I will fix this to use the system headers when possible and
conditionally define these if missing from the system header.
+/**
+ * virIOMMUFDSetRLimitMode:
+ * @fd: iommufd file descriptor
+ * @processAccounting: true for per-process, false for per-user
+ *
+ * Set RLIMIT_MEMLOCK accounting mode for the iommufd.
+ *
+ * Returns: 0 on success, -1 on error
+ */
+int
+virIOMMUFDSetRLimitMode(int fd, bool processAccounting)
+{
+ struct iommu_option option = {
+ .size = sizeof(struct iommu_option),
+ .option_id = IOMMU_OPTION_RLIMIT_MODE,
+ .op = IOMMU_OPTION_OP_SET,
+ .__reserved = 0,
+ .object_id = 0,
+ .val64 = processAccounting ? 1 : 0,
+ };
+
+ if (ioctl(fd, IOMMU_OPTION, &option) < 0) {
+ switch (errno) {
+ case ENOTTY:
+ VIR_WARN("IOMMU_OPTION ioctl not supported");
+ return 0;
+
+ case EOPNOTSUPP:
+ VIR_WARN("IOMMU_OPTION_RLIMIT_MODE not supported by kernel");
+ return 0;
+
+ case EINVAL:
+ virReportSystemError(errno, "%s",
+ _("invalid iommufd option parameters"));
+ return -1;
+
+ default:
+ virReportSystemError(errno, "%s",
+ _("failed to set iommufd option"));
+ return -1;
+ }
+ }
So this can also fail with EPERM if lacking CAP_SYS_RESOURCE.
I'm wondering if this is liable to cause problems with KubeVirt
since IIUC they're trying to run libvirt largely unprivileged.
I'm not sure what they do with PCI device assignment though?
I will add non-fatal handling and a warning message for EPERM here,
indicating that the feature will fall back to per-user memory accounting.
In that case, CAP_IPC_LOCK can be manually granted to qemu to bypass
RLIMIT_MEMLOCK entirely if needed (e.g., if KubeVirt runs libvirt
unprivileged but encounters memory limits).