On Thu, Jan 29, 2026 at 02:39:52PM -0800, Nathan Chen wrote: > > > On 1/29/2026 11:58 AM, Pavel Hrdina wrote: > > On Mon, Jan 26, 2026 at 05:17:02PM -0800, Nathan Chen wrote: > > > > > > On 1/26/2026 1:07 PM, Pavel Hrdina wrote: > > > > On Fri, Jan 23, 2026 at 12:30:28PM -0800, Nathan Chen wrote: > > > > > On 1/20/2026 10:24 AM, Pavel Hrdina wrote: > > > > > > On Fri, Jan 16, 2026 at 05:39:33PM -0800, Nathan Chen via Devel > > > > > > wrote: > > > > > > > From: Nathan Chen<[email protected]> > > > > > > > > > > > > > > Implement the IOMMU_OPTION_RLIMIT_MODE > > > > > > > ioctl to set per-process memory accounting for > > > > > > > iommufd. This prevents ENOMEM errors from the > > > > > > > default per-user memory accounting when multiple > > > > > > > VMs under the libvirt-qemu user have their pinned > > > > > > > memory summed and checked against a per-process > > > > > > > RLIMIT_MEMLOCK limit. > > > > > > > > > > > > > > Signed-off-by: Nathan Chen<[email protected]> > > > > > > > --- > > > > > > > meson.build | 1 + > > > > > > > po/POTFILES | 1 + > > > > > > > src/libvirt_private.syms | 3 ++ > > > > > > > src/util/meson.build | 1 + > > > > > > > src/util/viriommufd.c | 111 > > > > > > > +++++++++++++++++++++++++++++++++++++++ > > > > > > > src/util/viriommufd.h | 25 +++++++++ > > > > > > > 6 files changed, 142 insertions(+) > > > > > > > create mode 100644 src/util/viriommufd.c > > > > > > > create mode 100644 src/util/viriommufd.h > > > > > > > > > > > > > > diff --git a/meson.build b/meson.build > > > > > > > index 964d1fa4e1..a6db70f13e 100644 > > > > > > > --- a/meson.build > > > > > > > +++ b/meson.build > > > > > > > @@ -732,6 +732,7 @@ headers = [ > > > > > > > 'ifaddrs.h', > > > > > > > 'libtasn1.h', > > > > > > > 'linux/kvm.h', > > > > > > > + 'linux/iommufd.h', > > > > > > > 'mntent.h', > > > > > > > 'net/ethernet.h', > > > > > > > 'net/if.h', > > > > > > > diff --git a/po/POTFILES b/po/POTFILES > > > > > > > index f0aad35c8c..c78d2b8000 100644 > > > > 
> > > --- a/po/POTFILES > > > > > > > +++ b/po/POTFILES > > > > > > > @@ -303,6 +303,7 @@ src/util/virhostuptime.c > > > > > > > src/util/viridentity.c > > > > > > > src/util/virinhibitor.c > > > > > > > src/util/virinitctl.c > > > > > > > +src/util/viriommufd.c > > > > > > > src/util/viriscsi.c > > > > > > > src/util/virjson.c > > > > > > > src/util/virlease.c > > > > > > > diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms > > > > > > > index 6bffd2eb6d..7fa76a1ec3 100644 > > > > > > > --- a/src/libvirt_private.syms > > > > > > > +++ b/src/libvirt_private.syms > > > > > > > @@ -2646,6 +2646,9 @@ virInhibitorRelease; > > > > > > > virInitctlFifos; > > > > > > > virInitctlSetRunLevel; > > > > > > > +# util/viriommufd.h > > > > > > > +virIOMMUFDSetRLimitMode; > > > > > > > + > > > > > > > # util/viriscsi.h > > > > > > > virISCSIConnectionLogin; > > > > > > > virISCSIConnectionLogout; > > > > > > > diff --git a/src/util/meson.build b/src/util/meson.build > > > > > > > index 4950a795cc..9fb0aa0fe7 100644 > > > > > > > --- a/src/util/meson.build > > > > > > > +++ b/src/util/meson.build > > > > > > > @@ -46,6 +46,7 @@ util_sources = [ > > > > > > > 'viridentity.c', > > > > > > > 'virinhibitor.c', > > > > > > > 'virinitctl.c', > > > > > > > + 'viriommufd.c', > > > > > > > 'viriscsi.c', > > > > > > > 'virjson.c', > > > > > > > 'virkeycode.c', > > > > > > > diff --git a/src/util/viriommufd.c b/src/util/viriommufd.c > > > > > > > new file mode 100644 > > > > > > > index 0000000000..225c76f4b2 > > > > > > > --- /dev/null > > > > > > > +++ b/src/util/viriommufd.c > > > > > > > @@ -0,0 +1,111 @@ > > > > > > > +#include <config.h> > > > > > > > + > > > > > > > +#include "viriommufd.h" > > > > > > > +#include "virlog.h" > > > > > > > +#include "virerror.h" > > > > > > > +#include "virfile.h" > > > > > > > + > > > > > > > +#ifdef __linux__ > > > > > > > + > > > > > > > +# include <sys/ioctl.h> > > > > > > > +# include <linux/types.h> > > > > > > > + > > > > > > > +# 
ifdef HAVE_LINUX_IOMMUFD_H > > > > > > > +# include <linux/iommufd.h> > > > > > > > +# endif > > > > > > > + > > > > > > > +# define VIR_FROM_THIS VIR_FROM_NONE > > > > > > > + > > > > > > > +VIR_LOG_INIT("util.iommufd"); > > > > > > > + > > > > > > > +# ifndef IOMMU_OPTION > > > > > > > + > > > > > > > +enum iommufd_option { > > > > > > > + IOMMU_OPTION_RLIMIT_MODE = 0, > > > > > > > + IOMMU_OPTION_HUGE_PAGES = 1, > > > > > > > +}; > > > > > > > + > > > > > > > +enum iommufd_option_ops { > > > > > > > + IOMMU_OPTION_OP_SET = 0, > > > > > > > + IOMMU_OPTION_OP_GET = 1, > > > > > > > +}; > > > > > > > + > > > > > > > +struct iommu_option { > > > > > > > + __u32 size; > > > > > > > + __u32 option_id; > > > > > > > + __u16 op; > > > > > > > + __u16 __reserved; > > > > > > > + __u32 object_id; > > > > > > > + __aligned_u64 val64; > > > > > > > +}; > > > > > > > + > > > > > > > +# define IOMMUFD_TYPE (';') > > > > > > > +# define IOMMUFD_CMD_OPTION 0x87 > > > > > > > +# define IOMMU_OPTION _IO(IOMMUFD_TYPE, IOMMUFD_CMD_OPTION) > > > > > > > + > > > > > > > +# endif > > > > > > > + > > > > > > > +/** > > > > > > > + * virIOMMUFDSetRLimitMode: > > > > > > > + * @fd: iommufd file descriptor > > > > > > > + * @processAccounting: true for per-process, false for per-user > > > > > > > + * > > > > > > > + * Set RLIMIT_MEMLOCK accounting mode for the iommufd. > > > > > > > + * > > > > > > > + * Returns: 0 on success, -1 on error > > > > > > > + */ > > > > > > > +int > > > > > > > +virIOMMUFDSetRLimitMode(int fd, bool processAccounting) > > > > > > > +{ > > > > > > > + struct iommu_option option = { > > > > > > > + .size = sizeof(struct iommu_option), > > > > > > > + .option_id = IOMMU_OPTION_RLIMIT_MODE, > > > > > > > + .op = IOMMU_OPTION_OP_SET, > > > > > > > + .__reserved = 0, > > > > > > > + .object_id = 0, > > > > > > > + .val64 = processAccounting ? 
1 : 0, > > > > > > > + }; > > > > > > > + > > > > > > > + if (ioctl(fd, IOMMU_OPTION, &option) < 0) { > > > > > > > + switch (errno) { > > > > > > > + case ENOTTY: > > > > > > > + VIR_WARN("IOMMU_OPTION ioctl not supported"); > > > > > > > + return 0; > > > > > > > + > > > > > > > + case EOPNOTSUPP: > > > > > > > + VIR_WARN("IOMMU_OPTION_RLIMIT_MODE not supported > > > > > > > by kernel"); > > > > > > > + return 0; > > > > > > > + > > > > > > > + case EINVAL: > > > > > > > + virReportSystemError(errno, "%s", > > > > > > > + _("invalid iommufd option > > > > > > > parameters")); > > > > > > > + return -1; > > > > > > > + > > > > > > > + case EPERM: > > > > > > > + VIR_WARN("Permission denied for IOMMU_OPTION > > > > > > > ioctl. " > > > > > > > + "Per-user-based memory accounting to be > > > > > > > used by default."); > > > > > > > + return 0; > > > > > > > + > > > > > > > + default: > > > > > > > + virReportSystemError(errno, "%s", > > > > > > > + _("failed to set iommufd > > > > > > > option")); > > > > > > > + return -1; > > > > > > > + } > > > > > > > + } > > > > > > In my previous testing this part of code was not used so no rlimit > > > > > > was > > > > > > configured for the grace hopper GPU that was assigned to a VM. > > > > > > > > > > > > The VM OS was able to see the GPU and I was able to run cuda-samples > > > > > > with most of them passing. This setup didn't use vCMDQ or EGM. When > > > > > > I > > > > > > tried patches that add support for vCMDQ I was no longer able to > > > > > > use the > > > > > > GPU inside the VM until this code was called or setting > > > > > > "setcap cap_ipc_lock=ep" on the qemu binary but it was still > > > > > > detected > > > > > > inside the VM and the VM was started successfully. > > > > > > > > > > > > So is this required for all devices that want to use iommufd in > > > > > > order > > > > > > for them to work correctly inside the VM? Or is it necessary only > > > > > > when > > > > > > specific features are used? 
> > > > > > > > > > > I don’t think the ioctl is required for all devices, but vCMDQ can > > > > > increase > > > > > accounted pinned memory over the per‑user memory locking limit. vCMDQ > > > > > introduces additional guest‑RAM backed queues that could be the extra > > > > > pinned/accounted memory pushing over the memory locking limit. > > > > > Additionally, > > > > > attempting to launch a second iommufd VM could increase accounted > > > > > memory > > > > > over the per-user memory locking limit. > > > > If that ioctl call is not required for all devices we should not call it > > > > unconditionally for all VMs that will try to use iommufd with any > > > > device. > > > > > > > > Libvirt tries to guess correct memory limit for specific cases, see > > > > function qemuDomainGetMemLockLimitBytes() . > > > > > > > > If I manually set 64G hard_limit for VM with 32G ram everything works > > > > even without calling tha ioctl: > > > > > > > > <memtune> > > > > <hard_limit unit='GiB'>64</hard_limit> > > > > </memtune> > > > > > > > > So if we can figure out some reasonable overhead when vCMDQ is used that > > > > would be better solution. > > > > > > > It makes sense that the ioctl should not be used blindly for every iommufd > > > VM. Would you be open to gating the per-process accounting behind a config > > > setting (e.g. iommufd_rlimit_mode=process in libvirtd.conf)? That keeps > > > the > > > default behavior unchanged while accounting for the multi-VM failure case. > > I have no HW with multiple GPUs available to test if this is required or > > not in order to start multiple VMs each using one GPU. > > > > Currently based on my testing for single VM it is not required. Are you > > sure if we need this? If not we can remove this patch. > > > I am sure we need this - I just reproduced the behavior again by removing > the call to this ioctl and launching a second VM when another VM is already > up. The second VM does not boot and we see the following error:
Thanks for testing this. I've managed to get a system with multiple
network devices and was able to reproduce the same error. So it looks
like this is necessary every time iommufd is used.
As I already mentioned, in this case we should always error out if we
cannot set per-process accounting, to make sure that the VM will stay
within the limits set for it by libvirt.
> 2026-01-29 22:35:29.927+0000: 291942: error : qemuProcessReportLogError:2151
> : internal error: QEMU unexpectedly closed the monitor (vm='1gpu-vm-2'):
> 2026-01-29T22:35:29.836876Z qemu-system-aarch64: -device
> {"driver":"vfio-pci","host":"0009:06:00.0","id":"hostdev0","x-vpasid-cap-offset":4088,"iommufd":"iommufd0","fd":"21","bus":"pci.3","addr":"0x0"}:
> vfio hostdev0: memory listener initialization failed: Region ram-node0:
> vfio_container_dma_map(0xabaebbf9bb80, 0x40000000, 0x400000000,
> 0xfeb733e00000) = -12 (Cannot allocate memory)
> error: Failed to start domain '1gpu-vm-2'
> error: internal error: QEMU unexpectedly closed the monitor
> (vm='1gpu-vm-2'): 2026-01-29T22:35:29.836876Z qemu-system-aarch64: -device
> {"driver":"vfio-pci","host":"0009:06:00.0","id":"hostdev0","x-vpasid-cap-offset":4088,"iommufd":"iommufd0","fd":"21","bus":"pci.3","addr":"0x0"}:
> vfio hostdev0: memory listener initialization failed: Region ram-node0:
> vfio_container_dma_map(0xabaebbf9bb80, 0x40000000, 0x400000000,
> 0xfeb733e00000) = -12 (Cannot allocate memory)
>
>
> > > Separately, I'd be happy to add memlock limit adjustments in the vCMDQ
> > > Libvirt patch series under qemuDomainGetMemLockLimitBytes() when vCMDQ is
> > > enabled.
> > It seems that there is no need to make any changes to current code,
> > libvirt already adds extra 1GiB if there is single PCI hostdev attached
> > to the VM.
> >
> > > > > For the case you observed, if it were truly a single isolated QEMU
> > > > > process
> > > > > with no other memlocked usage under the same uid, per‑process vs
> > > > > per‑user
> > > > > should be identical. The fact that switching to per‑process memory
> > > > > accounting fixes the issue suggests there is additional memlocked
> > > > > usage
> > > > > being charged to the libvirt‑qemu uid (e.g. other processes, helper
> > > > > daemons,
> > > > > or device‑related accounting). vCMDQ just pushes the summed memory
> > > > > over the
> > > > > limit.
> > > > When the limit was not high enough I got the following errors in host
> > > > dmesg:
> > > >
> > > > [30507.848263] acpi NVDA200C:03: tegra241_cmdqv: unexpected error
> > > > reported. vintf_map: 0000000000000002, vcmdq_map
> > > > 00000000:00000000:00000000:0000000c
> > > >
> > > > I think this needs additional work in QEMU, starting VM should error out
> > > > if it hits the memory limit instead of silently starting broken VM
> > > > configuration.
> > > Ok, I will discuss with Shameer about erroring out if it hits the memory
> > > limit. Thank you for testing and providing this detailed feedback.
> > I have new details about this error. It only happens when vCMDQ is used
> > and only when VM with vCMDQ is started for the first time after host is
> > power cycled (reboot is not enough to trigger this error).
> >
> > If this happens shutting down the VM and starting it again no longer
> > produce this error and I was able to run cuda-samples inside the VM.
>
> Are you encountering the same behavior with raw QEMU command line when you
> power cycle the host and launch a vCMDQ VM for the first time?
Yes, I was able to reproduce it by running qemu directly as root.
Pavel
signature.asc
Description: PGP signature
