On Thu, Jan 29, 2026 at 02:39:52PM -0800, Nathan Chen wrote:
> 
> 
> On 1/29/2026 11:58 AM, Pavel Hrdina wrote:
> > On Mon, Jan 26, 2026 at 05:17:02PM -0800, Nathan Chen wrote:
> > > 
> > > On 1/26/2026 1:07 PM, Pavel Hrdina wrote:
> > > > On Fri, Jan 23, 2026 at 12:30:28PM -0800, Nathan Chen wrote:
> > > > > On 1/20/2026 10:24 AM, Pavel Hrdina wrote:
> > > > > > On Fri, Jan 16, 2026 at 05:39:33PM -0800, Nathan Chen via Devel wrote:
> > > > > > > From: Nathan Chen<[email protected]>
> > > > > > > 
> > > > > > > Implement the IOMMU_OPTION_RLIMIT_MODE
> > > > > > > ioctl to set per-process memory accounting for
> > > > > > > iommufd. This prevents ENOMEM errors from the
> > > > > > > default per-user memory accounting when multiple
> > > > > > > VMs under the libvirt-qemu user have their pinned
> > > > > > > memory summed and checked against a per-process
> > > > > > > RLIMIT_MEMLOCK limit.
> > > > > > > 
> > > > > > > Signed-off-by: Nathan Chen<[email protected]>
> > > > > > > ---
> > > > > > >     meson.build              |   1 +
> > > > > > >     po/POTFILES              |   1 +
> > > > > > >     src/libvirt_private.syms |   3 ++
> > > > > > >     src/util/meson.build     |   1 +
> > > > > > >     src/util/viriommufd.c    | 111 +++++++++++++++++++++++++++++++++++++++
> > > > > > >     src/util/viriommufd.h    |  25 +++++++++
> > > > > > >     6 files changed, 142 insertions(+)
> > > > > > >     create mode 100644 src/util/viriommufd.c
> > > > > > >     create mode 100644 src/util/viriommufd.h
> > > > > > > 
> > > > > > > diff --git a/meson.build b/meson.build
> > > > > > > index 964d1fa4e1..a6db70f13e 100644
> > > > > > > --- a/meson.build
> > > > > > > +++ b/meson.build
> > > > > > > @@ -732,6 +732,7 @@ headers = [
> > > > > > >       'ifaddrs.h',
> > > > > > >       'libtasn1.h',
> > > > > > >       'linux/kvm.h',
> > > > > > > +  'linux/iommufd.h',
> > > > > > >       'mntent.h',
> > > > > > >       'net/ethernet.h',
> > > > > > >       'net/if.h',
> > > > > > > diff --git a/po/POTFILES b/po/POTFILES
> > > > > > > index f0aad35c8c..c78d2b8000 100644
> > > > > > > --- a/po/POTFILES
> > > > > > > +++ b/po/POTFILES
> > > > > > > @@ -303,6 +303,7 @@ src/util/virhostuptime.c
> > > > > > >     src/util/viridentity.c
> > > > > > >     src/util/virinhibitor.c
> > > > > > >     src/util/virinitctl.c
> > > > > > > +src/util/viriommufd.c
> > > > > > >     src/util/viriscsi.c
> > > > > > >     src/util/virjson.c
> > > > > > >     src/util/virlease.c
> > > > > > > diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
> > > > > > > index 6bffd2eb6d..7fa76a1ec3 100644
> > > > > > > --- a/src/libvirt_private.syms
> > > > > > > +++ b/src/libvirt_private.syms
> > > > > > > @@ -2646,6 +2646,9 @@ virInhibitorRelease;
> > > > > > >     virInitctlFifos;
> > > > > > >     virInitctlSetRunLevel;
> > > > > > > +# util/viriommufd.h
> > > > > > > +virIOMMUFDSetRLimitMode;
> > > > > > > +
> > > > > > >     # util/viriscsi.h
> > > > > > >     virISCSIConnectionLogin;
> > > > > > >     virISCSIConnectionLogout;
> > > > > > > diff --git a/src/util/meson.build b/src/util/meson.build
> > > > > > > index 4950a795cc..9fb0aa0fe7 100644
> > > > > > > --- a/src/util/meson.build
> > > > > > > +++ b/src/util/meson.build
> > > > > > > @@ -46,6 +46,7 @@ util_sources = [
> > > > > > >       'viridentity.c',
> > > > > > >       'virinhibitor.c',
> > > > > > >       'virinitctl.c',
> > > > > > > +  'viriommufd.c',
> > > > > > >       'viriscsi.c',
> > > > > > >       'virjson.c',
> > > > > > >       'virkeycode.c',
> > > > > > > diff --git a/src/util/viriommufd.c b/src/util/viriommufd.c
> > > > > > > new file mode 100644
> > > > > > > index 0000000000..225c76f4b2
> > > > > > > --- /dev/null
> > > > > > > +++ b/src/util/viriommufd.c
> > > > > > > @@ -0,0 +1,111 @@
> > > > > > > +#include <config.h>
> > > > > > > +
> > > > > > > +#include "viriommufd.h"
> > > > > > > +#include "virlog.h"
> > > > > > > +#include "virerror.h"
> > > > > > > +#include "virfile.h"
> > > > > > > +
> > > > > > > +#ifdef __linux__
> > > > > > > +
> > > > > > > +# include <sys/ioctl.h>
> > > > > > > +# include <linux/types.h>
> > > > > > > +
> > > > > > > +# ifdef HAVE_LINUX_IOMMUFD_H
> > > > > > > +#  include <linux/iommufd.h>
> > > > > > > +# endif
> > > > > > > +
> > > > > > > +# define VIR_FROM_THIS VIR_FROM_NONE
> > > > > > > +
> > > > > > > +VIR_LOG_INIT("util.iommufd");
> > > > > > > +
> > > > > > > +# ifndef IOMMU_OPTION
> > > > > > > +
> > > > > > > +enum iommufd_option {
> > > > > > > +    IOMMU_OPTION_RLIMIT_MODE = 0,
> > > > > > > +    IOMMU_OPTION_HUGE_PAGES = 1,
> > > > > > > +};
> > > > > > > +
> > > > > > > +enum iommufd_option_ops {
> > > > > > > +    IOMMU_OPTION_OP_SET = 0,
> > > > > > > +    IOMMU_OPTION_OP_GET = 1,
> > > > > > > +};
> > > > > > > +
> > > > > > > +struct iommu_option {
> > > > > > > +    __u32 size;
> > > > > > > +    __u32 option_id;
> > > > > > > +    __u16 op;
> > > > > > > +    __u16 __reserved;
> > > > > > > +    __u32 object_id;
> > > > > > > +    __aligned_u64 val64;
> > > > > > > +};
> > > > > > > +
> > > > > > > +#  define IOMMUFD_TYPE (';')
> > > > > > > +#  define IOMMUFD_CMD_OPTION 0x87
> > > > > > > +#  define IOMMU_OPTION _IO(IOMMUFD_TYPE, IOMMUFD_CMD_OPTION)
> > > > > > > +
> > > > > > > +# endif
> > > > > > > +
> > > > > > > +/**
> > > > > > > + * virIOMMUFDSetRLimitMode:
> > > > > > > + * @fd: iommufd file descriptor
> > > > > > > + * @processAccounting: true for per-process, false for per-user
> > > > > > > + *
> > > > > > > + * Set RLIMIT_MEMLOCK accounting mode for the iommufd.
> > > > > > > + *
> > > > > > > + * Returns: 0 on success, -1 on error
> > > > > > > + */
> > > > > > > +int
> > > > > > > +virIOMMUFDSetRLimitMode(int fd, bool processAccounting)
> > > > > > > +{
> > > > > > > +    struct iommu_option option = {
> > > > > > > +        .size = sizeof(struct iommu_option),
> > > > > > > +        .option_id = IOMMU_OPTION_RLIMIT_MODE,
> > > > > > > +        .op = IOMMU_OPTION_OP_SET,
> > > > > > > +        .__reserved = 0,
> > > > > > > +        .object_id = 0,
> > > > > > > +        .val64 = processAccounting ? 1 : 0,
> > > > > > > +    };
> > > > > > > +
> > > > > > > +    if (ioctl(fd, IOMMU_OPTION, &option) < 0) {
> > > > > > > +        switch (errno) {
> > > > > > > +            case ENOTTY:
> > > > > > > +                VIR_WARN("IOMMU_OPTION ioctl not supported");
> > > > > > > +                return 0;
> > > > > > > +
> > > > > > > +            case EOPNOTSUPP:
> > > > > > > +                VIR_WARN("IOMMU_OPTION_RLIMIT_MODE not supported 
> > > > > > > by kernel");
> > > > > > > +                return 0;
> > > > > > > +
> > > > > > > +            case EINVAL:
> > > > > > > +                virReportSystemError(errno, "%s",
> > > > > > > +                                     _("invalid iommufd option parameters"));
> > > > > > > +                return -1;
> > > > > > > +
> > > > > > > +            case EPERM:
> > > > > > > +                VIR_WARN("Permission denied for IOMMU_OPTION 
> > > > > > > ioctl. "
> > > > > > > +                         "Per-user-based memory accounting to be 
> > > > > > > used by default.");
> > > > > > > +                return 0;
> > > > > > > +
> > > > > > > +            default:
> > > > > > > +                virReportSystemError(errno, "%s",
> > > > > > > +                                     _("failed to set iommufd option"));
> > > > > > > +                return -1;
> > > > > > > +        }
> > > > > > > +    }
> > > > > > In my previous testing this part of the code was not used, so no
> > > > > > rlimit was configured for the Grace Hopper GPU that was assigned
> > > > > > to a VM.
> > > > > > 
> > > > > > The VM OS was able to see the GPU and I was able to run cuda-samples
> > > > > > with most of them passing. This setup didn't use vCMDQ or EGM. When
> > > > > > I tried patches that add support for vCMDQ, I was no longer able to
> > > > > > use the GPU inside the VM until either this code was called or
> > > > > > "setcap cap_ipc_lock=ep" was set on the qemu binary; the GPU was
> > > > > > still detected inside the VM and the VM started successfully, though.
> > > > > > 
> > > > > > So is this required for all devices that want to use iommufd in
> > > > > > order for them to work correctly inside the VM? Or is it necessary
> > > > > > only when specific features are used?
> > > > > > 
> > > > > I don't think the ioctl is required for all devices, but vCMDQ can
> > > > > push accounted pinned memory over the per-user memory locking limit:
> > > > > it introduces additional guest-RAM backed queues whose pinned memory
> > > > > is charged on top of the guest RAM. Launching a second iommufd VM
> > > > > under the same user can likewise push the summed accounted memory
> > > > > over that per-user limit.
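> > > > >
> > > > > As a rough worked example of that failure mode: for a 32 GiB guest
> > > > > with one VFIO hostdev, libvirt sets RLIMIT_MEMLOCK to roughly guest
> > > > > RAM + 1 GiB = 33 GiB. A single VM stays within that, but with
> > > > > per-user accounting a second identical VM under the libvirt-qemu uid
> > > > > pushes the summed pinned memory towards 64 GiB, which is checked
> > > > > against each process's 33 GiB limit and fails with ENOMEM.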
> > > > If that ioctl call is not required for all devices, we should not call
> > > > it unconditionally for all VMs that will try to use iommufd with any
> > > > device.
> > > > 
> > > > Libvirt tries to guess the correct memory limit for specific cases, see
> > > > the function qemuDomainGetMemLockLimitBytes().
> > > > 
> > > > If I manually set a 64G hard_limit for a VM with 32G of RAM, everything
> > > > works even without calling the ioctl:
> > > > 
> > > >     <memtune>
> > > >       <hard_limit unit='GiB'>64</hard_limit>
> > > >     </memtune>
> > > > 
> > > > So if we can figure out some reasonable overhead when vCMDQ is used,
> > > > that would be a better solution.
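> > > >
> > > > For illustration only, a rough sketch of what such an adjustment could
> > > > look like next to qemuDomainGetMemLockLimitBytes() (the helper name and
> > > > the 256 MiB value are placeholders, not measured numbers):
> > > >
> > > >     /* Hypothetical helper: extra pinned memory to account for the
> > > >      * guest-RAM backed vCMDQ queues, added on top of the limit libvirt
> > > >      * already computes for VFIO hostdevs. */
> > > >     static unsigned long long
> > > >     qemuDomainGetVCMDQOverheadKiB(bool vcmdqEnabled)
> > > >     {
> > > >         /* Placeholder value; it would need to be derived from the
> > > >          * actual queue sizes used by the tegra241-cmdqv hardware. */
> > > >         return vcmdqEnabled ? 256ULL * 1024 : 0;
> > > >     }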
> > > > 
> > > It makes sense that the ioctl should not be used blindly for every iommufd
> > > VM. Would you be open to gating the per-process accounting behind a config
> > > setting (e.g. iommufd_rlimit_mode=process in libvirtd.conf)? That keeps 
> > > the
> > > default behavior unchanged while accounting for the multi-VM failure case.
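> > >
> > > For illustration, the proposed knob (not an existing libvirt setting)
> > > could look something like this:
> > >
> > >     # Hypothetical entry in libvirtd.conf:
> > >     # "user" keeps the current per-user accounting (default),
> > >     # "process" switches iommufd to per-process accounting.
> > >     iommufd_rlimit_mode = "process"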
> > I have no HW with multiple GPUs available to test whether this is
> > required in order to start multiple VMs each using one GPU.
> > 
> > Currently, based on my testing with a single VM, it is not required. Are
> > you sure we need this? If not, we can remove this patch.
> > 
> I am sure we need this - I just reproduced the behavior again by removing
> the call to this ioctl and launching a second VM when another VM is already
> up. The second VM does not boot and we see the following error:

Thanks for testing this. I've managed to get a system with multiple
network devices and reproduced the same error there. So it looks like
this is necessary every time iommufd is used.

As I already mentioned, in this case we should always error out if we
cannot set per-process accounting, to make sure that the VM stays within
the limits libvirt sets for it.
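
For illustration, an untested sketch of what that would mean in
virIOMMUFDSetRLimitMode() from this patch, keeping the same iommu_option
setup and treating every ioctl failure as fatal:

    if (ioctl(fd, IOMMU_OPTION, &option) < 0) {
        /* Sketch only: no VIR_WARN fallbacks; failing to switch to
         * per-process accounting means the VM could later exceed the
         * memlock limit libvirt computed for it, so report and fail. */
        virReportSystemError(errno, "%s",
                             _("failed to enable per-process RLIMIT_MEMLOCK accounting for iommufd"));
        return -1;
    }

    return 0;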

> 2026-01-29 22:35:29.927+0000: 291942: error : qemuProcessReportLogError:2151
> : internal error: QEMU unexpectedly closed the monitor (vm='1gpu-vm-2'):
> 2026-01-29T22:35:29.836876Z qemu-system-aarch64: -device 
> {"driver":"vfio-pci","host":"0009:06:00.0","id":"hostdev0","x-vpasid-cap-offset":4088,"iommufd":"iommufd0","fd":"21","bus":"pci.3","addr":"0x0"}:
> vfio hostdev0: memory listener initialization failed: Region ram-node0:
> vfio_container_dma_map(0xabaebbf9bb80, 0x40000000, 0x400000000,
> 0xfeb733e00000) = -12 (Cannot allocate memory)
> error: Failed to start domain '1gpu-vm-2'
> error: internal error: QEMU unexpectedly closed the monitor
> (vm='1gpu-vm-2'): 2026-01-29T22:35:29.836876Z qemu-system-aarch64: -device 
> {"driver":"vfio-pci","host":"0009:06:00.0","id":"hostdev0","x-vpasid-cap-offset":4088,"iommufd":"iommufd0","fd":"21","bus":"pci.3","addr":"0x0"}:
> vfio hostdev0: memory listener initialization failed: Region ram-node0:
> vfio_container_dma_map(0xabaebbf9bb80, 0x40000000, 0x400000000,
> 0xfeb733e00000) = -12 (Cannot allocate memory)
> 
> 
> > > Separately, I'd be happy to add memlock limit adjustments in the vCMDQ
> > > Libvirt patch series under qemuDomainGetMemLockLimitBytes() when vCMDQ is
> > > enabled.
> > It seems that there is no need to make any changes to the current code;
> > libvirt already adds an extra 1GiB if there is a single PCI hostdev
> > attached to the VM.
> > 
> > > > > For the case you observed, if it were truly a single isolated QEMU 
> > > > > process
> > > > > with no other memlocked usage under the same uid, per‑process vs 
> > > > > per‑user
> > > > > should be identical. The fact that switching to per‑process memory
> > > > > accounting fixes the issue suggests there is additional memlocked 
> > > > > usage
> > > > > being charged to the libvirt‑qemu uid (e.g. other processes, helper 
> > > > > daemons,
> > > > > or device‑related accounting). vCMDQ just pushes the summed memory 
> > > > > over the
> > > > > limit.
> > > > When the limit was not high enough I got the following errors in host
> > > > dmesg:
> > > > 
> > > > [30507.848263] acpi NVDA200C:03: tegra241_cmdqv: unexpected error reported. vintf_map: 0000000000000002, vcmdq_map 00000000:00000000:00000000:0000000c
> > > > 
> > > > I think this needs additional work in QEMU: starting the VM should
> > > > error out if it hits the memory limit instead of silently starting a
> > > > broken VM configuration.
> > > Ok, I will discuss with Shameer about erroring out if it hits the memory
> > > limit. Thank you for testing and providing this detailed feedback.
> > I have new details about this error. It only happens when vCMDQ is used,
> > and only when a VM with vCMDQ is started for the first time after the
> > host is power cycled (a reboot is not enough to trigger this error).
> > 
> > If this happens, shutting down the VM and starting it again no longer
> > produces the error, and I was able to run cuda-samples inside the VM.
> 
> Are you encountering the same behavior with a raw QEMU command line when
> you power cycle the host and launch a vCMDQ VM for the first time?

Yes, I was able to reproduce it by running qemu directly as root.

Pavel
