On 5/21/25 16:06, David Francis wrote: > amdgpu CRIU requires an amdgpu CRIU ioctl. This ioctl > has a similar interface to the amdkfd CRIU ioctl. > > The objects that can be checkpointed and restored are bos and vm > mappings. Because a single amdgpu bo can have multiple mappings. > the mappings are recorded separately. > > The ioctl has two modes: PROCESS_INFO, which sends to the user > how many bos and vms to expect, and CHECKPOINT, which copies > data about bos and vms into user-provided buffers. > > Restore is handled using existing amdgpu and drm ioctls. > > The new ioctl lives in a new file amdgpu_criu.c with its own > header amdgpu_criu.h > > Signed-off-by: David Francis <david.fran...@amd.com> > --- > drivers/gpu/drm/amd/amdgpu/Makefile | 2 +- > drivers/gpu/drm/amd/amdgpu/amdgpu_criu.c | 247 +++++++++++++++++++++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_criu.h | 35 ++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 2 + > drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 2 + > include/uapi/drm/amdgpu_drm.h | 45 +++++ > 6 files changed, 332 insertions(+), 1 deletion(-) > create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_criu.c > create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_criu.h > > diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile > b/drivers/gpu/drm/amd/amdgpu/Makefile > index 87080c06e5fc..0863edcdd03f 100644 > --- a/drivers/gpu/drm/amd/amdgpu/Makefile > +++ b/drivers/gpu/drm/amd/amdgpu/Makefile > @@ -63,7 +63,7 @@ amdgpu-y += amdgpu_device.o amdgpu_doorbell_mgr.o > amdgpu_kms.o \ > amdgpu_xgmi.o amdgpu_csa.o amdgpu_ras.o amdgpu_vm_cpu.o \ > amdgpu_vm_sdma.o amdgpu_discovery.o amdgpu_ras_eeprom.o amdgpu_nbio.o \ > amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \ > - amdgpu_fw_attestation.o amdgpu_securedisplay.o \ > + amdgpu_fw_attestation.o amdgpu_securedisplay.o amdgpu_criu.o \ > amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o amdgpu_lsdma.o \ > amdgpu_ring_mux.o amdgpu_xcp.o amdgpu_seq64.o amdgpu_aca.o > amdgpu_dev_coredump.o \ > amdgpu_cper.o amdgpu_userq_fence.o amdgpu_eviction_fence.o > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_criu.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_criu.c > new file mode 100644 > index 000000000000..dbd2d5049eb6 > --- /dev/null > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_criu.c > @@ -0,0 +1,247 @@ > +/* SPDX-License-Identifier: MIT */ > +/* > +* Copyright 2025 Advanced Micro Devices, Inc. > +* > +* Permission is hereby granted, free of charge, to any person obtaining a > +* copy of this software and associated documentation files (the "Software"), > +* to deal in the Software without restriction, including without limitation > +* the rights to use, copy, modify, merge, publish, distribute, sublicense, > +* and/or sell copies of the Software, and to permit persons to whom the > +* Software is furnished to do so, subject to the following conditions: > +* > +* The above copyright notice and this permission notice shall be included in > +* all copies or substantial portions of the Software. > +* > +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR > +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL > +* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR > +* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, > +* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR > +* OTHER DEALINGS IN THE SOFTWARE. > +*/ > + > +#include <linux/dma-buf.h> > +#include <linux/hashtable.h> > +#include <linux/mutex.h> > +#include <linux/random.h> > + > +#include <drm/amdgpu_drm.h> > +#include <drm/drm_device.h> > +#include <drm/drm_file.h> > + > +#include "amdgpu_criu.h" > + > +#include <drm/amdgpu_drm.h> > +#include <drm/drm_drv.h> > +#include <drm/drm_exec.h> > +#include <drm/drm_gem_ttm_helper.h> > +#include <drm/ttm/ttm_tt.h> > + > +#include "amdgpu.h" > +#include "amdgpu_display.h" > +#include "amdgpu_dma_buf.h" > +#include "amdgpu_hmm.h" > +#include "amdgpu_xgmi.h" > + > +static bool is_import(struct amdgpu_bo *bo) > +{ > + if (bo->tbo.base.import_attach) > + return &bo->tbo.base != (struct drm_gem_object > *)bo->tbo.base.import_attach->dmabuf->priv;
Please only check import_attach here, the other check is not really valid. And BTW please make sure to use checkpatch.pl on those patches. That looks like something which might be complained about. > + return false; > +} > + > +static int amdgpu_criu_process_info(struct drm_device *dev, struct drm_file > *data, > + struct drm_amdgpu_criu_args *args) > +{ > + struct drm_gem_object *gobj; > + int id; > + int num_bos = 0; > + int num_vm_mappings = 0; > + struct amdgpu_vm *avm = &((struct amdgpu_fpriv *)data->driver_priv)->vm; Please sort that in reverse xmas tree order. E.g. longer lines first, smaller last. > + > + spin_lock(&data->table_lock); > + idr_for_each_entry(&data->object_idr, gobj, id) { > + struct amdgpu_bo *bo = gem_to_amdgpu_bo(gobj); > + struct amdgpu_vm_bo_base *vm_bo_base; > + > + num_bos += 1; > + > + vm_bo_base = bo->vm_bo; > + > + while (vm_bo_base) { You are not even remotely holding the necessary locks to do this. You need to grab the VM lock for that. > + struct amdgpu_bo_va *bo_va = container_of(vm_bo_base, > struct amdgpu_bo_va, base); > + struct amdgpu_bo_va_mapping *mapping; > + > + if (vm_bo_base->vm == avm) { In general please don't mess with code in a different component. E.g. for that here you should probably use amdgpu_vm_bo_find() instead. > + > + list_for_each_entry(mapping, &bo_va->invalids, > list) { > + num_vm_mappings += 1; > + } > + list_for_each_entry(mapping, &bo_va->valids, > list) { > + num_vm_mappings += 1; > + } Using the BO state machine is a very bad idea, probably better to walk over the VA mapping tree. > + } > + > + vm_bo_base = vm_bo_base->next; > + } > + } > + spin_unlock(&data->table_lock); > + > + args->num_bos = num_bos; > + args->num_vms = num_vm_mappings; Those informations become invalid as soon as you drop the looks. Stuff like that is usually not an acceptable design without explicit documentation. > + args->pid = avm->task_info->pid; This is not something userspace should know about. > + > + return 0; > +} > + > +static uint32_t hardware_flags_to_uapi_flags(struct amdgpu_device *adev, > uint64_t pte_flags) > +{ > + uint32_t gem_flags = 0; > + > + if (pte_flags & AMDGPU_PTE_EXECUTABLE) > + gem_flags |= AMDGPU_VM_PAGE_EXECUTABLE; > + if (pte_flags & AMDGPU_PTE_READABLE) > + gem_flags |= AMDGPU_VM_PAGE_READABLE; > + if (pte_flags & AMDGPU_PTE_WRITEABLE) > + gem_flags |= AMDGPU_VM_PAGE_WRITEABLE; > + if (pte_flags & AMDGPU_PTE_PRT_FLAG(adev)) > + gem_flags |= AMDGPU_VM_PAGE_PRT; > + if (pte_flags & AMDGPU_PTE_NOALLOC) > + gem_flags |= AMDGPU_VM_PAGE_NOALLOC; That won't work like this in some cases. > + > + return gem_flags; > +} > + > +static int amdgpu_criu_checkpoint(struct drm_device *dev, struct drm_file > *data, > + struct drm_amdgpu_criu_args *args) > +{ > + > + struct amdgpu_vm *avm = &((struct amdgpu_fpriv *)data->driver_priv)->vm; > + struct drm_amdgpu_criu_bo_bucket *bo_buckets; > + struct drm_amdgpu_criu_vm_bucket *vm_buckets; > + struct drm_gem_object *gobj; > + int vm_priv_index = 0; > + int bo_index = 0; > + int num_bos = 0; > + int fd, id, ret; > + > + spin_lock(&data->table_lock); > + idr_for_each_entry(&data->object_idr, gobj, id) > + num_bos += 1; > + spin_unlock(&data->table_lock); > + > + if (args->num_bos != num_bos) { > + ret = -EINVAL; > + goto exit; > + } That approach is complete nonsense. The input arguments should just give you the maximum size of the buffer, never an excapt size. > + > + bo_buckets = kvzalloc(num_bos * sizeof(*bo_buckets), GFP_KERNEL); > + if (!bo_buckets) { > + ret = -ENOMEM; > + goto free_buckets; > + } > + > + vm_buckets = kvzalloc(args->num_vms * sizeof(*vm_buckets), GFP_KERNEL); > + if (!vm_buckets) { > + ret = -ENOMEM; > + goto free_vms; > + } > + > + idr_for_each_entry(&data->object_idr, gobj, id) { You can't touch that without holding a lock. > + struct amdgpu_bo *bo = gem_to_amdgpu_bo(gobj); > + struct drm_amdgpu_criu_bo_bucket *bo_bucket; > + struct amdgpu_vm_bo_base *vm_bo_base; > + > + bo_bucket = &bo_buckets[bo_index]; > + > + bo_bucket->size = amdgpu_bo_size(bo); > + bo_bucket->offset = amdgpu_bo_mmap_offset(bo); > + bo_bucket->alloc_flags = bo->flags & > (!AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE); > + bo_bucket->preferred_domains = bo->preferred_domains; > + > + if (is_import(bo)) > + bo_bucket->flags |= AMDGPU_CRIU_BO_FLAG_IS_IMPORT; > + > + drm_gem_prime_handle_to_fd(dev, data, id, 0, &fd); > + if (fd) > + bo_bucket->dmabuf_fd = fd; Complete no-go to call that here. Regards, Christian. > + > + vm_bo_base = bo->vm_bo; > + > + while (vm_bo_base) { > + struct amdgpu_bo_va *bo_va = container_of(vm_bo_base, > struct amdgpu_bo_va, base); > + struct amdgpu_bo_va_mapping *mapping; > + > + if (vm_bo_base->vm == avm) { > + list_for_each_entry(mapping, &bo_va->invalids, > list) { > + vm_buckets[vm_priv_index].start = > mapping->start; > + vm_buckets[vm_priv_index].last = > mapping->last; > + vm_buckets[vm_priv_index].offset = > mapping->offset; > + vm_buckets[vm_priv_index].flags = > hardware_flags_to_uapi_flags(drm_to_adev(dev), mapping->flags); > + vm_buckets[vm_priv_index].gem_handle = > id; > + vm_priv_index += 1; > + > + bo_bucket->addr = mapping->start * > AMDGPU_GPU_PAGE_SIZE; > + } > + list_for_each_entry(mapping, &bo_va->valids, > list) { > + vm_buckets[vm_priv_index].start = > mapping->start; > + vm_buckets[vm_priv_index].last = > mapping->last; > + vm_buckets[vm_priv_index].offset = > mapping->offset; > + vm_buckets[vm_priv_index].flags = > hardware_flags_to_uapi_flags(drm_to_adev(dev), mapping->flags); > + vm_buckets[vm_priv_index].gem_handle = > id; > + vm_priv_index += 1; > + > + bo_bucket->addr = mapping->start * > AMDGPU_GPU_PAGE_SIZE; > + } > + } > + > + vm_bo_base = vm_bo_base->next; > + } > + > + bo_index += 1; > + } > + > + ret = copy_to_user((void __user *)args->bos, bo_buckets, num_bos * > sizeof(*bo_buckets)); > + if (ret) { > + pr_debug("Failed to copy BO information to user\n"); > + ret = -EFAULT; > + goto free_vms; > + } > + > + ret = copy_to_user((void __user *)args->vms, vm_buckets, args->num_vms > * sizeof(*vm_buckets)); > + if (ret) { > + pr_debug("Failed to copy BO information to user\n"); > + ret = -EFAULT; > + goto free_vms; > + } > + > +free_vms: > + kvfree(vm_buckets); > +free_buckets: > + kvfree(bo_buckets); > +exit: > + > + return ret; > +} > + > +int amdgpu_criu_op_ioctl(struct drm_device *dev, void *data, > + struct drm_file *filp) > +{ > + struct drm_amdgpu_criu_args *args = data; > + int ret; > + > + switch (args->op) { > + case AMDGPU_CRIU_OP_PROCESS_INFO: > + ret = amdgpu_criu_process_info(dev, filp, args); > + break; > + case AMDGPU_CRIU_OP_CHECKPOINT: > + ret = amdgpu_criu_checkpoint(dev, filp, args); > + break; > + default: > + ret = -EINVAL; > + break; > + } > + > + return ret; > +} > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_criu.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_criu.h > new file mode 100644 > index 000000000000..01677072f0ed > --- /dev/null > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_criu.h > @@ -0,0 +1,35 @@ > +/* SPDX-License-Identifier: MIT */ > +/* > +* Copyright 2025 Advanced Micro Devices, Inc. > +* > +* Permission is hereby granted, free of charge, to any person obtaining a > +* copy of this software and associated documentation files (the "Software"), > +* to deal in the Software without restriction, including without limitation > +* the rights to use, copy, modify, merge, publish, distribute, sublicense, > +* and/or sell copies of the Software, and to permit persons to whom the > +* Software is furnished to do so, subject to the following conditions: > +* > +* The above copyright notice and this permission notice shall be included in > +* all copies or substantial portions of the Software. > +* > +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR > +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL > +* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR > +* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, > +* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR > +* OTHER DEALINGS IN THE SOFTWARE. > +*/ > + > +#ifndef __AMDGPU_CRIU_H__ > +#define __AMDGPU_CRIU_H__ > + > +#include <drm/amdgpu_drm.h> > + > +int amdgpu_criu_op_ioctl(struct drm_device *dev, void *data, > + struct drm_file *filp); > + > +struct amdgpu_bo *bo_from_criu_global_handle(uint8_t *handle); > +int insert_bo_at_criu_global_handle(struct amdgpu_bo *bo, uint8_t *handle); > + > +#endif > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > index 4db92e0a60da..1b8154395615 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > @@ -53,6 +53,7 @@ > #include "amdgpu_xgmi.h" > #include "amdgpu_userq.h" > #include "amdgpu_userq_fence.h" > +#include "amdgpu_criu.h" > #include "../amdxcp/amdgpu_xcp_drv.h" > > /* > @@ -3021,6 +3022,7 @@ const struct drm_ioctl_desc amdgpu_ioctls_kms[] = { > DRM_IOCTL_DEF_DRV(AMDGPU_USERQ, amdgpu_userq_ioctl, > DRM_AUTH|DRM_RENDER_ALLOW), > DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_SIGNAL, amdgpu_userq_signal_ioctl, > DRM_AUTH|DRM_RENDER_ALLOW), > DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_WAIT, amdgpu_userq_wait_ioctl, > DRM_AUTH|DRM_RENDER_ALLOW), > + DRM_IOCTL_DEF_DRV(AMDGPU_CRIU_OP, amdgpu_criu_op_ioctl, > DRM_AUTH|DRM_RENDER_ALLOW), > }; > > static const struct drm_driver amdgpu_kms_driver = { > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > index a2149afa5803..a8cf2d4580cc 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > @@ -45,6 +45,8 @@ > #include "amdgpu_dma_buf.h" > #include "kfd_debug.h" > > +#include "amdgpu_criu.h" > + > static long kfd_ioctl(struct file *, unsigned int, unsigned long); > static int kfd_open(struct inode *, struct file *); > static int kfd_release(struct inode *, struct file *); > diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h > index 45c4fa13499c..f7c3b160f396 100644 > --- a/include/uapi/drm/amdgpu_drm.h > +++ b/include/uapi/drm/amdgpu_drm.h > @@ -57,6 +57,7 @@ extern "C" { > #define DRM_AMDGPU_USERQ 0x16 > #define DRM_AMDGPU_USERQ_SIGNAL 0x17 > #define DRM_AMDGPU_USERQ_WAIT 0x18 > +#define DRM_AMDGPU_CRIU_OP 0x19 > > #define DRM_IOCTL_AMDGPU_GEM_CREATE DRM_IOWR(DRM_COMMAND_BASE + > DRM_AMDGPU_GEM_CREATE, union drm_amdgpu_gem_create) > #define DRM_IOCTL_AMDGPU_GEM_MMAP DRM_IOWR(DRM_COMMAND_BASE + > DRM_AMDGPU_GEM_MMAP, union drm_amdgpu_gem_mmap) > @@ -77,6 +78,7 @@ extern "C" { > #define DRM_IOCTL_AMDGPU_USERQ DRM_IOWR(DRM_COMMAND_BASE + > DRM_AMDGPU_USERQ, union drm_amdgpu_userq) > #define DRM_IOCTL_AMDGPU_USERQ_SIGNAL DRM_IOWR(DRM_COMMAND_BASE + > DRM_AMDGPU_USERQ_SIGNAL, struct drm_amdgpu_userq_signal) > #define DRM_IOCTL_AMDGPU_USERQ_WAIT DRM_IOWR(DRM_COMMAND_BASE + > DRM_AMDGPU_USERQ_WAIT, struct drm_amdgpu_userq_wait) > +#define DRM_IOCTL_AMDGPU_CRIU_OP DRM_IOWR(DRM_COMMAND_BASE + > DRM_AMDGPU_CRIU_OP, struct drm_amdgpu_criu_args) > > /** > * DOC: memory domains > @@ -1626,6 +1628,49 @@ struct drm_color_ctm_3x4 { > __u64 matrix[12]; > }; > > +/* CRIU ioctl > + * > + * When checkpointing a process, the CRIU amdgpu plugin will perform: > + * 1. INFO op to get information about state that needs to be saved. This > + * pauses execution until the checkpoint is done. > + * 2. CHECKPOINT op to save state > + * > + * Restore uses other ioctls. > + */ > +enum drm_amdgpu_criu_op { > + AMDGPU_CRIU_OP_PROCESS_INFO, > + AMDGPU_CRIU_OP_CHECKPOINT, > +}; > + > +struct drm_amdgpu_criu_args { > + __u64 bos; /* user pointer to bos array */ > + __u64 vms; /* user pointer to private data */ > + __u32 num_bos; > + __u32 num_vms; > + __u32 pid; > + __u32 op; > +}; > + > +#define AMDGPU_CRIU_BO_FLAG_IS_IMPORT (1 << 0) > + > +struct drm_amdgpu_criu_bo_bucket { > + __u64 addr; > + __u64 size; > + __u64 offset; > + __u64 alloc_flags; > + __u32 preferred_domains; > + __u32 dmabuf_fd; > + __u32 flags; > +}; > + > +struct drm_amdgpu_criu_vm_bucket { > + __u64 start; > + __u64 last; > + __u64 offset; > + __u64 flags; > + __u32 gem_handle; > +}; > + > #if defined(__cplusplus) > } > #endif