On 5/21/25 16:06, David Francis wrote:
> amdgpu CRIU requires an amdgpu CRIU ioctl. This ioctl
> has a similar interface to the amdkfd CRIU ioctl.
> 
> The objects that can be checkpointed and restored are bos and vm
> mappings. Because a single amdgpu bo can have multiple mappings.
> the mappings are recorded separately.
> 
> The ioctl has two modes: PROCESS_INFO, which sends to the user
> how many bos and vms to expect, and CHECKPOINT, which copies
> data about bos and vms into user-provided buffers.
> 
> Restore is handled using existing amdgpu and drm ioctls.
> 
> The new ioctl lives in a new file amdgpu_criu.c with its own
> header amdgpu_criu.h
> 
> Signed-off-by: David Francis <david.fran...@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/Makefile      |   2 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_criu.c | 247 +++++++++++++++++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_criu.h |  35 ++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  |   2 +
>  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |   2 +
>  include/uapi/drm/amdgpu_drm.h            |  45 +++++
>  6 files changed, 332 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_criu.c
>  create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_criu.h
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
> b/drivers/gpu/drm/amd/amdgpu/Makefile
> index 87080c06e5fc..0863edcdd03f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/Makefile
> +++ b/drivers/gpu/drm/amd/amdgpu/Makefile
> @@ -63,7 +63,7 @@ amdgpu-y += amdgpu_device.o amdgpu_doorbell_mgr.o 
> amdgpu_kms.o \
>       amdgpu_xgmi.o amdgpu_csa.o amdgpu_ras.o amdgpu_vm_cpu.o \
>       amdgpu_vm_sdma.o amdgpu_discovery.o amdgpu_ras_eeprom.o amdgpu_nbio.o \
>       amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \
> -     amdgpu_fw_attestation.o amdgpu_securedisplay.o \
> +     amdgpu_fw_attestation.o amdgpu_securedisplay.o amdgpu_criu.o \
>       amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o amdgpu_lsdma.o \
>       amdgpu_ring_mux.o amdgpu_xcp.o amdgpu_seq64.o amdgpu_aca.o 
> amdgpu_dev_coredump.o \
>       amdgpu_cper.o amdgpu_userq_fence.o amdgpu_eviction_fence.o
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_criu.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_criu.c
> new file mode 100644
> index 000000000000..dbd2d5049eb6
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_criu.c
> @@ -0,0 +1,247 @@
> +/* SPDX-License-Identifier: MIT */
> +/*
> +* Copyright 2025 Advanced Micro Devices, Inc.
> +*
> +* Permission is hereby granted, free of charge, to any person obtaining a
> +* copy of this software and associated documentation files (the "Software"),
> +* to deal in the Software without restriction, including without limitation
> +* the rights to use, copy, modify, merge, publish, distribute, sublicense,
> +* and/or sell copies of the Software, and to permit persons to whom the
> +* Software is furnished to do so, subject to the following conditions:
> +*
> +* The above copyright notice and this permission notice shall be included in
> +* all copies or substantial portions of the Software.
> +*
> +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> +* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> +* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> +* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> +* OTHER DEALINGS IN THE SOFTWARE.
> +*/
> +
> +#include <linux/dma-buf.h>
> +#include <linux/hashtable.h>
> +#include <linux/mutex.h>
> +#include <linux/random.h>
> +
> +#include <drm/amdgpu_drm.h>
> +#include <drm/drm_device.h>
> +#include <drm/drm_file.h>
> +
> +#include "amdgpu_criu.h"
> +
> +#include <drm/amdgpu_drm.h>
> +#include <drm/drm_drv.h>
> +#include <drm/drm_exec.h>
> +#include <drm/drm_gem_ttm_helper.h>
> +#include <drm/ttm/ttm_tt.h>
> +
> +#include "amdgpu.h"
> +#include "amdgpu_display.h"
> +#include "amdgpu_dma_buf.h"
> +#include "amdgpu_hmm.h"
> +#include "amdgpu_xgmi.h"
> +
> +static bool is_import(struct amdgpu_bo *bo)
> +{
> +     if (bo->tbo.base.import_attach)
> +             return &bo->tbo.base != (struct drm_gem_object 
> *)bo->tbo.base.import_attach->dmabuf->priv;

Please only check import_attach here, the other check is not really valid.

And BTW please make sure to use checkpatch.pl on those patches. That looks like 
something which might be complained about.


> +     return false;
> +}
> +
> +static int amdgpu_criu_process_info(struct drm_device *dev, struct drm_file 
> *data,
> +                         struct drm_amdgpu_criu_args *args)
> +{
> +     struct drm_gem_object *gobj;
> +     int id;
> +     int num_bos = 0;
> +     int num_vm_mappings = 0;
> +     struct amdgpu_vm *avm = &((struct amdgpu_fpriv *)data->driver_priv)->vm;

Please sort that in reverse xmas tree order.

E.g. longer lines first, smaller last.


> +
> +     spin_lock(&data->table_lock);
> +     idr_for_each_entry(&data->object_idr, gobj, id) {
> +             struct amdgpu_bo *bo = gem_to_amdgpu_bo(gobj);
> +             struct amdgpu_vm_bo_base *vm_bo_base;
> +
> +             num_bos += 1;
> +
> +             vm_bo_base = bo->vm_bo;
> +
> +             while (vm_bo_base) {

You are not even remotely holding the necessary locks to do this. You need to 
grab the VM lock for that.

> +                     struct amdgpu_bo_va *bo_va = container_of(vm_bo_base, 
> struct amdgpu_bo_va, base);


> +                     struct amdgpu_bo_va_mapping *mapping;
> +
> +                     if (vm_bo_base->vm == avm) {

In general please don't mess with code in a different component. E.g. for that 
here you should probably use amdgpu_vm_bo_find() instead.

> +
> +                             list_for_each_entry(mapping, &bo_va->invalids, 
> list) {
> +                                     num_vm_mappings += 1;
> +                             }
> +                             list_for_each_entry(mapping, &bo_va->valids, 
> list) {
> +                                     num_vm_mappings += 1;
> +                             }

Using the BO state machine is a very bad idea, probably better to walk over the 
VA mapping tree.

> +                     }
> +
> +                     vm_bo_base = vm_bo_base->next;
> +             }
> +     }
> +     spin_unlock(&data->table_lock);
> +
> +     args->num_bos = num_bos;
> +     args->num_vms = num_vm_mappings;

Those informations become invalid as soon as you drop the looks. Stuff like 
that is usually not an acceptable design without explicit documentation.

> +     args->pid = avm->task_info->pid;

This is not something userspace should know about.

> +
> +     return 0;
> +}
> +
> +static uint32_t hardware_flags_to_uapi_flags(struct amdgpu_device *adev, 
> uint64_t pte_flags)
> +{
> +     uint32_t gem_flags = 0;
> +
> +     if (pte_flags & AMDGPU_PTE_EXECUTABLE)
> +             gem_flags |= AMDGPU_VM_PAGE_EXECUTABLE;
> +     if (pte_flags & AMDGPU_PTE_READABLE)
> +             gem_flags |= AMDGPU_VM_PAGE_READABLE;
> +     if (pte_flags & AMDGPU_PTE_WRITEABLE)
> +             gem_flags |= AMDGPU_VM_PAGE_WRITEABLE;
> +     if (pte_flags & AMDGPU_PTE_PRT_FLAG(adev))
> +             gem_flags |= AMDGPU_VM_PAGE_PRT;
> +     if (pte_flags & AMDGPU_PTE_NOALLOC)
> +             gem_flags |= AMDGPU_VM_PAGE_NOALLOC;

That won't work like this in some cases.

> +
> +     return gem_flags;
> +}
> +
> +static int amdgpu_criu_checkpoint(struct drm_device *dev, struct drm_file 
> *data,
> +                         struct drm_amdgpu_criu_args *args)
> +{
> +
> +     struct amdgpu_vm *avm = &((struct amdgpu_fpriv *)data->driver_priv)->vm;
> +     struct drm_amdgpu_criu_bo_bucket *bo_buckets;
> +     struct drm_amdgpu_criu_vm_bucket *vm_buckets;
> +     struct drm_gem_object *gobj;
> +     int vm_priv_index = 0;
> +     int bo_index = 0;
> +     int num_bos = 0;
> +     int fd, id, ret;
> +
> +     spin_lock(&data->table_lock);
> +     idr_for_each_entry(&data->object_idr, gobj, id)
> +             num_bos += 1;
> +     spin_unlock(&data->table_lock);
> +
> +     if (args->num_bos != num_bos) {
> +             ret = -EINVAL;
> +             goto exit;
> +     }

That approach is complete nonsense.

The input arguments should just give you the maximum size of the buffer, never 
an excapt size.


> +
> +     bo_buckets = kvzalloc(num_bos * sizeof(*bo_buckets), GFP_KERNEL);
> +     if (!bo_buckets) {
> +             ret = -ENOMEM;
> +             goto free_buckets;
> +     }
> +
> +     vm_buckets = kvzalloc(args->num_vms * sizeof(*vm_buckets), GFP_KERNEL);
> +     if (!vm_buckets) {
> +             ret = -ENOMEM;
> +             goto free_vms;
> +     }
> +
> +     idr_for_each_entry(&data->object_idr, gobj, id) {

You can't touch that without holding a lock.

> +             struct amdgpu_bo *bo = gem_to_amdgpu_bo(gobj);
> +             struct drm_amdgpu_criu_bo_bucket *bo_bucket;
> +             struct amdgpu_vm_bo_base *vm_bo_base;
> +
> +             bo_bucket = &bo_buckets[bo_index];
> +
> +             bo_bucket->size = amdgpu_bo_size(bo);
> +             bo_bucket->offset = amdgpu_bo_mmap_offset(bo);
> +             bo_bucket->alloc_flags = bo->flags & 
> (!AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE);
> +             bo_bucket->preferred_domains = bo->preferred_domains;
> +
> +             if (is_import(bo))
> +                     bo_bucket->flags |= AMDGPU_CRIU_BO_FLAG_IS_IMPORT;
> +
> +             drm_gem_prime_handle_to_fd(dev, data, id, 0, &fd);
> +             if (fd)
> +                     bo_bucket->dmabuf_fd = fd;

Complete no-go to call that here.

Regards,
Christian.

> +
> +             vm_bo_base = bo->vm_bo;
> +
> +             while (vm_bo_base) {
> +                     struct amdgpu_bo_va *bo_va = container_of(vm_bo_base, 
> struct amdgpu_bo_va, base);
> +                     struct amdgpu_bo_va_mapping *mapping;
> +
> +                     if (vm_bo_base->vm == avm) {
> +                             list_for_each_entry(mapping, &bo_va->invalids, 
> list) {
> +                                     vm_buckets[vm_priv_index].start = 
> mapping->start;
> +                                     vm_buckets[vm_priv_index].last = 
> mapping->last;
> +                                     vm_buckets[vm_priv_index].offset = 
> mapping->offset;
> +                                     vm_buckets[vm_priv_index].flags = 
> hardware_flags_to_uapi_flags(drm_to_adev(dev), mapping->flags);
> +                                     vm_buckets[vm_priv_index].gem_handle = 
> id;
> +                                     vm_priv_index += 1;
> +
> +                                     bo_bucket->addr = mapping->start * 
> AMDGPU_GPU_PAGE_SIZE;
> +                             }
> +                             list_for_each_entry(mapping, &bo_va->valids, 
> list) {
> +                                     vm_buckets[vm_priv_index].start = 
> mapping->start;
> +                                     vm_buckets[vm_priv_index].last = 
> mapping->last;
> +                                     vm_buckets[vm_priv_index].offset = 
> mapping->offset;
> +                                     vm_buckets[vm_priv_index].flags = 
> hardware_flags_to_uapi_flags(drm_to_adev(dev), mapping->flags);
> +                                     vm_buckets[vm_priv_index].gem_handle = 
> id;
> +                                     vm_priv_index += 1;
> +
> +                                     bo_bucket->addr = mapping->start * 
> AMDGPU_GPU_PAGE_SIZE;
> +                             }
> +                     }
> +
> +                     vm_bo_base = vm_bo_base->next;
> +             }
> +
> +             bo_index += 1;
> +     }
> +
> +     ret = copy_to_user((void __user *)args->bos, bo_buckets, num_bos * 
> sizeof(*bo_buckets));
> +     if (ret) {
> +             pr_debug("Failed to copy BO information to user\n");
> +             ret = -EFAULT;
> +             goto free_vms;
> +     }
> +
> +     ret = copy_to_user((void __user *)args->vms, vm_buckets, args->num_vms 
> * sizeof(*vm_buckets));
> +     if (ret) {
> +             pr_debug("Failed to copy BO information to user\n");
> +             ret = -EFAULT;
> +             goto free_vms;
> +     }
> +
> +free_vms:
> +     kvfree(vm_buckets);
> +free_buckets:
> +     kvfree(bo_buckets);
> +exit:
> +
> +     return ret;
> +}
> +
> +int amdgpu_criu_op_ioctl(struct drm_device *dev, void *data,
> +                         struct drm_file *filp)
> +{
> +     struct drm_amdgpu_criu_args *args = data;
> +     int ret;
> +
> +     switch (args->op) {
> +     case AMDGPU_CRIU_OP_PROCESS_INFO:
> +             ret = amdgpu_criu_process_info(dev, filp, args);
> +             break;
> +     case AMDGPU_CRIU_OP_CHECKPOINT:
> +             ret = amdgpu_criu_checkpoint(dev, filp, args);
> +             break;
> +     default:
> +             ret = -EINVAL;
> +             break;
> +     }
> +
> +     return ret;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_criu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_criu.h
> new file mode 100644
> index 000000000000..01677072f0ed
> --- /dev/null
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_criu.h
> @@ -0,0 +1,35 @@
> +/* SPDX-License-Identifier: MIT */
> +/*
> +* Copyright 2025 Advanced Micro Devices, Inc.
> +*
> +* Permission is hereby granted, free of charge, to any person obtaining a
> +* copy of this software and associated documentation files (the "Software"),
> +* to deal in the Software without restriction, including without limitation
> +* the rights to use, copy, modify, merge, publish, distribute, sublicense,
> +* and/or sell copies of the Software, and to permit persons to whom the
> +* Software is furnished to do so, subject to the following conditions:
> +*
> +* The above copyright notice and this permission notice shall be included in
> +* all copies or substantial portions of the Software.
> +*
> +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> +* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> +* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> +* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> +* OTHER DEALINGS IN THE SOFTWARE.
> +*/
> +
> +#ifndef __AMDGPU_CRIU_H__
> +#define __AMDGPU_CRIU_H__
> +
> +#include <drm/amdgpu_drm.h>
> +
> +int amdgpu_criu_op_ioctl(struct drm_device *dev, void *data,
> +                         struct drm_file *filp);
> +
> +struct amdgpu_bo *bo_from_criu_global_handle(uint8_t *handle);
> +int insert_bo_at_criu_global_handle(struct amdgpu_bo *bo, uint8_t *handle);
> +
> +#endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 4db92e0a60da..1b8154395615 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -53,6 +53,7 @@
>  #include "amdgpu_xgmi.h"
>  #include "amdgpu_userq.h"
>  #include "amdgpu_userq_fence.h"
> +#include "amdgpu_criu.h"
>  #include "../amdxcp/amdgpu_xcp_drv.h"
>  
>  /*
> @@ -3021,6 +3022,7 @@ const struct drm_ioctl_desc amdgpu_ioctls_kms[] = {
>       DRM_IOCTL_DEF_DRV(AMDGPU_USERQ, amdgpu_userq_ioctl, 
> DRM_AUTH|DRM_RENDER_ALLOW),
>       DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_SIGNAL, amdgpu_userq_signal_ioctl, 
> DRM_AUTH|DRM_RENDER_ALLOW),
>       DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_WAIT, amdgpu_userq_wait_ioctl, 
> DRM_AUTH|DRM_RENDER_ALLOW),
> +     DRM_IOCTL_DEF_DRV(AMDGPU_CRIU_OP, amdgpu_criu_op_ioctl, 
> DRM_AUTH|DRM_RENDER_ALLOW),
>  };
>  
>  static const struct drm_driver amdgpu_kms_driver = {
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> index a2149afa5803..a8cf2d4580cc 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> @@ -45,6 +45,8 @@
>  #include "amdgpu_dma_buf.h"
>  #include "kfd_debug.h"
>  
> +#include "amdgpu_criu.h"
> +
>  static long kfd_ioctl(struct file *, unsigned int, unsigned long);
>  static int kfd_open(struct inode *, struct file *);
>  static int kfd_release(struct inode *, struct file *);
> diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
> index 45c4fa13499c..f7c3b160f396 100644
> --- a/include/uapi/drm/amdgpu_drm.h
> +++ b/include/uapi/drm/amdgpu_drm.h
> @@ -57,6 +57,7 @@ extern "C" {
>  #define DRM_AMDGPU_USERQ             0x16
>  #define DRM_AMDGPU_USERQ_SIGNAL              0x17
>  #define DRM_AMDGPU_USERQ_WAIT                0x18
> +#define DRM_AMDGPU_CRIU_OP           0x19
>  
>  #define DRM_IOCTL_AMDGPU_GEM_CREATE  DRM_IOWR(DRM_COMMAND_BASE + 
> DRM_AMDGPU_GEM_CREATE, union drm_amdgpu_gem_create)
>  #define DRM_IOCTL_AMDGPU_GEM_MMAP    DRM_IOWR(DRM_COMMAND_BASE + 
> DRM_AMDGPU_GEM_MMAP, union drm_amdgpu_gem_mmap)
> @@ -77,6 +78,7 @@ extern "C" {
>  #define DRM_IOCTL_AMDGPU_USERQ               DRM_IOWR(DRM_COMMAND_BASE + 
> DRM_AMDGPU_USERQ, union drm_amdgpu_userq)
>  #define DRM_IOCTL_AMDGPU_USERQ_SIGNAL        DRM_IOWR(DRM_COMMAND_BASE + 
> DRM_AMDGPU_USERQ_SIGNAL, struct drm_amdgpu_userq_signal)
>  #define DRM_IOCTL_AMDGPU_USERQ_WAIT  DRM_IOWR(DRM_COMMAND_BASE + 
> DRM_AMDGPU_USERQ_WAIT, struct drm_amdgpu_userq_wait)
> +#define DRM_IOCTL_AMDGPU_CRIU_OP     DRM_IOWR(DRM_COMMAND_BASE + 
> DRM_AMDGPU_CRIU_OP, struct drm_amdgpu_criu_args)
>  
>  /**
>   * DOC: memory domains
> @@ -1626,6 +1628,49 @@ struct drm_color_ctm_3x4 {
>       __u64 matrix[12];
>  };
>  
> +/* CRIU ioctl
> + *
> + * When checkpointing a process, the CRIU amdgpu plugin will perform:
> + * 1. INFO op to get information about state that needs to be saved. This
> + *    pauses execution until the checkpoint is done.
> + * 2. CHECKPOINT op to save state
> + *
> + * Restore uses other ioctls.
> + */
> +enum drm_amdgpu_criu_op {
> +     AMDGPU_CRIU_OP_PROCESS_INFO,
> +     AMDGPU_CRIU_OP_CHECKPOINT,
> +};
> +
> +struct drm_amdgpu_criu_args {
> +     __u64 bos; /* user pointer to bos array */
> +     __u64 vms; /* user pointer to private data */
> +     __u32 num_bos;
> +     __u32 num_vms;
> +     __u32 pid;
> +     __u32 op;
> +};
> +
> +#define AMDGPU_CRIU_BO_FLAG_IS_IMPORT        (1 << 0)
> +
> +struct drm_amdgpu_criu_bo_bucket {
> +     __u64 addr;
> +     __u64 size;
> +     __u64 offset;
> +     __u64 alloc_flags;
> +     __u32 preferred_domains;
> +     __u32 dmabuf_fd;
> +     __u32 flags;
> +};
> +
> +struct drm_amdgpu_criu_vm_bucket {
> +     __u64 start;
> +     __u64 last;
> +     __u64 offset;
> +     __u64 flags;
> +     __u32 gem_handle;
> +};
> +
>  #if defined(__cplusplus)
>  }
>  #endif

Reply via email to