On 2026-05-08 02:17, Yifan Zhang wrote:
During Mode 1 reset, the ASIC undergoes a reset cycle and becomes temporarily
inaccessible via PCIe. Any attempt to access framebuffer or MMIO registers
during this window can result in uncompleted PCIe transactions, leading to
NMI panics or system hangs.

To prevent this, unmap all of the applications' mappings of the framebuffer
and doorbell BARs before Mode 1 reset. Also prevent new mappings from being
established during the reset process.

v2: remove inode in kfd_dev (Christian)

Signed-off-by: Yifan Zhang<[email protected]>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  1 +
  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 11 ++++++++++
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    |  6 +++++
  drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c       | 17 ++++++++++++--
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      | 22 +++++++++++++++++++
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  1 +
  6 files changed, 56 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 2bf6a31c194d..5333e052d56d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -360,6 +360,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device 
*adev,
                uint64_t size, u32 alloc_flag, int8_t xcp_id);
  void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
                uint64_t size, u32 alloc_flag, int8_t xcp_id);
+void amdgpu_amdkfd_clear_kfd_mapping(struct amdgpu_device *adev);
u64 amdgpu_amdkfd_xcp_memory_size(struct amdgpu_device *adev, int xcp_id);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 7c01492e69dd..2b06a2dae3da 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -139,6 +139,17 @@ void amdgpu_amdkfd_reserve_system_mem(uint64_t size)
        kfd_mem_limit.system_mem_used += size;
  }
+void amdgpu_amdkfd_clear_kfd_mapping(struct amdgpu_device *adev)

This function isn't really related to the KFD GPUVM code. Please move this into amdgpu_amdkfd.c instead.


+{
+       if (!adev->kfd.dev)
+               return;
+
+       kfd_dev_unmap_mapping_range(KFD_MMAP_TYPE_DOORBELL,

This doesn't work correctly. The mapping address also includes the GPUID. See this code in kfd_ioctl_create_queue:

        /* Return gpu_id as doorbell offset for mmap usage */
        args->doorbell_offset = KFD_MMAP_TYPE_DOORBELL;
        args->doorbell_offset |= KFD_MMAP_GPU_ID(args->gpu_id);
I think you'll need to include the GPUID in the address you're unmapping.


+                                   kfd_doorbell_process_slice(adev->kfd.dev));
+       kfd_dev_unmap_mapping_range(KFD_MMAP_TYPE_MMIO, PAGE_SIZE);

Same here. You need to include the GPUID.

        /* MMIO is mapped through kfd device
         * Generate a kfd mmap offset
         */
        if (flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP)
                args->mmap_offset = KFD_MMAP_TYPE_MMIO
                                        | KFD_MMAP_GPU_ID(args->gpu_id);

Regards,
  Felix


+}
+
+
  /* Estimate page table size needed to represent a given memory size
   *
   * With 4KB pages, we need one 8 byte PTE for each 4KB of memory
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 073f632f295a..c741a1a2a8cb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5840,6 +5840,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
        /* We need to lock reset domain only once both for XGMI and single device */
        amdgpu_device_recovery_get_reset_lock(adev, &device_list);
+ /* unmap all the mappings of doorbell and framebuffer to prevent user space from
+        * accessing them
+        */
+       unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
+       amdgpu_amdkfd_clear_kfd_mapping(adev);
+
        amdgpu_device_halt_activities(adev, job, reset_context, &device_list,
                                      hive, need_emergency_restart);
        if (need_emergency_restart)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index 0071d6957828..1dd343f0219f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -46,6 +46,7 @@
  #include "amdgpu_hmm.h"
  #include "amdgpu_xgmi.h"
  #include "amdgpu_vm.h"
+#include "amdgpu_reset.h"
static int
  amdgpu_gem_add_input_fence(struct drm_file *filp,
@@ -118,13 +119,21 @@ amdgpu_gem_update_timeline_node(struct drm_file *filp,
  static vm_fault_t amdgpu_gem_fault(struct vm_fault *vmf)
  {
        struct ttm_buffer_object *bo = vmf->vma->vm_private_data;
+       struct amdgpu_device *adev = amdgpu_ttm_adev(bo->bdev);
        struct drm_device *ddev = bo->base.dev;
        vm_fault_t ret;
        int idx;
+ /* Prevent new mappings from coming in during reset */
+
+       if (!down_read_trylock(&adev->reset_domain->sem))
+               return VM_FAULT_SIGSEGV;
+
        ret = ttm_bo_vm_reserve(bo, vmf);
-       if (ret)
+       if (ret) {
+               up_read(&adev->reset_domain->sem);
                return ret;
+       }
if (drm_dev_enter(ddev, &idx)) {
                ret = amdgpu_bo_fault_reserve_notify(bo);
@@ -140,11 +149,15 @@ static vm_fault_t amdgpu_gem_fault(struct vm_fault *vmf)
        } else {
                ret = ttm_bo_vm_dummy_page(vmf, vmf->vma->vm_page_prot);
        }
-       if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
+       if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
+               up_read(&adev->reset_domain->sem);
                return ret;
+       }
unlock:
        dma_resv_unlock(bo->base.resv);
+       up_read(&adev->reset_domain->sem);
+
        return ret;
  }
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 84b9bde7f371..1be1b1dd2341 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -69,6 +69,21 @@ static const struct class kfd_class = {
        .name = kfd_dev_name,
  };
+/*
+ * Cache the address space of the chardev on first open so that the reset
+ * path can drop all userspace mappings of doorbell and MMIO ranges via
+ * unmap_mapping_range().
+ */
+static struct address_space *kfd_dev_mapping;
+
+void kfd_dev_unmap_mapping_range(loff_t const holebegin, loff_t const holelen)
+{
+       struct address_space *mapping = READ_ONCE(kfd_dev_mapping);
+
+       if (mapping)
+               unmap_mapping_range(mapping, holebegin, holelen, 1);
+}
+
  static inline struct kfd_process_device *kfd_lock_pdd_by_id(struct 
kfd_process *p, __u32 gpu_id)
  {
        struct kfd_process_device *pdd;
@@ -135,6 +150,13 @@ static int kfd_open(struct inode *inode, struct file 
*filep)
        if (iminor(inode) != 0)
                return -ENODEV;
+ /*
+        * /dev/kfd is a single chardev so all opens share one inode. Cache
+        * its address_space on the first open for use by the reset path.
+        */
+       if (!READ_ONCE(kfd_dev_mapping))
+               cmpxchg(&kfd_dev_mapping, NULL, inode->i_mapping);
+
        is_32bit_user_mode = in_compat_syscall();
if (is_32bit_user_mode) {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index a6ff1db477f9..f037062c33ea 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -399,6 +399,7 @@ enum kfd_mempool {
  /* Character device interface */
  int kfd_chardev_init(void);
  void kfd_chardev_exit(void);
+void kfd_dev_unmap_mapping_range(loff_t const holebegin, loff_t const holelen);
/**
   * enum kfd_unmap_queues_filter - Enum for queue filters.

Reply via email to