On 2026-05-11 09:22, Yifan Zhang wrote:
During Mode 1 reset, the ASIC undergoes a reset cycle and becomes temporarily
inaccessible via PCIe. Any attempt to access framebuffer or MMIO registers 
during
this window can result in uncompleted PCIe transactions, leading to NMI panics 
or
system hangs.

To prevent this, Unmap all of the applications mappings of the framebuffer
and doorbell BARs before mode1 reset. Also prevent new mappings from coming in
during the reset process.

v2: remove inode in kfd_dev (Christian)
v3: correct unmap offset (Felix), remove prevent new mappings part to avoid 
deadlock (Christian)

Signed-off-by: Yifan Zhang <[email protected]>

As far as KFD is concerned, this patch is

Reviewed-by: Felix Kuehling <[email protected]>


---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 22 ++++++++++++++++++++++
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  6 ++++++
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c   | 22 ++++++++++++++++++++++
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h      |  1 +
  5 files changed, 52 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 7b10bbe28caf..d1dac3412a66 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -36,6 +36,7 @@
  #include "amdgpu_ras.h"
  #include "amdgpu_umc.h"
  #include "amdgpu_reset.h"
+#include "kfd_priv.h"
/* Total memory size in system memory and all GPU VRAM. Used to
   * estimate worst case amount of memory to reserve for page tables
@@ -320,6 +321,27 @@ void amdgpu_amdkfd_gpu_reset(struct amdgpu_device *adev)
                (void)amdgpu_reset_domain_schedule(adev->reset_domain, 
&adev->kfd.reset_work);
  }
+void amdgpu_amdkfd_clear_kfd_mapping(struct amdgpu_device *adev)
+{
+       struct kfd_dev *kfd = adev->kfd.dev;
+       unsigned int i;
+
+       if (!kfd)
+               return;
+
+       for (i = 0; i < kfd->num_nodes; i++) {
+               struct kfd_node *node = kfd->nodes[i];
+
+               kfd_dev_unmap_mapping_range(KFD_MMAP_TYPE_DOORBELL |
+                                           KFD_MMAP_GPU_ID(node->id),
+                                           kfd_doorbell_process_slice(kfd));
+               kfd_dev_unmap_mapping_range(KFD_MMAP_TYPE_MMIO |
+                                           KFD_MMAP_GPU_ID(node->id),
+                                           PAGE_SIZE);
+       }
+}
+
+
  int amdgpu_amdkfd_alloc_kernel_mem(struct amdgpu_device *adev, size_t size,
                                u32 domain, void **mem_obj, uint64_t *gpu_addr,
                                void **cpu_ptr, bool cp_mqd_gfx9)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 2bf6a31c194d..5333e052d56d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -360,6 +360,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device 
*adev,
                uint64_t size, u32 alloc_flag, int8_t xcp_id);
  void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
                uint64_t size, u32 alloc_flag, int8_t xcp_id);
+void amdgpu_amdkfd_clear_kfd_mapping(struct amdgpu_device *adev);
u64 amdgpu_amdkfd_xcp_memory_size(struct amdgpu_device *adev, int xcp_id); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 1202a72ff063..6760c9331f46 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5844,6 +5844,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
        /* We need to lock reset domain only once both for XGMI and single 
device */
        amdgpu_device_recovery_get_reset_lock(adev, &device_list);
+ /* unmap all the mappings of doorbell and framebuffer to prevent user space from
+        * accessing them
+        */
+       unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
+       amdgpu_amdkfd_clear_kfd_mapping(adev);
+
        amdgpu_device_halt_activities(adev, job, reset_context, &device_list,
                                      hive, need_emergency_restart);
        if (need_emergency_restart)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 84b9bde7f371..1be1b1dd2341 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -69,6 +69,21 @@ static const struct class kfd_class = {
        .name = kfd_dev_name,
  };
+/*
+ * Cache the address space of the chardev on first open so that the reset
+ * path can drop all userspace mappings of doorbell and MMIO ranges via
+ * unmap_mapping_range().
+ */
+static struct address_space *kfd_dev_mapping;
+
+void kfd_dev_unmap_mapping_range(loff_t const holebegin, loff_t const holelen)
+{
+       struct address_space *mapping = READ_ONCE(kfd_dev_mapping);
+
+       if (mapping)
+               unmap_mapping_range(mapping, holebegin, holelen, 1);
+}
+
  static inline struct kfd_process_device *kfd_lock_pdd_by_id(struct 
kfd_process *p, __u32 gpu_id)
  {
        struct kfd_process_device *pdd;
@@ -135,6 +150,13 @@ static int kfd_open(struct inode *inode, struct file 
*filep)
        if (iminor(inode) != 0)
                return -ENODEV;
+ /*
+        * /dev/kfd is a single chardev so all opens share one inode. Cache
+        * its address_space on the first open for use by the reset path.
+        */
+       if (!READ_ONCE(kfd_dev_mapping))
+               cmpxchg(&kfd_dev_mapping, NULL, inode->i_mapping);
+
        is_32bit_user_mode = in_compat_syscall();
if (is_32bit_user_mode) {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index a6ff1db477f9..f037062c33ea 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -399,6 +399,7 @@ enum kfd_mempool {
  /* Character device interface */
  int kfd_chardev_init(void);
  void kfd_chardev_exit(void);
+void kfd_dev_unmap_mapping_range(loff_t const holebegin, loff_t const holelen);
/**
   * enum kfd_unmap_queues_filter - Enum for queue filters.

Reply via email to