During Mode 1 reset, the ASIC goes through a reset cycle and becomes temporarily inaccessible via PCIe. Any attempt to access the framebuffer or MMIO registers during this window can result in uncompleted PCIe transactions, leading to NMI panics or system hangs.
To prevent this, unmap all of the applications' mappings of the framebuffer and doorbell BARs before mode1 reset. Also prevent new mappings from coming in during the reset process. Signed-off-by: Yifan Zhang <[email protected]> --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 1 + .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 12 ++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 ++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 17 +++++++++++++++-- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 10 ++++++++++ drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 3 +++ 6 files changed, 47 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 2bf6a31c194d..5333e052d56d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -360,6 +360,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev, uint64_t size, u32 alloc_flag, int8_t xcp_id); void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev, uint64_t size, u32 alloc_flag, int8_t xcp_id); +void amdgpu_amdkfd_clear_kfd_mapping(struct amdgpu_device *adev); u64 amdgpu_amdkfd_xcp_memory_size(struct amdgpu_device *adev, int xcp_id); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 7c01492e69dd..3ac2bd86c08b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -139,6 +139,18 @@ void amdgpu_amdkfd_reserve_system_mem(uint64_t size) kfd_mem_limit.system_mem_used += size; } +void amdgpu_amdkfd_clear_kfd_mapping(struct amdgpu_device *adev) +{ + if (adev->kfd.dev && adev->kfd.dev->inode && + adev->kfd.dev->inode->i_mapping) { + unmap_mapping_range(adev->kfd.dev->inode->i_mapping, + KFD_MMAP_TYPE_DOORBELL, kfd_doorbell_process_slice(adev->kfd.dev), 1); + unmap_mapping_range(adev->kfd.dev->inode->i_mapping, + KFD_MMAP_TYPE_MMIO, PAGE_SIZE, 1); + } +} + 
+ /* Estimate page table size needed to represent a given memory size * * With 4KB pages, we need one 8 byte PTE for each 4KB of memory diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 073f632f295a..c67936d1fb0d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5019,7 +5019,6 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev) /* disable BM */ pci_clear_master(adev->pdev); - if (amdgpu_dpm_is_mode1_reset_supported(adev)) { dev_info(adev->dev, "GPU smu mode1 reset\n"); ret = amdgpu_dpm_mode1_reset(adev); @@ -5840,6 +5839,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, /* We need to lock reset domain only once both for XGMI and single device */ amdgpu_device_recovery_get_reset_lock(adev, &device_list); + /* unmap all the mappings of doorbell and framebuffer to prevent user space from + * accessing them + */ + unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); + amdgpu_amdkfd_clear_kfd_mapping(adev); + amdgpu_device_halt_activities(adev, job, reset_context, &device_list, hive, need_emergency_restart); if (need_emergency_restart) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c index 0071d6957828..1dd343f0219f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c @@ -46,6 +46,7 @@ #include "amdgpu_hmm.h" #include "amdgpu_xgmi.h" #include "amdgpu_vm.h" +#include "amdgpu_reset.h" static int amdgpu_gem_add_input_fence(struct drm_file *filp, @@ -118,13 +119,21 @@ amdgpu_gem_update_timeline_node(struct drm_file *filp, static vm_fault_t amdgpu_gem_fault(struct vm_fault *vmf) { struct ttm_buffer_object *bo = vmf->vma->vm_private_data; + struct amdgpu_device *adev = amdgpu_ttm_adev(bo->bdev); struct drm_device *ddev = bo->base.dev; vm_fault_t ret; int idx; + /* Prevent new mappings from coming in during reset */ + + if 
(!down_read_trylock(&adev->reset_domain->sem)) + return VM_FAULT_SIGSEGV; + ret = ttm_bo_vm_reserve(bo, vmf); - if (ret) + if (ret) { + up_read(&adev->reset_domain->sem); return ret; + } if (drm_dev_enter(ddev, &idx)) { ret = amdgpu_bo_fault_reserve_notify(bo); @@ -140,11 +149,15 @@ static vm_fault_t amdgpu_gem_fault(struct vm_fault *vmf) } else { ret = ttm_bo_vm_dummy_page(vmf, vmf->vma->vm_page_prot); } - if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) + if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) { + up_read(&adev->reset_domain->sem); return ret; + } unlock: dma_resv_unlock(bo->base.resv); + up_read(&adev->reset_domain->sem); + return ret; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 84b9bde7f371..d5bf5ba73ca0 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -130,11 +130,21 @@ void kfd_chardev_exit(void) static int kfd_open(struct inode *inode, struct file *filep) { struct kfd_process *process; + struct kfd_node *node; bool is_32bit_user_mode; + int i; if (iminor(inode) != 0) return -ENODEV; + /* Save inode in kfd_dev for unmap_mapping_range */ + for (i = 0; kfd_topology_enum_kfd_devices(i, &node) == 0; i++) { + if (node && node->kfd && !node->kfd->inode) { + node->kfd->inode = inode; + break; + } + } + is_32bit_user_mode = in_compat_syscall(); if (is_32bit_user_mode) { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index a6ff1db477f9..bfdd7d3bc7cb 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -373,6 +373,9 @@ struct kfd_dev { struct workqueue_struct *ih_wq; + /* kfd inode */ + struct inode *inode; + /* Kernel doorbells for KFD device */ struct amdgpu_bo *doorbells; -- 2.43.0
