On Thu, May 7, 2026 at 10:54 AM Christian König <[email protected]> wrote: > > On 5/7/26 16:09, Yifan Zhang wrote: > > During Mode 1 reset, the ASIC undergoes a reset cycle and becomes > > temporarily > > inaccessible via PCIe. Any attempt to access framebuffer or MMIO registers > > during > > this window can result in uncompleted PCIe transactions, leading to NMI > > panics or > > system hangs. > > > > To prevent this, Unmap all of the applications mappings of the framebuffer > > and doorbell BARs before mode1 reset. Also prevent new mappings from coming > > in > > during the reset process. > > I think putting the inode into the kfd_dev would break userspace, so that is > a no-go and needs a different solution. > > But apart from that the patch looks like what I suggested before. > > Question is since when does that issue exists? Previously we didn't had to > take care of that.
It's always existed, I guess generally there are not often CPU accesses to VRAM during a reset. It depends on the platform and what level of PCIe error handling it enables. Alex > > Regards, > Christian. > > > > > Signed-off-by: Yifan Zhang <[email protected]> > > --- > > drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 1 + > > .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 12 ++++++++++++ > > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 ++++++- > > drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 17 +++++++++++++++-- > > drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 10 ++++++++++ > > drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 3 +++ > > 6 files changed, 47 insertions(+), 3 deletions(-) > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h > > index 2bf6a31c194d..5333e052d56d 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h > > @@ -360,6 +360,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct > > amdgpu_device *adev, > > uint64_t size, u32 alloc_flag, int8_t xcp_id); > > void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev, > > uint64_t size, u32 alloc_flag, int8_t xcp_id); > > +void amdgpu_amdkfd_clear_kfd_mapping(struct amdgpu_device *adev); > > > > u64 amdgpu_amdkfd_xcp_memory_size(struct amdgpu_device *adev, int xcp_id); > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > > index 7c01492e69dd..3ac2bd86c08b 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c > > @@ -139,6 +139,18 @@ void amdgpu_amdkfd_reserve_system_mem(uint64_t size) > > kfd_mem_limit.system_mem_used += size; > > } > > > > +void amdgpu_amdkfd_clear_kfd_mapping(struct amdgpu_device *adev) > > +{ > > + if (adev->kfd.dev && adev->kfd.dev->inode && > > + adev->kfd.dev->inode->i_mapping) { > > + unmap_mapping_range(adev->kfd.dev->inode->i_mapping, > > + KFD_MMAP_TYPE_DOORBELL, > > kfd_doorbell_process_slice(adev->kfd.dev), 1); > > + unmap_mapping_range(adev->kfd.dev->inode->i_mapping, > > + KFD_MMAP_TYPE_MMIO, PAGE_SIZE, 1); > > + } > > +} > > + > > + > > /* Estimate page table size needed to represent a given memory size > > * > > * With 4KB pages, we need one 8 byte PTE for each 4KB of memory > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > index 073f632f295a..c67936d1fb0d 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > @@ -5019,7 +5019,6 @@ int amdgpu_device_mode1_reset(struct amdgpu_device > > *adev) > > > > /* disable BM */ > > pci_clear_master(adev->pdev); > > - > > if (amdgpu_dpm_is_mode1_reset_supported(adev)) { > > dev_info(adev->dev, "GPU smu mode1 reset\n"); > > ret = amdgpu_dpm_mode1_reset(adev); > > @@ -5840,6 +5839,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device > > *adev, > > /* We need to lock reset domain only once both for XGMI and single > > device */ > > amdgpu_device_recovery_get_reset_lock(adev, &device_list); > > > > + /* unmap all the mappings of doorbell and framebuffer to prevent user > > space from > > + * accessing them > > + */ > > + unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1); > > + amdgpu_amdkfd_clear_kfd_mapping(adev); > > + > > amdgpu_device_halt_activities(adev, job, reset_context, &device_list, > > hive, need_emergency_restart); > > if (need_emergency_restart) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c > > index 0071d6957828..1dd343f0219f 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c > > @@ -46,6 +46,7 @@ > > #include "amdgpu_hmm.h" > > #include "amdgpu_xgmi.h" > > #include "amdgpu_vm.h" > > +#include "amdgpu_reset.h" > > > > static int > > amdgpu_gem_add_input_fence(struct drm_file *filp, > > @@ -118,13 +119,21 @@ amdgpu_gem_update_timeline_node(struct drm_file *filp, > > static vm_fault_t amdgpu_gem_fault(struct vm_fault *vmf) > > { > > struct ttm_buffer_object *bo = vmf->vma->vm_private_data; > > + struct amdgpu_device *adev = amdgpu_ttm_adev(bo->bdev); > > struct drm_device *ddev = bo->base.dev; > > vm_fault_t ret; > > int idx; > > > > + /* Prevent new mappings from coming in during reset */ > > + > > + if (!down_read_trylock(&adev->reset_domain->sem)) > > + return VM_FAULT_SIGSEGV; > > + > > ret = ttm_bo_vm_reserve(bo, vmf); > > - if (ret) > > + if (ret) { > > + up_read(&adev->reset_domain->sem); > > return ret; > > + } > > > > if (drm_dev_enter(ddev, &idx)) { > > ret = amdgpu_bo_fault_reserve_notify(bo); > > @@ -140,11 +149,15 @@ static vm_fault_t amdgpu_gem_fault(struct vm_fault > > *vmf) > > } else { > > ret = ttm_bo_vm_dummy_page(vmf, vmf->vma->vm_page_prot); > > } > > - if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) > > + if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) > > { > > + up_read(&adev->reset_domain->sem); > > return ret; > > + } > > > > unlock: > > dma_resv_unlock(bo->base.resv); > > + up_read(&adev->reset_domain->sem); > > + > > return ret; > > } > > > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > > b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > > index 84b9bde7f371..d5bf5ba73ca0 100644 > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > > @@ -130,11 +130,21 @@ void kfd_chardev_exit(void) > > static int kfd_open(struct inode *inode, struct file *filep) > > { > > struct kfd_process *process; > > + struct kfd_node *node; > > bool is_32bit_user_mode; > > + int i; > > > > if (iminor(inode) != 0) > > return -ENODEV; > > > > + /* Save inode in kfd_dev for unmap_mapping_range */ > > + for (i = 0; kfd_topology_enum_kfd_devices(i, &node) == 0; i++) { > > + if (node && node->kfd && !node->kfd->inode) { > > + node->kfd->inode = inode; > > + break; > > + } > > + } > > + > > is_32bit_user_mode = in_compat_syscall(); > > > > if (is_32bit_user_mode) { > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > > b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > > index a6ff1db477f9..bfdd7d3bc7cb 100644 > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > > @@ -373,6 +373,9 @@ struct kfd_dev { > > > > struct workqueue_struct *ih_wq; > > > > + /* kfd inode */ > > + struct inode *inode; > > + > > /* Kernel doorbells for KFD device */ > > struct amdgpu_bo *doorbells; > > >
