Add a fence callback to the VM update and ACK the retry CAM after the VM update is finished. Previously, we would ACK it immediately after calling amdgpu_vm_handle_fault() which caused a race condition that was likely to trigger the same interrupt again, causing the same fault to be handled multiple times.
Signed-off-by: Timur Kristóf <[email protected]> --- drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 28 +++++++++++++++++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h | 8 ++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c | 2 +- 4 files changed, 36 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c index 26aea960e2759..21c8d87477448 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c @@ -545,6 +545,16 @@ void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr, } while (fault->timestamp < tmp); } +static void amdgpu_gmc_retry_fault_handled(struct dma_fence *fence, + struct dma_fence_cb *cb) +{ + struct amdgpu_fence_cb *afc = container_of(cb, struct amdgpu_fence_cb, cb); + struct amdgpu_device *adev = afc->adev; + + /* CAM index is the array index of the current callback struct */ + adev->irq.ih_funcs->retry_cam_ack(adev, afc - &adev->gmc.retry_cb[0]); +} + int amdgpu_gmc_handle_retry_fault(struct amdgpu_device *adev, struct amdgpu_iv_entry *entry, u64 addr, @@ -552,6 +562,7 @@ int amdgpu_gmc_handle_retry_fault(struct amdgpu_device *adev, u32 node_id, bool write_fault) { + struct dma_fence *fence = NULL; int ret; if (adev->irq.retry_cam_enabled) { @@ -564,8 +575,21 @@ int amdgpu_gmc_handle_retry_fault(struct amdgpu_device *adev, } ret = amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id, - addr, entry->timestamp, write_fault, NULL); - adev->irq.ih_funcs->retry_cam_ack(adev, cam_index); + addr, entry->timestamp, write_fault, &fence); + + /* If the update is already done, ACK now, otherwise when it's done. */ + if (fence) { + adev->gmc.retry_cb[cam_index].adev = adev; + + if (dma_fence_add_callback(fence, &adev->gmc.retry_cb[cam_index].cb, + amdgpu_gmc_retry_fault_handled)) + adev->irq.ih_funcs->retry_cam_ack(adev, cam_index); + + dma_fence_put(fence); + } else { + adev->irq.ih_funcs->retry_cam_ack(adev, cam_index); + } + if (ret) return 1; } else { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h index 77eb153802845..3bfb06e011a86 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h @@ -27,6 +27,7 @@ #define __AMDGPU_GMC_H__ #include <linux/types.h> +#include <linux/dma-fence.h> #include "amdgpu_irq.h" #include "amdgpu_xgmi.h" @@ -214,6 +215,11 @@ struct amdgpu_gmc_memrange { int nid_mask; }; +struct amdgpu_fence_cb { + struct amdgpu_device *adev; + struct dma_fence_cb cb; +}; + enum amdgpu_gart_placement { AMDGPU_GART_PLACEMENT_BEST_FIT = 0, AMDGPU_GART_PLACEMENT_HIGH, @@ -305,6 +311,8 @@ struct amdgpu_gmc { } fault_hash[AMDGPU_GMC_FAULT_HASH_SIZE]; uint64_t last_fault:AMDGPU_GMC_FAULT_RING_ORDER; + struct amdgpu_fence_cb retry_cb[16]; + bool tmz_enabled; bool is_app_apu; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 8c3ba7213eb22..f5e9b97e92a8c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -3035,7 +3035,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, r = amdgpu_vm_update_pdes(adev, vm, true); - *fence = vm->last_update; + *fence = dma_fence_get(vm->last_update); error_unlock: amdgpu_bo_unreserve(root); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c index 2eb64df6daa94..6e28f0e435bf5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c @@ -132,7 +132,7 @@ static int amdgpu_vm_sdma_commit(struct amdgpu_vm_update_params *p, DMA_RESV_USAGE_BOOKKEEP); } - if (fence && !p->immediate) { + if (fence) { /* * Most hw generations now have a separate queue for page table * updates, but when the queue is shared with userspace we need -- 2.53.0
