drm/amdgpu: ACK the retry CAM after VM update finishes

Timur Kristóf Fri, 29 May 2026 03:31:11 -0700

Add a fence callback to the VM update and ACK the retry CAM
only after the VM update has finished. Previously, we ACKed it
immediately after calling amdgpu_vm_handle_fault(), before the
update had necessarily completed. This caused a race condition
that was likely to trigger the same interrupt again, so the
same fault was handled multiple times.


Signed-off-by: Timur Kristóf <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c     | 28 +++++++++++++++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h     |  8 ++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c      |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c |  2 +-
 4 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 26aea960e2759..21c8d87477448 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -545,6 +545,16 @@ void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t addr,
        } while (fault->timestamp < tmp);
 }
 
+static void amdgpu_gmc_retry_fault_handled(struct dma_fence *fence,
+                                          struct dma_fence_cb *cb)
+{
+       struct amdgpu_fence_cb *afc = container_of(cb, struct amdgpu_fence_cb, 
cb);
+       struct amdgpu_device *adev = afc->adev;
+
+       /* CAM index is the array index of the current callback struct */
+       adev->irq.ih_funcs->retry_cam_ack(adev, afc - &adev->gmc.retry_cb[0]);
+}
+
 int amdgpu_gmc_handle_retry_fault(struct amdgpu_device *adev,
                                  struct amdgpu_iv_entry *entry,
                                  u64 addr,
@@ -552,6 +562,7 @@ int amdgpu_gmc_handle_retry_fault(struct amdgpu_device *adev,
                                  u32 node_id,
                                  bool write_fault)
 {
+       struct dma_fence *fence = NULL;
        int ret;
 
        if (adev->irq.retry_cam_enabled) {
@@ -564,8 +575,21 @@ int amdgpu_gmc_handle_retry_fault(struct amdgpu_device *adev,
                }
 
                ret = amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, 
node_id,
-                                            addr, entry->timestamp, 
write_fault, NULL);
-               adev->irq.ih_funcs->retry_cam_ack(adev, cam_index);
+                                            addr, entry->timestamp, 
write_fault, &fence);
+
+               /* If the update is already done, ACK now, otherwise when it's 
done. */
+               if (fence) {
+                       adev->gmc.retry_cb[cam_index].adev = adev;
+
+                       if (dma_fence_add_callback(fence, 
&adev->gmc.retry_cb[cam_index].cb,
+                                                  
amdgpu_gmc_retry_fault_handled))
+                               adev->irq.ih_funcs->retry_cam_ack(adev, 
cam_index);
+
+                       dma_fence_put(fence);
+               } else {
+                       adev->irq.ih_funcs->retry_cam_ack(adev, cam_index);
+               }
+
                if (ret)
                        return 1;
        } else {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index 77eb153802845..3bfb06e011a86 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -27,6 +27,7 @@
 #define __AMDGPU_GMC_H__
 
 #include <linux/types.h>
+#include <linux/dma-fence.h>
 
 #include "amdgpu_irq.h"
 #include "amdgpu_xgmi.h"
@@ -214,6 +215,11 @@ struct amdgpu_gmc_memrange {
        int nid_mask;
 };
 
+struct amdgpu_fence_cb {
+       struct amdgpu_device *adev;
+       struct dma_fence_cb cb;
+};
+
 enum amdgpu_gart_placement {
        AMDGPU_GART_PLACEMENT_BEST_FIT = 0,
        AMDGPU_GART_PLACEMENT_HIGH,
@@ -305,6 +311,8 @@ struct amdgpu_gmc {
        } fault_hash[AMDGPU_GMC_FAULT_HASH_SIZE];
        uint64_t                last_fault:AMDGPU_GMC_FAULT_RING_ORDER;
 
+       struct amdgpu_fence_cb retry_cb[16];
+
        bool tmz_enabled;
        bool is_app_apu;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 8c3ba7213eb22..f5e9b97e92a8c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -3035,7 +3035,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
 
        r = amdgpu_vm_update_pdes(adev, vm, true);
 
-       *fence = vm->last_update;
+       *fence = dma_fence_get(vm->last_update);
 
 error_unlock:
        amdgpu_bo_unreserve(root);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
index 2eb64df6daa94..6e28f0e435bf5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
@@ -132,7 +132,7 @@ static int amdgpu_vm_sdma_commit(struct amdgpu_vm_update_params *p,
                                   DMA_RESV_USAGE_BOOKKEEP);
        }
 
-       if (fence && !p->immediate) {
+       if (fence) {
                /*
                 * Most hw generations now have a separate queue for page table
                 * updates, but when the queue is shared with userspace we need
-- 
2.53.0

[PATCH 2/7] drm/amdgpu: ACK the retry CAM after VM update finishes

Reply via email to