On 29/05/2026 11:30, Timur Kristóf wrote:
Add a fence callback to the VM update and ACK the retry CAM
after the VM update is finished. Previously, we would ACK it
immediately after calling amdgpu_vm_handle_fault() which
caused a race condition that was likely to trigger the same
interrupt again, causing the same fault to be handled
multiple times.
Signed-off-by: Timur Kristóf <[email protected]>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 28 +++++++++++++++++++--
drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h | 8 ++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 2 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c | 2 +-
4 files changed, 36 insertions(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 26aea960e2759..21c8d87477448 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -545,6 +545,16 @@ void amdgpu_gmc_filter_faults_remove(struct amdgpu_device
*adev, uint64_t addr,
} while (fault->timestamp < tmp);
}
+static void amdgpu_gmc_retry_fault_handled(struct dma_fence *fence,
+ struct dma_fence_cb *cb)
+{
+ struct amdgpu_fence_cb *afc = container_of(cb, struct amdgpu_fence_cb,
cb);
+ struct amdgpu_device *adev = afc->adev;
+
+ /* CAM index is the array index of the current callback struct */
+ adev->irq.ih_funcs->retry_cam_ack(adev, afc - &adev->gmc.retry_cb[0]);
Is the "afc - &adev->gmc.retry_cb[0]" part correct? It will be the index
of the array element, while ->retry_cam_ack() expects the content of
that element, no?
+}
+
int amdgpu_gmc_handle_retry_fault(struct amdgpu_device *adev,
struct amdgpu_iv_entry *entry,
u64 addr,
@@ -552,6 +562,7 @@ int amdgpu_gmc_handle_retry_fault(struct amdgpu_device
*adev,
u32 node_id,
bool write_fault)
{
+ struct dma_fence *fence = NULL;
int ret;
if (adev->irq.retry_cam_enabled) {
@@ -564,8 +575,21 @@ int amdgpu_gmc_handle_retry_fault(struct amdgpu_device
*adev,
}
ret = amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,
- addr, entry->timestamp,
write_fault, NULL);
- adev->irq.ih_funcs->retry_cam_ack(adev, cam_index);
+ addr, entry->timestamp, write_fault,
&fence);
+
+ /* If the update is already done, ACK now, otherwise when it's
done. */
+ if (fence) {
+ adev->gmc.retry_cb[cam_index].adev = adev;
Why is 16 retry_cb elements enough? I see in the code cam_index extraced
from the IV entry with a mask such as 0x3ff.
+
+ if (dma_fence_add_callback(fence,
&adev->gmc.retry_cb[cam_index].cb,
+
amdgpu_gmc_retry_fault_handled))
+ adev->irq.ih_funcs->retry_cam_ack(adev,
cam_index);
+
+ dma_fence_put(fence);
+ } else {
+ adev->irq.ih_funcs->retry_cam_ack(adev, cam_index);
+ }
+
if (ret)
return 1;
} else {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index 77eb153802845..3bfb06e011a86 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -27,6 +27,7 @@
#define __AMDGPU_GMC_H__
#include <linux/types.h>
+#include <linux/dma-fence.h>
#include "amdgpu_irq.h"
#include "amdgpu_xgmi.h"
@@ -214,6 +215,11 @@ struct amdgpu_gmc_memrange {
int nid_mask;
};
+struct amdgpu_fence_cb {
+ struct amdgpu_device *adev;
+ struct dma_fence_cb cb;
+};
+
enum amdgpu_gart_placement {
AMDGPU_GART_PLACEMENT_BEST_FIT = 0,
AMDGPU_GART_PLACEMENT_HIGH,
@@ -305,6 +311,8 @@ struct amdgpu_gmc {
} fault_hash[AMDGPU_GMC_FAULT_HASH_SIZE];
uint64_t last_fault:AMDGPU_GMC_FAULT_RING_ORDER;
+ struct amdgpu_fence_cb retry_cb[16];
+
bool tmz_enabled;
bool is_app_apu;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 8c3ba7213eb22..f5e9b97e92a8c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -3035,7 +3035,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev,
u32 pasid,
r = amdgpu_vm_update_pdes(adev, vm, true);
- *fence = vm->last_update;
+ *fence = dma_fence_get(vm->last_update);
Ah! But passing over since you said you are dropping that patch anyway.
error_unlock:
amdgpu_bo_unreserve(root);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
index 2eb64df6daa94..6e28f0e435bf5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c
@@ -132,7 +132,7 @@ static int amdgpu_vm_sdma_commit(struct
amdgpu_vm_update_params *p,
DMA_RESV_USAGE_BOOKKEEP);
}
- if (fence && !p->immediate) {
+ if (fence) {
Is this deliberate and if so what it is about? Commit message should
explain it as well.
Regards,
Tvrtko
/*
* Most hw generations now have a separate queue for page table
* updates, but when the queue is shared with userspace we need