From: Vitaly Prosyak <[email protected]>

Add amdgpu_ctx_mgr_wait_idle() which waits for all outstanding HW
fences to signal before proceeding with VM page table destruction.

This fixes a race condition reproduced by the IGT test amd_close_race
(subtests: close-race-low, close-race-medium, close-race-high). The
test spawns multiple threads that submit GPU work and immediately
close the fd, stressing the window between job dispatch and file
teardown.

Without this fix, close(fd) tears down VM page tables via
amdgpu_vm_fini() while the GPU is still fetching indirect buffers
from those pages. The existing amdgpu_flush() path only waits for
scheduler entities to drain (drm_sched_entity_flush), but this does
not guarantee hardware completion. Jobs that have already been
dispatched to HW rings continue executing against freed page tables,
causing illegal opcode interrupts (the GPU reads garbage from freed
memory).

The fix inserts amdgpu_ctx_mgr_wait_idle() in amdgpu_drm_release(),
before drm_release() tears down GEM objects and page tables. It
iterates all contexts and waits on the last submitted fence per
entity, ensuring hardware has completed all accesses to VM page
tables before they are freed.

Cc: Christian König <[email protected]>
Cc: Alex Deucher <[email protected]>
Signed-off-by: Vitaly Prosyak <[email protected]>
Change-Id: I33fea5eddaddf1a246d21293d784cc6b7f544541
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 44 +++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  5 +++
 3 files changed, 50 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
index c273557fb1ae..7fc14b7b4374 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -957,6 +957,50 @@ static void amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr)
        }
 }
 
+/**
+ * amdgpu_ctx_mgr_wait_idle - wait for the last fence of every entity
+ *
+ * @mgr: context manager
+ *
+ * Walk every context in @mgr and, for each entity, wait uninterruptibly on
+ * the fence in the slot of its most recent sequence number. Meant to run
+ * before VM teardown so the GPU no longer accesses page tables being freed.
+ */
+void amdgpu_ctx_mgr_wait_idle(struct amdgpu_ctx_mgr *mgr)
+{
+       struct amdgpu_ctx *ctx;
+       uint32_t id, i, j;
+
+       mutex_lock(&mgr->lock);
+       idr_for_each_entry(&mgr->ctx_handles, ctx, id) {
+               for (i = 0; i < AMDGPU_HW_IP_NUM; ++i) {
+                       for (j = 0; j < amdgpu_ctx_num_entities[i]; ++j) {
+                               struct amdgpu_ctx_entity *centity;
+                               struct dma_fence *fence = NULL;
+
+                               centity = ctx->entities[i][j];
+                               if (!centity)
+                                       continue;
+
+                               /* Reference the newest fence under ring_lock. */
+                               spin_lock(&ctx->ring_lock);
+                               if (centity->sequence)
+                                       fence = dma_fence_get(
+                                               centity->fences[(centity->sequence - 1) &
+                                                               (amdgpu_sched_jobs - 1)]);
+                               spin_unlock(&ctx->ring_lock);
+
+                               /* May sleep: spinlock dropped, mgr->lock still held. */
+                               if (fence) {
+                                       dma_fence_wait(fence, false);
+                                       dma_fence_put(fence);
+                               }
+                       }
+               }
+       }
+       mutex_unlock(&mgr->lock);
+}
+
 void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr)
 {
        amdgpu_ctx_mgr_entity_fini(mgr);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
index cf8d700a22fe..0ea86235d0df 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
@@ -92,6 +92,7 @@ int amdgpu_ctx_wait_prev_fence(struct amdgpu_ctx *ctx,
 void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr *mgr,
                         struct amdgpu_device *adev);
 long amdgpu_ctx_mgr_entity_flush(struct amdgpu_ctx_mgr *mgr, long timeout);
+void amdgpu_ctx_mgr_wait_idle(struct amdgpu_ctx_mgr *mgr);
 void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr);
 void amdgpu_ctx_mgr_usage(struct amdgpu_ctx_mgr *mgr,
                          ktime_t usage[AMDGPU_HW_IP_NUM]);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 4d4d21babc61..6089bc30c915 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2972,6 +2972,11 @@ static int amdgpu_drm_release(struct inode *inode, struct file *filp)
                drm_dev_exit(idx);
        }
 
+
+       /* Wait for all HW fences before drm_release tears down GEM/page tables */
+       if (fpriv)
+               amdgpu_ctx_mgr_wait_idle(&fpriv->ctx_mgr);
+
        return drm_release(inode, filp);
 }
 
-- 
2.54.0

Reply via email to