amdgpu: set an error on all fences from a bad context

Alex Deucher Mon, 15 Sep 2025 10:22:45 -0700

When we back up ring contents to re-emit after a queue reset,
we don't back up ring contents from the bad context.  When
we signal the fences, we should set an error on the bad
context's fences as well.


v2: misc cleanups
v3: add locking for fence error, fix comment (Christian)
v4: fix wrap around, locking (Christian)

Fixes: 77cc0da39c7c ("drm/amdgpu: track ring state associated with a fence")
Signed-off-by: Alex Deucher <alexander.deuc...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 39 ++++++++++++++++++++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c  |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h  |  2 +-
 3 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index e270df30c2790..18a7829122d24 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -758,11 +758,42 @@ void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring)
  * @fence: fence of the ring to signal
  *
  */
-void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *fence)
+void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *af)
 {
-       dma_fence_set_error(&fence->base, -ETIME);
-       amdgpu_fence_write(fence->ring, fence->seq);
-       amdgpu_fence_process(fence->ring);
+       struct dma_fence *unprocessed;
+       struct dma_fence __rcu **ptr;
+       struct amdgpu_fence *fence;
+       struct amdgpu_ring *ring = af->ring;
+       unsigned long flags;
+       u32 seq, last_seq;
+
+       last_seq = amdgpu_fence_read(ring) & ring->fence_drv.num_fences_mask;
+       seq = ring->fence_drv.sync_seq & ring->fence_drv.num_fences_mask;
+
+       /* mark all fences from the guilty context with an error */
+       spin_lock_irqsave(&ring->fence_drv.lock, flags);
+       do {
+               last_seq++;
+               last_seq &= ring->fence_drv.num_fences_mask;
+
+               ptr = &ring->fence_drv.fences[last_seq];
+               rcu_read_lock();
+               unprocessed = rcu_dereference(*ptr);
+
+               if (unprocessed && !dma_fence_is_signaled_locked(unprocessed)) {
+                       fence = container_of(unprocessed, struct amdgpu_fence, base);
+
+                       if (fence == af)
+                               dma_fence_set_error(&fence->base, -ETIME);
+                       else if (fence->context == af->context)
+                               dma_fence_set_error(&fence->base, -ECANCELED);
+               }
+               rcu_read_unlock();
+       } while (last_seq != seq);
+       spin_unlock_irqrestore(&ring->fence_drv.lock, flags);
+       /* signal the guilty fence */
+       amdgpu_fence_write(ring, af->seq);
+       amdgpu_fence_process(ring);
 }
 
 void amdgpu_fence_save_wptr(struct dma_fence *fence)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 8f6ce948c6841..5ec5c3ff22bb0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -811,7 +811,7 @@ int amdgpu_ring_reset_helper_end(struct amdgpu_ring *ring,
        if (r)
                return r;
 
-       /* signal the fence of the bad job */
+       /* signal the guilty fence and set an error on all fences from the context */
        if (guilty_fence)
                amdgpu_fence_driver_guilty_force_completion(guilty_fence);
        /* Re-emit the non-guilty commands */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index b6b6491797761..4b46e3c26ff39 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -155,7 +155,7 @@ extern const struct drm_sched_backend_ops amdgpu_sched_ops;
 void amdgpu_fence_driver_clear_job_fences(struct amdgpu_ring *ring);
 void amdgpu_fence_driver_set_error(struct amdgpu_ring *ring, int error);
 void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring);
-void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *fence);
+void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *af);
 void amdgpu_fence_save_wptr(struct dma_fence *fence);
 
 int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring);
-- 
2.51.0

[PATCH 2/3] drm/amdgpu: set an error on all fences from a bad context

Reply via email to