Add IP-agnostic helpers to coordinate a MEC pipe reset across all KCQs
that share the same (me, pipe):

- amdgpu_gfx_mec_pipe_reset_prepare(): back up the unprocessed commands
  on the sibling rings, stop their GPU schedulers, and stop KFD
  scheduling on the affected XCC.
- amdgpu_gfx_mec_pipe_restart_schedulers(): restart the GPU schedulers
  and KFD scheduling once the pipe reset is done.
- amdgpu_gfx_mec_pipe_reset_recover_queues(): re-initialize and remap
  every KCQ on the pipe, then run amdgpu_ring_reset_helper_end() on
  each ring. The guilty ring is handed the timed-out fence; the sibling
  rings are handed a synthetic fence context so that their backed-up
  (collateral) work is re-emitted.

The intended call order for an IP-specific reset handler is sketched
below.
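For illustration only, an IP-specific KCQ reset callback could wire the
three helpers together roughly as follows. This is a minimal sketch, not
part of this patch: gfx_vN_reset_kcq(), gfx_vN_reset_hw_pipe() and
gfx_vN_kcq_init_queue() are hypothetical stand-ins for whatever reset
callback and KCQ init routine the IP block actually provides.

static int gfx_vN_reset_kcq(struct amdgpu_ring *ring, unsigned int vmid,
			    struct amdgpu_fence *timedout_fence)
{
	struct amdgpu_device *adev = ring->adev;
	int r;

	/* Quiesce every KCQ on the same (me, pipe), plus KFD. */
	amdgpu_gfx_mec_pipe_reset_prepare(adev, ring);

	/* Hypothetical IP-specific hardware reset of the pipe. */
	r = gfx_vN_reset_hw_pipe(adev, ring->me, ring->pipe, ring->xcc_id);
	if (r) {
		/* Reset failed: let the schedulers run again and give up. */
		amdgpu_gfx_mec_pipe_restart_schedulers(adev, ring->me,
						       ring->pipe,
						       ring->xcc_id);
		return r;
	}

	/*
	 * Re-init and remap all KCQs on the pipe, restart scheduling,
	 * then finish the reset on each ring: the guilty ring gets the
	 * timed-out fence, the siblings re-emit their backed-up work.
	 */
	return amdgpu_gfx_mec_pipe_reset_recover_queues(adev, ring,
							timedout_fence,
							gfx_vN_kcq_init_queue);
}

Passing the per-IP KCQ init routine through the amdgpu_gfx_kcq_init_queue_t
callback is what keeps the recovery loop itself IP-agnostic.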
Suggested-by: Manu Rastogi <[email protected]>
Suggested-by: Alex Deucher <[email protected]>
Signed-off-by: Jesse Zhang <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 115 ++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h |  11 +++
 2 files changed, 126 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 515cc4a2aeb4..8cfb73fda4bb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -69,6 +69,121 @@ void amdgpu_queue_mask_bit_to_mec_queue(struct amdgpu_device *adev, int bit,
 }
 
+static bool amdgpu_gfx_ring_on_mec_pipe(struct amdgpu_ring *ring, u32 me, u32 pipe)
+{
+	if (!ring || !ring->funcs || ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE)
+		return false;
+	if (ring->no_scheduler)
+		return false;
+
+	return ring->me == me && ring->pipe == pipe;
+}
+
+static unsigned int amdgpu_gfx_mec_pipe_compute_ring_base(struct amdgpu_device *adev,
+							  u32 xcc_id)
+{
+	int num_xcc = adev->gfx.xcc_mask ? NUM_XCC(adev->gfx.xcc_mask) : 1;
+
+	if (num_xcc <= 1)
+		return 0;
+	return xcc_id * adev->gfx.num_compute_rings;
+}
+
+void amdgpu_gfx_mec_pipe_reset_prepare(struct amdgpu_device *adev,
+				       struct amdgpu_ring *guilty)
+{
+	struct amdgpu_ring *r;
+	unsigned int j, base;
+
+	base = amdgpu_gfx_mec_pipe_compute_ring_base(adev, guilty->xcc_id);
+	for (j = 0; j < adev->gfx.num_compute_rings; j++) {
+		r = &adev->gfx.compute_ring[base + j];
+		if (!amdgpu_gfx_ring_on_mec_pipe(r, guilty->me, guilty->pipe))
+			continue;
+		if (r != guilty)
+			amdgpu_ring_backup_unprocessed_commands(r, NULL);
+		if (amdgpu_ring_sched_ready(r))
+			drm_sched_wqueue_stop(&r->sched);
+	}
+
+	if (adev->kfd.init_complete)
+		amdgpu_amdkfd_stop_sched(adev, guilty->xcc_id);
+}
+
+void amdgpu_gfx_mec_pipe_restart_schedulers(struct amdgpu_device *adev,
+					    u32 me, u32 pipe, u32 xcc_id)
+{
+	struct amdgpu_ring *r;
+	unsigned int j, base;
+
+	base = amdgpu_gfx_mec_pipe_compute_ring_base(adev, xcc_id);
+	for (j = 0; j < adev->gfx.num_compute_rings; j++) {
+		r = &adev->gfx.compute_ring[base + j];
+		if (!amdgpu_gfx_ring_on_mec_pipe(r, me, pipe))
+			continue;
+		if (amdgpu_ring_sched_ready(r))
+			drm_sched_wqueue_start(&r->sched);
+	}
+
+	if (adev->kfd.init_complete)
+		amdgpu_amdkfd_start_sched(adev, xcc_id);
+}
+
+int amdgpu_gfx_mec_pipe_reset_recover_queues(struct amdgpu_device *adev,
+					     struct amdgpu_ring *guilty,
+					     struct amdgpu_fence *timedout_fence,
+					     amdgpu_gfx_kcq_init_queue_t kcq_init)
+{
+	struct amdgpu_fence collateral_reemit = {};
+	struct amdgpu_ring *r;
+	unsigned int j, base;
+	int err = 0;
+
+	if (!timedout_fence)
+		return -EINVAL;
+
+	collateral_reemit.context = (u64)-1;
+
+	base = amdgpu_gfx_mec_pipe_compute_ring_base(adev, guilty->xcc_id);
+	for (j = 0; j < adev->gfx.num_compute_rings; j++) {
+		r = &adev->gfx.compute_ring[base + j];
+		if (!amdgpu_gfx_ring_on_mec_pipe(r, guilty->me, guilty->pipe))
+			continue;
+
+		err = kcq_init(r, true);
+		if (err)
+			goto err_sched;
+		err = amdgpu_mes_map_legacy_queue(adev, r, 0);
+		if (err)
+			goto err_sched;
+	}
+
+	amdgpu_gfx_mec_pipe_restart_schedulers(adev, guilty->me, guilty->pipe,
+					       guilty->xcc_id);
+
+	for (j = 0; j < adev->gfx.num_compute_rings; j++) {
+		r = &adev->gfx.compute_ring[base + j];
+		if (!amdgpu_gfx_ring_on_mec_pipe(r, guilty->me, guilty->pipe))
+			continue;
+
+		err = amdgpu_ring_reset_helper_end(
+			r, r == guilty ? timedout_fence : &collateral_reemit);
+		if (err) {
+			dev_err(adev->dev,
+				"ring %s failed to recover after MEC pipe reset (%d)\n",
+				r->name, err);
+			return err;
+		}
+	}
+
+	return 0;
+
+err_sched:
+	amdgpu_gfx_mec_pipe_restart_schedulers(adev, guilty->me, guilty->pipe,
+					       guilty->xcc_id);
+	return err;
+}
+
 bool amdgpu_gfx_is_mec_queue_enabled(struct amdgpu_device *adev,
 				     int xcc_id, int mec, int pipe, int queue)
 {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index 77050f9884f2..38b317b91bbe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -603,6 +603,17 @@ int amdgpu_gfx_mec_queue_to_bit(struct amdgpu_device *adev, int mec,
 				int pipe, int queue);
 void amdgpu_queue_mask_bit_to_mec_queue(struct amdgpu_device *adev, int bit,
 					int *mec, int *pipe, int *queue);
+
+typedef int (*amdgpu_gfx_kcq_init_queue_t)(struct amdgpu_ring *ring, bool clear);
+
+void amdgpu_gfx_mec_pipe_reset_prepare(struct amdgpu_device *adev,
+				       struct amdgpu_ring *guilty);
+void amdgpu_gfx_mec_pipe_restart_schedulers(struct amdgpu_device *adev,
+					    u32 me, u32 pipe, u32 xcc_id);
+int amdgpu_gfx_mec_pipe_reset_recover_queues(
+	struct amdgpu_device *adev, struct amdgpu_ring *guilty,
+	struct amdgpu_fence *timedout_fence,
+	amdgpu_gfx_kcq_init_queue_t kcq_init);
 bool amdgpu_gfx_is_mec_queue_enabled(struct amdgpu_device *adev, int xcc_id,
 				     int mec, int pipe, int queue);
 bool amdgpu_gfx_is_high_priority_compute_queue(struct amdgpu_device *adev,
-- 
2.49.0
