On Fri, May 8, 2026 at 3:14 AM Jesse Zhang <[email protected]> wrote:
>
> Add IP-agnostic helpers to coordinate a MEC pipe reset across all KCQs on
> the same (me, pipe): prepare (back up siblings, stop schedulers, stop KFD),
> restart_schedulers, and recover_queues (re-init/remap KCQs and run
> amdgpu_ring_reset_helper_end on each — guilty gets the timeout fence,
> siblings get a synthetic context so collateral work is reemitted).
>
> Suggested-by: Manu Rastogi <[email protected]>
> Suggested-by: Alex Deucher <[email protected]>
> Signed-off-by: Jesse Zhang <[email protected]>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 115 ++++++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 11 +++
> 2 files changed, 126 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index 515cc4a2aeb4..8cfb73fda4bb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -69,6 +69,121 @@ void amdgpu_queue_mask_bit_to_mec_queue(struct
> amdgpu_device *adev, int bit,
>
> }
>
> +static bool amdgpu_gfx_ring_on_mec_pipe(struct amdgpu_ring *ring, u32 me,
> u32 pipe)
> +{
> + if (!ring || !ring->funcs || ring->funcs->type !=
> AMDGPU_RING_TYPE_COMPUTE)
> + return false;
> + if (ring->no_scheduler)
> + return false;
> +
> + return ring->me == me && ring->pipe == pipe;
> +}
> +
> +static unsigned int amdgpu_gfx_mec_pipe_compute_ring_base(struct
> amdgpu_device *adev,
> + u32 xcc_id)
> +{
> + int num_xcc = adev->gfx.xcc_mask ? NUM_XCC(adev->gfx.xcc_mask) : 1;
> +
> + if (num_xcc <= 1)
> + return 0;
> + return xcc_id * adev->gfx.num_compute_rings;
> +}
> +
> +void amdgpu_gfx_mec_pipe_reset_prepare(struct amdgpu_device *adev,
> + struct amdgpu_ring *guilty)
> +{
> + struct amdgpu_ring *r;
> + unsigned int j, base;
> +
> + base = amdgpu_gfx_mec_pipe_compute_ring_base(adev, guilty->xcc_id);
> + for (j = 0; j < adev->gfx.num_compute_rings; j++) {
> + r = &adev->gfx.compute_ring[base + j];
> + if (!amdgpu_gfx_ring_on_mec_pipe(r, guilty->me, guilty->pipe))
> + continue;
> + if (r != guilty)
> + amdgpu_ring_backup_unprocessed_commands(r, NULL);
> + if (amdgpu_ring_sched_ready(r))
> + drm_sched_wqueue_stop(&r->sched);
> + }
> +
> + if (adev->kfd.init_complete)
> + amdgpu_amdkfd_stop_sched(adev, guilty->xcc_id);
> +}
> +
> +void amdgpu_gfx_mec_pipe_restart_schedulers(struct amdgpu_device *adev,
> + u32 me, u32 pipe, u32 xcc_id)
> +{
> + struct amdgpu_ring *r;
> + unsigned int j, base;
> +
> + base = amdgpu_gfx_mec_pipe_compute_ring_base(adev, xcc_id);
> + for (j = 0; j < adev->gfx.num_compute_rings; j++) {
> + r = &adev->gfx.compute_ring[base + j];
> + if (!amdgpu_gfx_ring_on_mec_pipe(r, me, pipe))
> + continue;
> + if (amdgpu_ring_sched_ready(r))
> + drm_sched_wqueue_start(&r->sched);
> + }
> +
> + if (adev->kfd.init_complete)
> + amdgpu_amdkfd_start_sched(adev, xcc_id);
> +}
> +
> +int amdgpu_gfx_mec_pipe_reset_recover_queues(struct amdgpu_device *adev,
> + struct amdgpu_ring *guilty,
> + struct amdgpu_fence
> *timedout_fence,
> + amdgpu_gfx_kcq_init_queue_t
> kcq_init)
> +{
> + struct amdgpu_fence collateral_reemit = {};
> + struct amdgpu_ring *r;
> + unsigned int j, base;
> + int err = 0;
> +
> + if (!timedout_fence)
> + return -EINVAL;
> +
> + collateral_reemit.context = (u64)-1;
> +
> + base = amdgpu_gfx_mec_pipe_compute_ring_base(adev, guilty->xcc_id);
> + for (j = 0; j < adev->gfx.num_compute_rings; j++) {
> + r = &adev->gfx.compute_ring[base + j];
> + if (!amdgpu_gfx_ring_on_mec_pipe(r, guilty->me, guilty->pipe))
> + continue;
> +
> + err = kcq_init(r, true);
> + if (err)
> + goto err_sched;
> + err = amdgpu_mes_map_legacy_queue(adev, r, 0);
> + if (err)
> + goto err_sched;
> + }
> +
> + amdgpu_gfx_mec_pipe_restart_schedulers(adev, guilty->me, guilty->pipe,
> + guilty->xcc_id);
> +
> + for (j = 0; j < adev->gfx.num_compute_rings; j++) {
> + r = &adev->gfx.compute_ring[base + j];
> + if (!amdgpu_gfx_ring_on_mec_pipe(r, guilty->me, guilty->pipe))
> + continue;
> +
> + err = amdgpu_ring_reset_helper_end(
> + r, r == guilty ? timedout_fence : &collateral_reemit);
I don't think this will work. We don't actually know which job hung
the pipe, so we can't resubmit all of the jobs. We either need to
discard all jobs (amdgpu_fence_driver_force_completion()) or just the
ones associated with the context of the earliest job on the queue.
Rather than stopping the jobs and doing all of the resets here, it
might be cleaner to have a worker thread which schedules a call to
drm_sched_fault() for all of the affected queues. Then each queue
reset will run sequentially. You need to be careful, however, since
you could get into a loop where each reset ends up scheduling more
resets if we end up falling back to the pipe reset handler for some
reason.

Alex
> + if (err) {
> + dev_err(adev->dev,
> + "ring %s failed recover after MEC pipe reset
> (%d)\n",
> + r->name, err);
> + return err;
> + }
> + }
> +
> + return 0;
> +
> +err_sched:
> + amdgpu_gfx_mec_pipe_restart_schedulers(adev, guilty->me, guilty->pipe,
> + guilty->xcc_id);
> + return err;
> +}
> +
> bool amdgpu_gfx_is_mec_queue_enabled(struct amdgpu_device *adev,
> int xcc_id, int mec, int pipe, int queue)
> {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index 77050f9884f2..38b317b91bbe 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -603,6 +603,17 @@ int amdgpu_gfx_mec_queue_to_bit(struct amdgpu_device
> *adev, int mec,
> int pipe, int queue);
> void amdgpu_queue_mask_bit_to_mec_queue(struct amdgpu_device *adev, int bit,
> int *mec, int *pipe, int *queue);
> +
> +typedef int (*amdgpu_gfx_kcq_init_queue_t)(struct amdgpu_ring *ring, bool
> clear);
> +
> +void amdgpu_gfx_mec_pipe_reset_prepare(struct amdgpu_device *adev,
> + struct amdgpu_ring *guilty);
> +void amdgpu_gfx_mec_pipe_restart_schedulers(struct amdgpu_device *adev,
> + u32 me, u32 pipe, u32 xcc_id);
> +int amdgpu_gfx_mec_pipe_reset_recover_queues(
> + struct amdgpu_device *adev, struct amdgpu_ring *guilty,
> + struct amdgpu_fence *timedout_fence,
> + amdgpu_gfx_kcq_init_queue_t kcq_init);
> bool amdgpu_gfx_is_mec_queue_enabled(struct amdgpu_device *adev, int xcc_id,
> int mec, int pipe, int queue);
> bool amdgpu_gfx_is_high_priority_compute_queue(struct amdgpu_device *adev,
> --
> 2.49.0
>