AMD General

Hi Alex,
> -----Original Message-----
> From: Alex Deucher <[email protected]>
> Sent: Friday, May 8, 2026 9:42 PM
> To: Zhang, Jesse(Jie) <[email protected]>
> Cc: [email protected]; Deucher, Alexander
> <[email protected]>; Koenig, Christian
> <[email protected]>; Rastogi, Manu <[email protected]>
> Subject: Re: [PATCH 1/4] drm/amdgpu: add MEC pipe reset helpers
>
> On Fri, May 8, 2026 at 3:14 AM Jesse Zhang <[email protected]> wrote:
> >
> > Add IP-agnostic helpers to coordinate a MEC pipe reset across all KCQs
> > on the same (me, pipe): prepare (back up siblings, stop schedulers,
> > stop KFD), restart_schedulers, and recover_queues (re-init/remap KCQs
> > and run amdgpu_ring_reset_helper_end on each — guilty gets the timeout
> > fence, siblings get a synthetic context so collateral work is reemitted).
> >
> > Suggested-by: Manu Rastogi <[email protected]>
> > Suggested-by: Alex Deucher <[email protected]>
> > Signed-off-by: Jesse Zhang <[email protected]>
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 115
> > ++++++++++++++++++++++++  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h |
> > 11 +++
> >  2 files changed, 126 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > index 515cc4a2aeb4..8cfb73fda4bb 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > @@ -69,6 +69,121 @@ void amdgpu_queue_mask_bit_to_mec_queue(struct
> > amdgpu_device *adev, int bit,
> >
> >  }
> >
> > +static bool amdgpu_gfx_ring_on_mec_pipe(struct amdgpu_ring *ring, u32
> > +me, u32 pipe) {
> > +       if (!ring || !ring->funcs || ring->funcs->type !=
> AMDGPU_RING_TYPE_COMPUTE)
> > +               return false;
> > +       if (ring->no_scheduler)
> > +               return false;
> > +
> > +       return ring->me == me && ring->pipe == pipe; }
> > +
> > +static unsigned int amdgpu_gfx_mec_pipe_compute_ring_base(struct
> amdgpu_device *adev,
> > +                                                        u32 xcc_id) {
> > +       int num_xcc = adev->gfx.xcc_mask ? NUM_XCC(adev->gfx.xcc_mask)
> > +: 1;
> > +
> > +       if (num_xcc <= 1)
> > +               return 0;
> > +       return xcc_id * adev->gfx.num_compute_rings; }
> > +
> > +void amdgpu_gfx_mec_pipe_reset_prepare(struct amdgpu_device *adev,
> > +                                      struct amdgpu_ring *guilty) {
> > +       struct amdgpu_ring *r;
> > +       unsigned int j, base;
> > +
> > +       base = amdgpu_gfx_mec_pipe_compute_ring_base(adev, guilty->xcc_id);
> > +       for (j = 0; j < adev->gfx.num_compute_rings; j++) {
> > +               r = &adev->gfx.compute_ring[base + j];
> > +               if (!amdgpu_gfx_ring_on_mec_pipe(r, guilty->me, 
> > guilty->pipe))
> > +                       continue;
> > +               if (r != guilty)
> > +                       amdgpu_ring_backup_unprocessed_commands(r, NULL);
> > +               if (amdgpu_ring_sched_ready(r))
> > +                       drm_sched_wqueue_stop(&r->sched);
> > +       }
> > +
> > +       if (adev->kfd.init_complete)
> > +               amdgpu_amdkfd_stop_sched(adev, guilty->xcc_id); }
> > +
> > +void amdgpu_gfx_mec_pipe_restart_schedulers(struct amdgpu_device *adev,
> > +                                           u32 me, u32 pipe, u32
> > +xcc_id) {
> > +       struct amdgpu_ring *r;
> > +       unsigned int j, base;
> > +
> > +       base = amdgpu_gfx_mec_pipe_compute_ring_base(adev, xcc_id);
> > +       for (j = 0; j < adev->gfx.num_compute_rings; j++) {
> > +               r = &adev->gfx.compute_ring[base + j];
> > +               if (!amdgpu_gfx_ring_on_mec_pipe(r, me, pipe))
> > +                       continue;
> > +               if (amdgpu_ring_sched_ready(r))
> > +                       drm_sched_wqueue_start(&r->sched);
> > +       }
> > +
> > +       if (adev->kfd.init_complete)
> > +               amdgpu_amdkfd_start_sched(adev, xcc_id); }
> > +
> > +int amdgpu_gfx_mec_pipe_reset_recover_queues(struct amdgpu_device *adev,
> > +                                            struct amdgpu_ring *guilty,
> > +                                            struct amdgpu_fence 
> > *timedout_fence,
> > +
> > +amdgpu_gfx_kcq_init_queue_t kcq_init) {
> > +       struct amdgpu_fence collateral_reemit = {};
> > +       struct amdgpu_ring *r;
> > +       unsigned int j, base;
> > +       int err = 0;
> > +
> > +       if (!timedout_fence)
> > +               return -EINVAL;
> > +
> > +       collateral_reemit.context = (u64)-1;
> > +
> > +       base = amdgpu_gfx_mec_pipe_compute_ring_base(adev, guilty->xcc_id);
> > +       for (j = 0; j < adev->gfx.num_compute_rings; j++) {
> > +               r = &adev->gfx.compute_ring[base + j];
> > +               if (!amdgpu_gfx_ring_on_mec_pipe(r, guilty->me, 
> > guilty->pipe))
> > +                       continue;
> > +
> > +               err = kcq_init(r, true);
> > +               if (err)
> > +                       goto err_sched;
> > +               err = amdgpu_mes_map_legacy_queue(adev, r, 0);
> > +               if (err)
> > +                       goto err_sched;
> > +       }
> > +
> > +       amdgpu_gfx_mec_pipe_restart_schedulers(adev, guilty->me, 
> > guilty->pipe,
> > +                                              guilty->xcc_id);
> > +
> > +       for (j = 0; j < adev->gfx.num_compute_rings; j++) {
> > +               r = &adev->gfx.compute_ring[base + j];
> > +               if (!amdgpu_gfx_ring_on_mec_pipe(r, guilty->me, 
> > guilty->pipe))
> > +                       continue;
> > +
> > +               err = amdgpu_ring_reset_helper_end(
> > +                       r, r == guilty ? timedout_fence :
> > + &collateral_reemit);
>
> I don't think this will work.  We don't actually know which job hung the pipe 
> so we
> can't resubmit all of the jobs.  We either need to discard all jobs
> (amdgpu_fence_driver_force_completion()) or just the ones associated with the
> context from the earliest job on the queue.
Agreed.

> Rather than stopping the jobs and doing all of the resets here, it might be 
> cleaner to
> have a worker thread which schedules a call to
> drm_sched_fault() for all of the affected queues.  Then each queue reset will 
> run
> sequentially.  You need to be careful however since you could get into a loop 
> where
> each reset ends up scheduling more resets if we end up falling back to the 
> pipe
> reset handler for some reason.
Do you mean that the DRM scheduler doesn't need to be stopped before the pipe 
reset and doesn't need to be restarted after it — we just need to call 
`drm_sched_fault()` for the other queues on the pipe after the pipe reset, 
right?

How about this? Simply drop this patch and handle the pipe reset the same way 
as the existing one in `gfx_v9_4_3_reset_kcq`.
The pipe reset follows the same sequence, without calling drm_sched_fault() after the reset.
If there are running jobs on the affected queues, those jobs should 
automatically time out, which will then trigger the queue/pipe resets in 
sequence.

Thanks
Jesse
>
> Alex
>
>
> > +               if (err) {
> > +                       dev_err(adev->dev,
> > +                               "ring %s failed recover after MEC pipe 
> > reset (%d)\n",
> > +                               r->name, err);
> > +                       return err;
> > +               }
> > +       }
> > +
> > +       return 0;
> > +
> > +err_sched:
> > +       amdgpu_gfx_mec_pipe_restart_schedulers(adev, guilty->me, 
> > guilty->pipe,
> > +                                              guilty->xcc_id);
> > +       return err;
> > +}
> > +
> >  bool amdgpu_gfx_is_mec_queue_enabled(struct amdgpu_device *adev,
> >                                      int xcc_id, int mec, int pipe,
> > int queue)  { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > index 77050f9884f2..38b317b91bbe 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > @@ -603,6 +603,17 @@ int amdgpu_gfx_mec_queue_to_bit(struct
> amdgpu_device *adev, int mec,
> >                                 int pipe, int queue);  void
> > amdgpu_queue_mask_bit_to_mec_queue(struct amdgpu_device *adev, int bit,
> >                                  int *mec, int *pipe, int *queue);
> > +
> > +typedef int (*amdgpu_gfx_kcq_init_queue_t)(struct amdgpu_ring *ring,
> > +bool clear);
> > +
> > +void amdgpu_gfx_mec_pipe_reset_prepare(struct amdgpu_device *adev,
> > +                                      struct amdgpu_ring *guilty);
> > +void amdgpu_gfx_mec_pipe_restart_schedulers(struct amdgpu_device *adev,
> > +                                           u32 me, u32 pipe, u32
> > +xcc_id); int amdgpu_gfx_mec_pipe_reset_recover_queues(
> > +       struct amdgpu_device *adev, struct amdgpu_ring *guilty,
> > +       struct amdgpu_fence *timedout_fence,
> > +       amdgpu_gfx_kcq_init_queue_t kcq_init);
> >  bool amdgpu_gfx_is_mec_queue_enabled(struct amdgpu_device *adev, int
> xcc_id,
> >                                      int mec, int pipe, int queue);
> > bool amdgpu_gfx_is_high_priority_compute_queue(struct amdgpu_device
> > *adev,
> > --
> > 2.49.0
> >

Reply via email to