On Tue, May 12, 2026 at 4:59 AM Jesse Zhang <[email protected]> wrote:
>
> Introduce helpers to prepare for and recover from a MEC pipe
> reset.  The pre‑reset handler stops the KFD scheduler if the KFD is
> initialised, preventing new submissions while the pipe is being
> reset.  The post‑reset handler iterates over all compute rings
> sharing the same MEC pipe (on the affected XCC) and marks any
> non‑guilty ring’s scheduler as faulted via drm_sched_fault().
>
> v2: drop the stop of the drm scheduler; have a worker thread which schedules a call to
> drm_sched_fault() for all of the affected queues (Alex)
>
> Suggested-by: Alex Deucher <[email protected]>
> Signed-off-by: Jesse Zhang <[email protected]>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 45 +++++++++++++++++++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h |  5 +++
>  2 files changed, 50 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index 515cc4a2aeb4..a9fd639e4cd6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -69,6 +69,51 @@ void amdgpu_queue_mask_bit_to_mec_queue(struct amdgpu_device *adev, int bit,
>
>  }
>
> +static bool amdgpu_gfx_ring_on_mec_pipe(struct amdgpu_ring *ring, u32 me, u32 pipe)
> +{
> +       if (!ring || !ring->funcs || ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE)
> +               return false;
> +
> +       return ring->me == me && ring->pipe == pipe;
> +}
> +
> +static unsigned int amdgpu_gfx_mec_pipe_compute_ring_base(struct amdgpu_device *adev,
> +                                                        u32 xcc_id)
> +{
> +       int num_xcc = adev->gfx.xcc_mask ? NUM_XCC(adev->gfx.xcc_mask) : 1;
> +
> +       if (num_xcc <= 1)
> +               return 0;
> +       return xcc_id * adev->gfx.num_compute_rings;
> +}
> +
> +void amdgpu_gfx_mec_pre_pipe_reset(struct amdgpu_device *adev,
> +                                      struct amdgpu_ring *guilty)
> +{
> +       if (adev->kfd.init_complete)
> +               amdgpu_amdkfd_stop_sched(adev, guilty->xcc_id);
> +}
> +
> +void amdgpu_gfx_mec_post_pipe_reset(struct amdgpu_device *adev, struct amdgpu_ring *guilty)
> +{
> +       struct amdgpu_ring *ring;
> +       unsigned int j, base;
> +
> +       base = amdgpu_gfx_mec_pipe_compute_ring_base(adev, guilty->xcc_id);
> +       for (j = 0; j < adev->gfx.num_compute_rings; j++) {
> +               ring = &adev->gfx.compute_ring[base + j];
> +               if (!amdgpu_gfx_ring_on_mec_pipe(ring, guilty->me, guilty->pipe))
> +                       continue;
> +
> +               if (ring != guilty)
> +                       drm_sched_fault(&ring->sched);

You can't call drm_sched_fault() from the reset handler, since you are
already in the work queue that it modifies.

Alex

> +       }
> +
> +       if (adev->kfd.init_complete)
> +               amdgpu_amdkfd_start_sched(adev, guilty->xcc_id);
> +}
> +
>  bool amdgpu_gfx_is_mec_queue_enabled(struct amdgpu_device *adev,
>                                      int xcc_id, int mec, int pipe, int queue)
>  {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index 77050f9884f2..1deb82836f02 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -603,6 +603,11 @@ int amdgpu_gfx_mec_queue_to_bit(struct amdgpu_device *adev, int mec,
>                                 int pipe, int queue);
>  void amdgpu_queue_mask_bit_to_mec_queue(struct amdgpu_device *adev, int bit,
>                                  int *mec, int *pipe, int *queue);
> +
> +void amdgpu_gfx_mec_pre_pipe_reset(struct amdgpu_device *adev,
> +                                       struct amdgpu_ring *guilty);
> +void amdgpu_gfx_mec_post_pipe_reset(struct amdgpu_device *adev,
> +                                       struct amdgpu_ring *guilty);
>  bool amdgpu_gfx_is_mec_queue_enabled(struct amdgpu_device *adev, int xcc_id,
>                                      int mec, int pipe, int queue);
>  bool amdgpu_gfx_is_high_priority_compute_queue(struct amdgpu_device *adev,
> --
> 2.49.0
>

Reply via email to