gfx: defer per-queue helper_end until after MES resume

Liang, Prike Mon, 08 Jun 2026 20:00:17 -0700

Public

Regards,
      Prike


> -----Original Message-----
> From: amd-gfx <[email protected]> On Behalf Of Jesse
> Zhang
> Sent: Monday, June 8, 2026 11:06 AM
> To: [email protected]
> Cc: Deucher, Alexander <[email protected]>; Koenig, Christian
> <[email protected]>; Zhang, Jesse(Jie) <[email protected]>;
> Zhang, Jesse(Jie) <[email protected]>
> Subject: [PATCH] drm/amdgpu/gfx: defer per-queue helper_end until after MES
> resume
>
> amdgpu_gfx_reset_mes_compute() runs amdgpu_mes_suspend(adev, 0) to quiesce
> all gangs, resets the offending queue(s), then resumes. The existing
> amdgpu_gfx_mes_reset_queue() called amdgpu_ring_reset_helper_end() right after
> unmap/restore/map of the reset queue, which re-emits backed-up commands and
> rings the doorbell. That doorbell hits a still-suspended CP:
> on the subsequent resume the queue partially wedges -- the first new IB after 
> the
> reset may execute but later submissions stall, which surfaces as repeated 
> timeouts
> on the same ring under concurrent workloads.
>
> Split out amdgpu_gfx_mes_reset_queue_no_end() (backup + MES reset +
> unmap/restore/map only) and defer helper_end. amdgpu_gfx_reset_mes_compute()
> collects the (ring, fence) pair for every queue it resets and runs helper_end 
> on each
> after amdgpu_mes_resume(), so the re-emit doorbells land on a running CP.
> amdgpu_gfx_reset_mes_kcq() now reports the matched ring/fence back to the 
> caller
> for the same reason.
>
> Signed-off-by: Jesse Zhang <[email protected]>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 68 ++++++++++++++++++++++---
> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h |  5 ++
>  2 files changed, 65 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index ff5a55f5f3c9..b6202095f256 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -1989,10 +1989,10 @@ static ssize_t
> amdgpu_gfx_get_compute_reset_mask(struct device *dev,
>       return amdgpu_show_reset_mask(buf, adev->gfx.compute_supported_reset);
>  }
>
> -int amdgpu_gfx_mes_reset_queue(struct amdgpu_ring *ring,
> -                            unsigned int vmid,
> -                            struct amdgpu_fence *timedout_fence,
> -                            bool use_mmio)
> +static int amdgpu_gfx_mes_reset_queue_no_end(struct amdgpu_ring *ring,
> +                                          unsigned int vmid,
> +                                          struct amdgpu_fence 
> *timedout_fence,
> +                                          bool use_mmio)
>  {
>       struct amdgpu_device *adev = ring->adev;
>       bool reinit_queue;
> @@ -2026,7 +2026,20 @@ int amdgpu_gfx_mes_reset_queue(struct amdgpu_ring
> *ring,
>                       return r;
>               }
>       }
> +     return 0;
> +}
>
> +int amdgpu_gfx_mes_reset_queue(struct amdgpu_ring *ring,
> +                            unsigned int vmid,
> +                            struct amdgpu_fence *timedout_fence,
> +                            bool use_mmio)
> +{
> +     int r;
> +
> +     r = amdgpu_gfx_mes_reset_queue_no_end(ring, vmid, timedout_fence,
> +                                           use_mmio);
> +     if (r)
> +             return r;
>       return amdgpu_ring_reset_helper_end(ring, timedout_fence);  }
>
> @@ -2216,24 +2229,37 @@ static void
> amdgpu_gfx_reset_stop_compute_scheds(struct amdgpu_device *adev,
>       }
>  }
>
> +/*
> + * Match the MES-reported hung doorbell against a compute ring and run
> + * the core reset (no helper_end). On hit, the matched ring and its
> +guilty
> + * fence are returned via *out_ring / *out_fence so the caller can
> +defer
> + * helper_end until after MES has resumed all gangs.
> + */
>  static int amdgpu_gfx_reset_mes_kcq(struct amdgpu_device *adev,
>                                   struct amdgpu_ring *guilty_ring,
> -                                 unsigned int db)
> +                                 unsigned int db,
> +                                 struct amdgpu_ring **out_ring,
> +                                 struct amdgpu_fence **out_fence)
>  {
>       bool use_mmio = adev->gfx.mec.use_mmio_for_reset;
>       struct amdgpu_fence *fence;
>       struct amdgpu_ring *ring;
>       int i, r;
>
> +     *out_ring = NULL;
> +     *out_fence = NULL;
>       for (i = 0; i < adev->gfx.num_compute_rings; i++) {
>               ring = &adev->gfx.compute_ring[i];
>               if (ring == guilty_ring)
>                       continue;
>               if (ring->doorbell_index == db) {
>                       fence = amdgpu_ring_find_guilty_fence(ring);
> -                     r = amdgpu_gfx_mes_reset_queue(ring, 0, fence, 
> use_mmio);

If it's possible to haven't any guilty fence return from 
amdgpu_ring_find_guilty_fence(), then we need to handle this case respectively 
in the amdgpu_gfx_mes_reset_queue_no_end()?

> +                     r = amdgpu_gfx_mes_reset_queue_no_end(ring, 0, fence,
> +                                                           use_mmio);
>                       if (r)
>                               return r;
> +                     *out_ring = ring;
> +                     *out_fence = fence;
>                       break;
>               }
>       }
> @@ -2254,6 +2280,8 @@ int amdgpu_gfx_reset_mes_compute(struct
> amdgpu_device *adev,
>       unsigned int num_hung = 0;
>       bool use_mmio = adev->gfx.mec.use_mmio_for_reset;
>       struct mes_remove_queue_input *queue_input = (struct
> mes_remove_queue_input *)faulty_queue_input;
> +     struct amdgpu_gfx_deferred_entry
> deferred_end[AMDGPU_MAX_COMPUTE_RINGS + 1];
> +     int n_deferred = 0;
>
>       guard(mutex)(&adev->gfx.mec.reset_mutex);
>       /* stop the drm schedulers for all compute queues */ @@ -2278,9 +2306,13
> @@ int amdgpu_gfx_reset_mes_compute(struct amdgpu_device *adev,
>  fence_reset:
>       /* reset the queue this came from if specified */
>       if (ring) {
> -             r = amdgpu_gfx_mes_reset_queue(ring, 0, guilty_fence, use_mmio);
> +             r = amdgpu_gfx_mes_reset_queue_no_end(ring, 0, guilty_fence,
> +                                                   use_mmio);
>               if (r)
>                       goto out;
> +             deferred_end[n_deferred].ring = ring;
> +             deferred_end[n_deferred].fence = guilty_fence;
> +             n_deferred++;
>       }
>       if (uq) {
>               r = mes_userq_reset(uq);
> @@ -2288,15 +2320,24 @@ int amdgpu_gfx_reset_mes_compute(struct
> amdgpu_device *adev,
>                       goto out;
>       }
>       for (i = 0; i < num_hung; i++) {
> +             struct amdgpu_ring *hr = NULL;
> +             struct amdgpu_fence *hf = NULL;
> +
>               pipe = hqd_info[i].pipe_index;
>               queue = hqd_info[i].queue_index;
>               queue_type = hqd_info[i].queue_type;
>
>               /* reset any KCQs */
>               r = amdgpu_gfx_reset_mes_kcq(adev, ring,
> -                                          
> adev->gfx.mec.mes_hung_db_array[i]);
> +                                          adev->gfx.mec.mes_hung_db_array[i],
> +                                          &hr, &hf);
>               if (r)
>                       goto out;
> +             if (hr) {
> +                     deferred_end[n_deferred].ring = hr;
> +                     deferred_end[n_deferred].fence = hf;

Is it possible that the deferred test ring has been duplicated from the 
subsequent setting in amdgpu_gfx_mes_reset_queue_no_end()?

> +                     n_deferred++;
> +             }
>               /* reset any KFD queues */
>               r = amdgpu_amdkfd_reset_mes_queue(adev, 0, queue_type, pipe,
> queue,
>                                                 adev-
> >gfx.mec.mes_hung_db_array[i]);
> @@ -2325,6 +2366,17 @@ int amdgpu_gfx_reset_mes_compute(struct
> amdgpu_device *adev,
>  out:
>       /* resume all will enable the non-hung queues */
>       amdgpu_mes_resume(adev, 0);
> +
> +     /* Now CP is running again — replay backed-up commands and ring
> +      * doorbells on each reset queue.
> +      */
> +     for (i = 0; i < n_deferred; i++) {
> +             int er = amdgpu_ring_reset_helper_end(deferred_end[i].ring,
> +                                                   deferred_end[i].fence);
> +             if (er && !r)
> +                     r = er;
> +     }
> +
>       if (!r)
>               amdgpu_gfx_reset_start_compute_scheds(adev, ring);
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index 4003360c7d9a..381fc17274b9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -549,6 +549,11 @@ struct amdgpu_gfx {
>       bool                            disable_uq;
>  };
>
> +struct amdgpu_gfx_deferred_entry {
> +     struct amdgpu_ring      *ring;
> +     struct amdgpu_fence     *fence;
> +};
> +
>  struct amdgpu_gfx_ras_reg_entry {
>       struct amdgpu_ras_err_status_reg_entry reg_entry;
>       enum amdgpu_gfx_ras_mem_id_type mem_id_type;
> --
> 2.49.0

RE: [PATCH] drm/amdgpu/gfx: defer per-queue helper_end until after MES resume

Reply via email to