On Sun, Jun 7, 2026 at 11:24 PM Jesse Zhang <[email protected]> wrote:
>
> amdgpu_gfx_reset_mes_compute() runs amdgpu_mes_suspend(adev, 0) to
> quiesce all gangs, resets the offending queue(s), then resumes. The
> existing amdgpu_gfx_mes_reset_queue() called amdgpu_ring_reset_helper_end()
> right after unmap/restore/map of the reset queue, which re-emits backed-up
> commands and rings the doorbell. That doorbell hits a still-suspended CP:
> on the subsequent resume the queue partially wedges -- the first new IB
> after the reset may execute but later submissions stall, which surfaces
> as repeated timeouts on the same ring under concurrent workloads.
>
> Split out amdgpu_gfx_mes_reset_queue_no_end() (backup + MES reset +
> unmap/restore/map only) and defer helper_end. amdgpu_gfx_reset_mes_compute()
> collects the (ring, fence) pair for every queue it resets and runs
> helper_end on each after amdgpu_mes_resume(), so the re-emit doorbells
> land on a running CP. amdgpu_gfx_reset_mes_kcq() now reports the matched
> ring/fence back to the caller for the same reason.
>
> Signed-off-by: Jesse Zhang <[email protected]>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 68 ++++++++++++++++++++++---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h |  5 ++
>  2 files changed, 65 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index ff5a55f5f3c9..b6202095f256 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -1989,10 +1989,10 @@ static ssize_t amdgpu_gfx_get_compute_reset_mask(struct device *dev,
>         return amdgpu_show_reset_mask(buf, adev->gfx.compute_supported_reset);
>  }
>
> -int amdgpu_gfx_mes_reset_queue(struct amdgpu_ring *ring,
> -                              unsigned int vmid,
> -                              struct amdgpu_fence *timedout_fence,
> -                              bool use_mmio)
> +static int amdgpu_gfx_mes_reset_queue_no_end(struct amdgpu_ring *ring,

_no_end() sounds weird.  How about _start() instead?  With that fixed,
the patch is:
Reviewed-by: Alex Deucher <[email protected]>


> +                                            unsigned int vmid,
> +                                            struct amdgpu_fence *timedout_fence,
> +                                            bool use_mmio)
>  {
>         struct amdgpu_device *adev = ring->adev;
>         bool reinit_queue;
> @@ -2026,7 +2026,20 @@ int amdgpu_gfx_mes_reset_queue(struct amdgpu_ring *ring,
>                         return r;
>                 }
>         }
> +       return 0;
> +}
>
> +int amdgpu_gfx_mes_reset_queue(struct amdgpu_ring *ring,
> +                              unsigned int vmid,
> +                              struct amdgpu_fence *timedout_fence,
> +                              bool use_mmio)
> +{
> +       int r;
> +
> +       r = amdgpu_gfx_mes_reset_queue_no_end(ring, vmid, timedout_fence,
> +                                             use_mmio);
> +       if (r)
> +               return r;
>         return amdgpu_ring_reset_helper_end(ring, timedout_fence);
>  }
>
> @@ -2216,24 +2229,37 @@ static void amdgpu_gfx_reset_stop_compute_scheds(struct amdgpu_device *adev,
>         }
>  }
>
> +/*
> + * Match the MES-reported hung doorbell against a compute ring and run
> + * the core reset (no helper_end). On hit, the matched ring and its guilty
> + * fence are returned via *out_ring / *out_fence so the caller can defer
> + * helper_end until after MES has resumed all gangs.
> + */
>  static int amdgpu_gfx_reset_mes_kcq(struct amdgpu_device *adev,
>                                     struct amdgpu_ring *guilty_ring,
> -                                   unsigned int db)
> +                                   unsigned int db,
> +                                   struct amdgpu_ring **out_ring,
> +                                   struct amdgpu_fence **out_fence)
>  {
>         bool use_mmio = adev->gfx.mec.use_mmio_for_reset;
>         struct amdgpu_fence *fence;
>         struct amdgpu_ring *ring;
>         int i, r;
>
> +       *out_ring = NULL;
> +       *out_fence = NULL;
>         for (i = 0; i < adev->gfx.num_compute_rings; i++) {
>                 ring = &adev->gfx.compute_ring[i];
>                 if (ring == guilty_ring)
>                         continue;
>                 if (ring->doorbell_index == db) {
>                         fence = amdgpu_ring_find_guilty_fence(ring);
> -                       r = amdgpu_gfx_mes_reset_queue(ring, 0, fence, use_mmio);
> +                       r = amdgpu_gfx_mes_reset_queue_no_end(ring, 0, fence,
> +                                                             use_mmio);
>                         if (r)
>                                 return r;
> +                       *out_ring = ring;
> +                       *out_fence = fence;
>                         break;
>                 }
>         }
> @@ -2254,6 +2280,8 @@ int amdgpu_gfx_reset_mes_compute(struct amdgpu_device *adev,
>         unsigned int num_hung = 0;
>         bool use_mmio = adev->gfx.mec.use_mmio_for_reset;
>         struct mes_remove_queue_input *queue_input = (struct mes_remove_queue_input *)faulty_queue_input;
> +       struct amdgpu_gfx_deferred_entry deferred_end[AMDGPU_MAX_COMPUTE_RINGS + 1];
> +       int n_deferred = 0;
>
>         guard(mutex)(&adev->gfx.mec.reset_mutex);
>         /* stop the drm schedulers for all compute queues */
> @@ -2278,9 +2306,13 @@ int amdgpu_gfx_reset_mes_compute(struct amdgpu_device *adev,
>  fence_reset:
>         /* reset the queue this came from if specified */
>         if (ring) {
> -               r = amdgpu_gfx_mes_reset_queue(ring, 0, guilty_fence, use_mmio);
> +               r = amdgpu_gfx_mes_reset_queue_no_end(ring, 0, guilty_fence,
> +                                                     use_mmio);
>                 if (r)
>                         goto out;
> +               deferred_end[n_deferred].ring = ring;
> +               deferred_end[n_deferred].fence = guilty_fence;
> +               n_deferred++;
>         }
>         if (uq) {
>                 r = mes_userq_reset(uq);
> @@ -2288,15 +2320,24 @@ int amdgpu_gfx_reset_mes_compute(struct amdgpu_device *adev,
>                         goto out;
>         }
>         for (i = 0; i < num_hung; i++) {
> +               struct amdgpu_ring *hr = NULL;
> +               struct amdgpu_fence *hf = NULL;
> +
>                 pipe = hqd_info[i].pipe_index;
>                 queue = hqd_info[i].queue_index;
>                 queue_type = hqd_info[i].queue_type;
>
>                 /* reset any KCQs */
>                 r = amdgpu_gfx_reset_mes_kcq(adev, ring,
> -                                            adev->gfx.mec.mes_hung_db_array[i]);
> +                                            adev->gfx.mec.mes_hung_db_array[i],
> +                                            &hr, &hf);
>                 if (r)
>                         goto out;
> +               if (hr) {
> +                       deferred_end[n_deferred].ring = hr;
> +                       deferred_end[n_deferred].fence = hf;
> +                       n_deferred++;
> +               }
>                 /* reset any KFD queues */
>                 r = amdgpu_amdkfd_reset_mes_queue(adev, 0, queue_type, pipe, queue,
>                                                   adev->gfx.mec.mes_hung_db_array[i]);
> @@ -2325,6 +2366,17 @@ int amdgpu_gfx_reset_mes_compute(struct amdgpu_device *adev,
>  out:
>         /* resume all will enable the non-hung queues */
>         amdgpu_mes_resume(adev, 0);
> +
> +       /* Now CP is running again — replay backed-up commands and ring
> +        * doorbells on each reset queue.
> +        */
> +       for (i = 0; i < n_deferred; i++) {
> +               int er = amdgpu_ring_reset_helper_end(deferred_end[i].ring,
> +                                                     deferred_end[i].fence);
> +               if (er && !r)
> +                       r = er;
> +       }
> +
>         if (!r)
>                 amdgpu_gfx_reset_start_compute_scheds(adev, ring);
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index 4003360c7d9a..381fc17274b9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -549,6 +549,11 @@ struct amdgpu_gfx {
>         bool                            disable_uq;
>  };
>
> +struct amdgpu_gfx_deferred_entry {
> +       struct amdgpu_ring      *ring;
> +       struct amdgpu_fence     *fence;
> +};
> +
>  struct amdgpu_gfx_ras_reg_entry {
>         struct amdgpu_ras_err_status_reg_entry reg_entry;
>         enum amdgpu_gfx_ras_mem_id_type mem_id_type;
> --
> 2.49.0
>

Reply via email to