On Sun, Jun 7, 2026 at 11:24 PM Jesse Zhang <[email protected]> wrote: > > amdgpu_gfx_reset_mes_compute() runs amdgpu_mes_suspend(adev, 0) to > quiesce all gangs, resets the offending queue(s), then resumes. The > existing amdgpu_gfx_mes_reset_queue() called amdgpu_ring_reset_helper_end() > right after unmap/restore/map of the reset queue, which re-emits backed-up > commands and rings the doorbell. That doorbell hits a still-suspended CP: > on the subsequent resume the queue partially wedges -- the first new IB > after the reset may execute but later submissions stall, which surfaces > as repeated timeouts on the same ring under concurrent workloads. > > Split out amdgpu_gfx_mes_reset_queue_no_end() (backup + MES reset + > unmap/restore/map only) and defer helper_end. amdgpu_gfx_reset_mes_compute() > collects the (ring, fence) pair for every queue it resets and runs > helper_end on each after amdgpu_mes_resume(), so the re-emit doorbells > land on a running CP. amdgpu_gfx_reset_mes_kcq() now reports the matched > ring/fence back to the caller for the same reason. > > Signed-off-by: Jesse Zhang <[email protected]> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 68 ++++++++++++++++++++++--- > drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 5 ++ > 2 files changed, 65 insertions(+), 8 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > index ff5a55f5f3c9..b6202095f256 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > @@ -1989,10 +1989,10 @@ static ssize_t > amdgpu_gfx_get_compute_reset_mask(struct device *dev, > return amdgpu_show_reset_mask(buf, adev->gfx.compute_supported_reset); > } > > -int amdgpu_gfx_mes_reset_queue(struct amdgpu_ring *ring, > - unsigned int vmid, > - struct amdgpu_fence *timedout_fence, > - bool use_mmio) > +static int amdgpu_gfx_mes_reset_queue_no_end(struct amdgpu_ring *ring,
The _no_end() suffix sounds weird. How about _start() instead?

With that fixed, the patch is:
Reviewed-by: Alex Deucher <[email protected]>

> + unsigned int vmid, > + struct amdgpu_fence > *timedout_fence, > + bool use_mmio) > { > struct amdgpu_device *adev = ring->adev; > bool reinit_queue; > @@ -2026,7 +2026,20 @@ int amdgpu_gfx_mes_reset_queue(struct amdgpu_ring > *ring, > return r; > } > } > + return 0; > +} > > +int amdgpu_gfx_mes_reset_queue(struct amdgpu_ring *ring, > + unsigned int vmid, > + struct amdgpu_fence *timedout_fence, > + bool use_mmio) > +{ > + int r; > + > + r = amdgpu_gfx_mes_reset_queue_no_end(ring, vmid, timedout_fence, > + use_mmio); > + if (r) > + return r; > return amdgpu_ring_reset_helper_end(ring, timedout_fence); > } > > @@ -2216,24 +2229,37 @@ static void > amdgpu_gfx_reset_stop_compute_scheds(struct amdgpu_device *adev, > } > } > > +/* > + * Match the MES-reported hung doorbell against a compute ring and run > + * the core reset (no helper_end). On hit, the matched ring and its guilty > + * fence are returned via *out_ring / *out_fence so the caller can defer > + * helper_end until after MES has resumed all gangs. 
> + */ > static int amdgpu_gfx_reset_mes_kcq(struct amdgpu_device *adev, > struct amdgpu_ring *guilty_ring, > - unsigned int db) > + unsigned int db, > + struct amdgpu_ring **out_ring, > + struct amdgpu_fence **out_fence) > { > bool use_mmio = adev->gfx.mec.use_mmio_for_reset; > struct amdgpu_fence *fence; > struct amdgpu_ring *ring; > int i, r; > > + *out_ring = NULL; > + *out_fence = NULL; > for (i = 0; i < adev->gfx.num_compute_rings; i++) { > ring = &adev->gfx.compute_ring[i]; > if (ring == guilty_ring) > continue; > if (ring->doorbell_index == db) { > fence = amdgpu_ring_find_guilty_fence(ring); > - r = amdgpu_gfx_mes_reset_queue(ring, 0, fence, > use_mmio); > + r = amdgpu_gfx_mes_reset_queue_no_end(ring, 0, fence, > + use_mmio); > if (r) > return r; > + *out_ring = ring; > + *out_fence = fence; > break; > } > } > @@ -2254,6 +2280,8 @@ int amdgpu_gfx_reset_mes_compute(struct amdgpu_device > *adev, > unsigned int num_hung = 0; > bool use_mmio = adev->gfx.mec.use_mmio_for_reset; > struct mes_remove_queue_input *queue_input = (struct > mes_remove_queue_input *)faulty_queue_input; > + struct amdgpu_gfx_deferred_entry > deferred_end[AMDGPU_MAX_COMPUTE_RINGS + 1]; > + int n_deferred = 0; > > guard(mutex)(&adev->gfx.mec.reset_mutex); > /* stop the drm schedulers for all compute queues */ > @@ -2278,9 +2306,13 @@ int amdgpu_gfx_reset_mes_compute(struct amdgpu_device > *adev, > fence_reset: > /* reset the queue this came from if specified */ > if (ring) { > - r = amdgpu_gfx_mes_reset_queue(ring, 0, guilty_fence, > use_mmio); > + r = amdgpu_gfx_mes_reset_queue_no_end(ring, 0, guilty_fence, > + use_mmio); > if (r) > goto out; > + deferred_end[n_deferred].ring = ring; > + deferred_end[n_deferred].fence = guilty_fence; > + n_deferred++; > } > if (uq) { > r = mes_userq_reset(uq); > @@ -2288,15 +2320,24 @@ int amdgpu_gfx_reset_mes_compute(struct amdgpu_device > *adev, > goto out; > } > for (i = 0; i < num_hung; i++) { > + struct amdgpu_ring *hr = NULL; > + struct 
amdgpu_fence *hf = NULL; > + > pipe = hqd_info[i].pipe_index; > queue = hqd_info[i].queue_index; > queue_type = hqd_info[i].queue_type; > > /* reset any KCQs */ > r = amdgpu_gfx_reset_mes_kcq(adev, ring, > - > adev->gfx.mec.mes_hung_db_array[i]); > + > adev->gfx.mec.mes_hung_db_array[i], > + &hr, &hf); > if (r) > goto out; > + if (hr) { > + deferred_end[n_deferred].ring = hr; > + deferred_end[n_deferred].fence = hf; > + n_deferred++; > + } > /* reset any KFD queues */ > r = amdgpu_amdkfd_reset_mes_queue(adev, 0, queue_type, pipe, > queue, > > adev->gfx.mec.mes_hung_db_array[i]); > @@ -2325,6 +2366,17 @@ int amdgpu_gfx_reset_mes_compute(struct amdgpu_device > *adev, > out: > /* resume all will enable the non-hung queues */ > amdgpu_mes_resume(adev, 0); > + > + /* Now CP is running again — replay backed-up commands and ring > + * doorbells on each reset queue. > + */ > + for (i = 0; i < n_deferred; i++) { > + int er = amdgpu_ring_reset_helper_end(deferred_end[i].ring, > + deferred_end[i].fence); > + if (er && !r) > + r = er; > + } > + > if (!r) > amdgpu_gfx_reset_start_compute_scheds(adev, ring); > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h > index 4003360c7d9a..381fc17274b9 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h > @@ -549,6 +549,11 @@ struct amdgpu_gfx { > bool disable_uq; > }; > > +struct amdgpu_gfx_deferred_entry { > + struct amdgpu_ring *ring; > + struct amdgpu_fence *fence; > +}; > + > struct amdgpu_gfx_ras_reg_entry { > struct amdgpu_ras_err_status_reg_entry reg_entry; > enum amdgpu_gfx_ras_mem_id_type mem_id_type; > -- > 2.49.0 >
