amdgpu_gfx_reset_mes_compute() runs amdgpu_mes_suspend(adev, 0) to quiesce all gangs, resets the offending queue(s), and then resumes. The existing amdgpu_gfx_mes_reset_queue() calls amdgpu_ring_reset_helper_end() right after the unmap/restore/map of the reset queue, which re-emits the backed-up commands and rings the doorbell. That doorbell hits a still-suspended CP, so on the subsequent resume the queue partially wedges: the first new IB after the reset may execute, but later submissions stall, which surfaces as repeated timeouts on the same ring under concurrent workloads.
Split out amdgpu_gfx_mes_reset_queue_no_end() (backup + MES reset + unmap/restore/map only) and defer helper_end. amdgpu_gfx_reset_mes_compute() collects the (ring, fence) pair for every queue it resets and runs helper_end on each after amdgpu_mes_resume(), so the re-emit doorbells land on a running CP. amdgpu_gfx_reset_mes_kcq() now reports the matched ring/fence back to the caller for the same reason. Signed-off-by: Jesse Zhang <[email protected]> --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 68 ++++++++++++++++++++++--- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 5 ++ 2 files changed, 65 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index ff5a55f5f3c9..b6202095f256 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -1989,10 +1989,10 @@ static ssize_t amdgpu_gfx_get_compute_reset_mask(struct device *dev, return amdgpu_show_reset_mask(buf, adev->gfx.compute_supported_reset); } -int amdgpu_gfx_mes_reset_queue(struct amdgpu_ring *ring, - unsigned int vmid, - struct amdgpu_fence *timedout_fence, - bool use_mmio) +static int amdgpu_gfx_mes_reset_queue_no_end(struct amdgpu_ring *ring, + unsigned int vmid, + struct amdgpu_fence *timedout_fence, + bool use_mmio) { struct amdgpu_device *adev = ring->adev; bool reinit_queue; @@ -2026,7 +2026,20 @@ int amdgpu_gfx_mes_reset_queue(struct amdgpu_ring *ring, return r; } } + return 0; +} +int amdgpu_gfx_mes_reset_queue(struct amdgpu_ring *ring, + unsigned int vmid, + struct amdgpu_fence *timedout_fence, + bool use_mmio) +{ + int r; + + r = amdgpu_gfx_mes_reset_queue_no_end(ring, vmid, timedout_fence, + use_mmio); + if (r) + return r; return amdgpu_ring_reset_helper_end(ring, timedout_fence); } @@ -2216,24 +2229,37 @@ static void amdgpu_gfx_reset_stop_compute_scheds(struct amdgpu_device *adev, } } +/* + * Match the MES-reported hung doorbell against a compute ring and run + * the core reset (no 
helper_end). On hit, the matched ring and its guilty + * fence are returned via *out_ring / *out_fence so the caller can defer + * helper_end until after MES has resumed all gangs. + */ static int amdgpu_gfx_reset_mes_kcq(struct amdgpu_device *adev, struct amdgpu_ring *guilty_ring, - unsigned int db) + unsigned int db, + struct amdgpu_ring **out_ring, + struct amdgpu_fence **out_fence) { bool use_mmio = adev->gfx.mec.use_mmio_for_reset; struct amdgpu_fence *fence; struct amdgpu_ring *ring; int i, r; + *out_ring = NULL; + *out_fence = NULL; for (i = 0; i < adev->gfx.num_compute_rings; i++) { ring = &adev->gfx.compute_ring[i]; if (ring == guilty_ring) continue; if (ring->doorbell_index == db) { fence = amdgpu_ring_find_guilty_fence(ring); - r = amdgpu_gfx_mes_reset_queue(ring, 0, fence, use_mmio); + r = amdgpu_gfx_mes_reset_queue_no_end(ring, 0, fence, + use_mmio); if (r) return r; + *out_ring = ring; + *out_fence = fence; break; } } @@ -2254,6 +2280,8 @@ int amdgpu_gfx_reset_mes_compute(struct amdgpu_device *adev, unsigned int num_hung = 0; bool use_mmio = adev->gfx.mec.use_mmio_for_reset; struct mes_remove_queue_input *queue_input = (struct mes_remove_queue_input *)faulty_queue_input; + struct amdgpu_gfx_deferred_entry deferred_end[AMDGPU_MAX_COMPUTE_RINGS + 1]; + int n_deferred = 0; guard(mutex)(&adev->gfx.mec.reset_mutex); /* stop the drm schedulers for all compute queues */ @@ -2278,9 +2306,13 @@ int amdgpu_gfx_reset_mes_compute(struct amdgpu_device *adev, fence_reset: /* reset the queue this came from if specified */ if (ring) { - r = amdgpu_gfx_mes_reset_queue(ring, 0, guilty_fence, use_mmio); + r = amdgpu_gfx_mes_reset_queue_no_end(ring, 0, guilty_fence, + use_mmio); if (r) goto out; + deferred_end[n_deferred].ring = ring; + deferred_end[n_deferred].fence = guilty_fence; + n_deferred++; } if (uq) { r = mes_userq_reset(uq); @@ -2288,15 +2320,24 @@ int amdgpu_gfx_reset_mes_compute(struct amdgpu_device *adev, goto out; } for (i = 0; i < num_hung; i++) { + struct 
amdgpu_ring *hr = NULL; + struct amdgpu_fence *hf = NULL; + pipe = hqd_info[i].pipe_index; queue = hqd_info[i].queue_index; queue_type = hqd_info[i].queue_type; /* reset any KCQs */ r = amdgpu_gfx_reset_mes_kcq(adev, ring, - adev->gfx.mec.mes_hung_db_array[i]); + adev->gfx.mec.mes_hung_db_array[i], + &hr, &hf); if (r) goto out; + if (hr) { + deferred_end[n_deferred].ring = hr; + deferred_end[n_deferred].fence = hf; + n_deferred++; + } /* reset any KFD queues */ r = amdgpu_amdkfd_reset_mes_queue(adev, 0, queue_type, pipe, queue, adev->gfx.mec.mes_hung_db_array[i]); @@ -2325,6 +2366,17 @@ int amdgpu_gfx_reset_mes_compute(struct amdgpu_device *adev, out: /* resume all will enable the non-hung queues */ amdgpu_mes_resume(adev, 0); + + /* Now CP is running again — replay backed-up commands and ring + * doorbells on each reset queue. + */ + for (i = 0; i < n_deferred; i++) { + int er = amdgpu_ring_reset_helper_end(deferred_end[i].ring, + deferred_end[i].fence); + if (er && !r) + r = er; + } + if (!r) amdgpu_gfx_reset_start_compute_scheds(adev, ring); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h index 4003360c7d9a..381fc17274b9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h @@ -549,6 +549,11 @@ struct amdgpu_gfx { bool disable_uq; }; +struct amdgpu_gfx_deferred_entry { + struct amdgpu_ring *ring; + struct amdgpu_fence *fence; +}; + struct amdgpu_gfx_ras_reg_entry { struct amdgpu_ras_err_status_reg_entry reg_entry; enum amdgpu_gfx_ras_mem_id_type mem_id_type; -- 2.49.0
