Add IP-agnostic helpers to coordinate a MEC pipe reset across all KCQs
that share the same (me, pipe):

- amdgpu_gfx_mec_pipe_reset_prepare(): back up the unprocessed commands
  on the sibling rings, stop their GPU schedulers, and stop KFD
  scheduling on the affected XCC.
- amdgpu_gfx_mec_pipe_restart_schedulers(): restart the GPU schedulers
  and KFD scheduling once the pipe reset is done.
- amdgpu_gfx_mec_pipe_reset_recover_queues(): re-initialize and remap
  every KCQ on the pipe, then run amdgpu_ring_reset_helper_end() on
  each ring. The guilty ring is handed the timed-out fence; the sibling
  rings are handed a synthetic fence context so that their backed-up
  (collateral) work is re-emitted.

The intended call order for an IP-specific reset handler is sketched
below.
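For illustration only, an IP-specific KCQ reset callback could wire the
three helpers together roughly as follows. This is a minimal sketch, not
part of this patch: gfx_vN_reset_kcq(), gfx_vN_reset_hw_pipe() and
gfx_vN_kcq_init_queue() are hypothetical stand-ins for whatever reset
callback and KCQ init routine the IP block actually provides.

static int gfx_vN_reset_kcq(struct amdgpu_ring *ring, unsigned int vmid,
			    struct amdgpu_fence *timedout_fence)
{
	struct amdgpu_device *adev = ring->adev;
	int r;

	/* Quiesce every KCQ on the same (me, pipe), plus KFD. */
	amdgpu_gfx_mec_pipe_reset_prepare(adev, ring);

	/* Hypothetical IP-specific hardware reset of the pipe. */
	r = gfx_vN_reset_hw_pipe(adev, ring->me, ring->pipe, ring->xcc_id);
	if (r) {
		/* Reset failed: let the schedulers run again and give up. */
		amdgpu_gfx_mec_pipe_restart_schedulers(adev, ring->me,
						       ring->pipe,
						       ring->xcc_id);
		return r;
	}

	/*
	 * Re-init and remap all KCQs on the pipe, restart scheduling,
	 * then finish the reset on each ring: the guilty ring gets the
	 * timed-out fence, the siblings re-emit their backed-up work.
	 */
	return amdgpu_gfx_mec_pipe_reset_recover_queues(adev, ring,
							timedout_fence,
							gfx_vN_kcq_init_queue);
}

Passing the per-IP KCQ init routine through the amdgpu_gfx_kcq_init_queue_t
callback is what keeps the recovery loop itself IP-agnostic.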
Suggested-by: Manu Rastogi <[email protected]>
Suggested-by: Alex Deucher <[email protected]>
Signed-off-by: Jesse Zhang <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 115 ++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h |  11 +++
 2 files changed, 126 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 515cc4a2aeb4..8cfb73fda4bb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -69,6 +69,121 @@ void amdgpu_queue_mask_bit_to_mec_queue(struct amdgpu_device *adev, int bit,
 }
 
+static bool amdgpu_gfx_ring_on_mec_pipe(struct amdgpu_ring *ring, u32 me, u32 pipe)
+{
+	if (!ring || !ring->funcs || ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE)
+		return false;
+	if (ring->no_scheduler)
+		return false;
+
+	return ring->me == me && ring->pipe == pipe;
+}
+
+static unsigned int amdgpu_gfx_mec_pipe_compute_ring_base(struct amdgpu_device *adev,
+							  u32 xcc_id)
+{
+	int num_xcc = adev->gfx.xcc_mask ? NUM_XCC(adev->gfx.xcc_mask) : 1;
+
+	if (num_xcc <= 1)
+		return 0;
+	return xcc_id * adev->gfx.num_compute_rings;
+}
+
+void amdgpu_gfx_mec_pipe_reset_prepare(struct amdgpu_device *adev,
+				       struct amdgpu_ring *guilty)
+{
+	struct amdgpu_ring *r;
+	unsigned int j, base;
+
+	base = amdgpu_gfx_mec_pipe_compute_ring_base(adev, guilty->xcc_id);
+	for (j = 0; j < adev->gfx.num_compute_rings; j++) {
+		r = &adev->gfx.compute_ring[base + j];
+		if (!amdgpu_gfx_ring_on_mec_pipe(r, guilty->me, guilty->pipe))
+			continue;
+		if (r != guilty)
+			amdgpu_ring_backup_unprocessed_commands(r, NULL);
+		if (amdgpu_ring_sched_ready(r))
+			drm_sched_wqueue_stop(&r->sched);
+	}
+
+	if (adev->kfd.init_complete)
+		amdgpu_amdkfd_stop_sched(adev, guilty->xcc_id);
+}
+
+void amdgpu_gfx_mec_pipe_restart_schedulers(struct amdgpu_device *adev,
+					    u32 me, u32 pipe, u32 xcc_id)
+{
+	struct amdgpu_ring *r;
+	unsigned int j, base;
+
+	base = amdgpu_gfx_mec_pipe_compute_ring_base(adev, xcc_id);
+	for (j = 0; j < adev->gfx.num_compute_rings; j++) {
+		r = &adev->gfx.compute_ring[base + j];
+		if (!amdgpu_gfx_ring_on_mec_pipe(r, me, pipe))
+			continue;
+		if (amdgpu_ring_sched_ready(r))
+			drm_sched_wqueue_start(&r->sched);
+	}
+
+	if (adev->kfd.init_complete)
+		amdgpu_amdkfd_start_sched(adev, xcc_id);
+}
+
+int amdgpu_gfx_mec_pipe_reset_recover_queues(struct amdgpu_device *adev,
+					     struct amdgpu_ring *guilty,
+					     struct amdgpu_fence *timedout_fence,
+					     amdgpu_gfx_kcq_init_queue_t kcq_init)
+{
+	struct amdgpu_fence collateral_reemit = {};
+	struct amdgpu_ring *r;
+	unsigned int j, base;
+	int err = 0;
+
+	if (!timedout_fence)
+		return -EINVAL;
+
+	collateral_reemit.context = (u64)-1;
+
+	base = amdgpu_gfx_mec_pipe_compute_ring_base(adev, guilty->xcc_id);
+	for (j = 0; j < adev->gfx.num_compute_rings; j++) {
+		r = &adev->gfx.compute_ring[base + j];
+		if (!amdgpu_gfx_ring_on_mec_pipe(r, guilty->me, guilty->pipe))
+			continue;
+
+		err = kcq_init(r, true);
+		if (err)
+			goto err_sched;
+		err = amdgpu_mes_map_legacy_queue(adev, r, 0);
+		if (err)
+			goto err_sched;
+	}
+
+	amdgpu_gfx_mec_pipe_restart_schedulers(adev, guilty->me, guilty->pipe,
+					       guilty->xcc_id);
+
+	for (j = 0; j < adev->gfx.num_compute_rings; j++) {
+		r = &adev->gfx.compute_ring[base + j];
+		if (!amdgpu_gfx_ring_on_mec_pipe(r, guilty->me, guilty->pipe))
+			continue;
+
+		err = amdgpu_ring_reset_helper_end(
+			r, r == guilty ? timedout_fence : &collateral_reemit);
+		if (err) {
+			dev_err(adev->dev,
+				"ring %s failed to recover after MEC pipe reset (%d)\n",
+				r->name, err);
+			return err;
+		}
+	}
+
+	return 0;
+
+err_sched:
+	amdgpu_gfx_mec_pipe_restart_schedulers(adev, guilty->me, guilty->pipe,
+					       guilty->xcc_id);
+	return err;
+}
+
 bool amdgpu_gfx_is_mec_queue_enabled(struct amdgpu_device *adev,
 				     int xcc_id, int mec, int pipe, int queue)
 {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index 77050f9884f2..38b317b91bbe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -603,6 +603,17 @@ int amdgpu_gfx_mec_queue_to_bit(struct amdgpu_device *adev, int mec,
 				int pipe, int queue);
 void amdgpu_queue_mask_bit_to_mec_queue(struct amdgpu_device *adev, int bit,
 					int *mec, int *pipe, int *queue);
+
+typedef int (*amdgpu_gfx_kcq_init_queue_t)(struct amdgpu_ring *ring, bool clear);
+
+void amdgpu_gfx_mec_pipe_reset_prepare(struct amdgpu_device *adev,
+				       struct amdgpu_ring *guilty);
+void amdgpu_gfx_mec_pipe_restart_schedulers(struct amdgpu_device *adev,
+					    u32 me, u32 pipe, u32 xcc_id);
+int amdgpu_gfx_mec_pipe_reset_recover_queues(
+	struct amdgpu_device *adev, struct amdgpu_ring *guilty,
+	struct amdgpu_fence *timedout_fence,
+	amdgpu_gfx_kcq_init_queue_t kcq_init);
 bool amdgpu_gfx_is_mec_queue_enabled(struct amdgpu_device *adev, int xcc_id,
 				     int mec, int pipe, int queue);
 bool amdgpu_gfx_is_high_priority_compute_queue(struct amdgpu_device *adev,
-- 
2.49.0
