Introduce helpers to prepare for and recover from a MEC pipe reset. The pre-reset handler stops the KFD scheduler if the KFD is initialised, preventing new submissions while the pipe is being reset. The post-reset handler iterates over all compute rings sharing the same MEC pipe (on the affected XCC), marks any non-guilty ring's scheduler as faulted via drm_sched_fault(), and then restarts the KFD scheduler if the KFD is initialised.
v2: drop the stop drm scheduler, have a worker thread which schedules a call to drm_sched_fault() for all of the affected queues (Alex) Suggested-by: Alex Deucher <[email protected]> Signed-off-by: Jesse Zhang <[email protected]> --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 45 +++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 5 +++ 2 files changed, 50 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 515cc4a2aeb4..a9fd639e4cd6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -69,6 +69,51 @@ void amdgpu_queue_mask_bit_to_mec_queue(struct amdgpu_device *adev, int bit, } +static bool amdgpu_gfx_ring_on_mec_pipe(struct amdgpu_ring *ring, u32 me, u32 pipe) +{ + if (!ring || !ring->funcs || ring->funcs->type != AMDGPU_RING_TYPE_COMPUTE) + return false; + + return ring->me == me && ring->pipe == pipe; +} + +static unsigned int amdgpu_gfx_mec_pipe_compute_ring_base(struct amdgpu_device *adev, + u32 xcc_id) +{ + int num_xcc = adev->gfx.xcc_mask ? 
NUM_XCC(adev->gfx.xcc_mask) : 1; + + if (num_xcc <= 1) + return 0; + return xcc_id * adev->gfx.num_compute_rings; +} + +void amdgpu_gfx_mec_pre_pipe_reset(struct amdgpu_device *adev, + struct amdgpu_ring *guilty) +{ + if (adev->kfd.init_complete) + amdgpu_amdkfd_stop_sched(adev, guilty->xcc_id); +} + +void amdgpu_gfx_mec_post_pipe_reset(struct amdgpu_device *adev, struct amdgpu_ring *guilty) +{ + struct amdgpu_ring *ring; + unsigned int j, base; + + base = amdgpu_gfx_mec_pipe_compute_ring_base(adev, guilty->xcc_id); + for (j = 0; j < adev->gfx.num_compute_rings; j++) { + ring = &adev->gfx.compute_ring[base + j]; + if (!amdgpu_gfx_ring_on_mec_pipe(ring, guilty->me, guilty->pipe)) + continue; + + if (ring != guilty) + drm_sched_fault(&ring->sched); + } + + if (adev->kfd.init_complete) + amdgpu_amdkfd_start_sched(adev, guilty->xcc_id); +} + bool amdgpu_gfx_is_mec_queue_enabled(struct amdgpu_device *adev, int xcc_id, int mec, int pipe, int queue) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h index 77050f9884f2..1deb82836f02 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h @@ -603,6 +603,11 @@ int amdgpu_gfx_mec_queue_to_bit(struct amdgpu_device *adev, int mec, int pipe, int queue); void amdgpu_queue_mask_bit_to_mec_queue(struct amdgpu_device *adev, int bit, int *mec, int *pipe, int *queue); + +void amdgpu_gfx_mec_pre_pipe_reset(struct amdgpu_device *adev, + struct amdgpu_ring *guilty); +void amdgpu_gfx_mec_post_pipe_reset(struct amdgpu_device *adev, + struct amdgpu_ring *guilty); bool amdgpu_gfx_is_mec_queue_enabled(struct amdgpu_device *adev, int xcc_id, int mec, int pipe, int queue); bool amdgpu_gfx_is_high_priority_compute_queue(struct amdgpu_device *adev, -- 2.49.0
