Replace the queue remove/add approach with suspend/resume semantics for user queue preemption. This change:
1. Maintains queue scheduling registration while only preempting execution
   - The previously used remove_queue/add_queue path fully deregistered queues
   - The new suspend/resume approach keeps scheduler state while preempting

2. Introduces proper preemption helpers:
   - amdgpu_userqueue_preempt_helper(): Suspends queue execution
     - Transitions MAPPED→PREEMPTED state on success
     - Marks the queue HUNG and triggers a reset on failure
   - amdgpu_userqueue_restore_helper(): Resumes queue execution
     - Transitions PREEMPTED→MAPPED state on success
     - Triggers a GPU reset on failure

Signed-off-by: Alex Deucher <alexander.deuc...@amd.com>
Signed-off-by: Jesse Zhang <jesse.zh...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 64 +++++++++++++++++++----
 1 file changed, 53 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index 0c91302162fa..3a8da1f47159 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -72,7 +72,7 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
 	bool has_gfx = false, has_compute = false, has_sdma = false;
 	struct amdgpu_usermode_queue *userq;
 	bool gpu_reset = false;
-	int gpu_suspend, id, r = 0;
+	int id, r = 0;
 
 	if (idr_is_empty(&uq_mgr->userq_idr))
 		return false;
@@ -98,7 +98,6 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
 			dev_err(adev->dev, "userq reset disabled by debug mask\n");
 	} else if (amdgpu_gpu_recovery) {
 		if (has_compute && userq_compute_funcs->detect_and_reset) {
-			gpu_suspend = amdgpu_mes_suspend(adev);
 			r = userq_compute_funcs->detect_and_reset(adev, AMDGPU_RING_TYPE_COMPUTE);
 			if (r) {
 				gpu_reset = true;
@@ -127,9 +126,6 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
 	if (gpu_reset)
 		amdgpu_userq_gpu_reset(adev);
 
-	if ((!gpu_suspend) && has_compute)
-		amdgpu_mes_resume(adev);
-
 	return r;
 }
 
@@ -143,7 +139,8 @@ amdgpu_userq_unmap_helper(struct amdgpu_userq_mgr *uq_mgr,
 	bool gpu_reset = false;
 	int r = 0;
 
-	if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
+	if ((queue->state == AMDGPU_USERQ_STATE_MAPPED) ||
+	    (queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) {
 		r = userq_funcs->unmap(uq_mgr, queue);
 		if (r) {
 			queue->state = AMDGPU_USERQ_STATE_HUNG;
@@ -185,6 +182,54 @@ amdgpu_userq_map_helper(struct amdgpu_userq_mgr *uq_mgr,
 	return r;
 }
 
+static int
+amdgpu_userqueue_preempt_helper(struct amdgpu_userq_mgr *uq_mgr,
+				struct amdgpu_usermode_queue *queue)
+{
+	struct amdgpu_device *adev = uq_mgr->adev;
+	const struct amdgpu_userq_funcs *userq_funcs =
+		adev->userq_funcs[queue->queue_type];
+	int r = 0;
+
+	if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
+		r = userq_funcs->preempt(uq_mgr, queue);
+		if (r) {
+			amdgpu_userq_detect_and_reset_queues(uq_mgr);
+			queue->state = AMDGPU_USERQ_STATE_HUNG;
+		} else {
+			queue->state = AMDGPU_USERQ_STATE_PREEMPTED;
+		}
+	}
+
+	return r;
+}
+
+static int
+amdgpu_userqueue_restore_helper(struct amdgpu_userq_mgr *uq_mgr,
+				struct amdgpu_usermode_queue *queue)
+{
+	struct amdgpu_device *adev = uq_mgr->adev;
+	const struct amdgpu_userq_funcs *userq_funcs =
+		adev->userq_funcs[queue->queue_type];
+	bool gpu_reset = false;
+	int r = 0;
+
+	if (queue->state == AMDGPU_USERQ_STATE_PREEMPTED) {
+		r = userq_funcs->restore(uq_mgr, queue);
+		if (r) {
+			queue->state = AMDGPU_USERQ_STATE_HUNG;
+			gpu_reset = true;
+		} else {
+			queue->state = AMDGPU_USERQ_STATE_MAPPED;
+		}
+	}
+
+	if (gpu_reset)
+		amdgpu_userq_gpu_reset(adev);
+
+	return r;
+}
+
 static void
 amdgpu_userq_wait_for_last_fence(struct amdgpu_userq_mgr *uq_mgr,
 				 struct amdgpu_usermode_queue *queue)
@@ -639,7 +684,7 @@ amdgpu_userq_restore_all(struct amdgpu_userq_mgr *uq_mgr)
 
 	/* Resume all the queues for this process */
 	idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
-		r = amdgpu_userq_map_helper(uq_mgr, queue);
+		r = amdgpu_userqueue_restore_helper(uq_mgr, queue);
 		if (r)
 			ret = r;
 	}
@@ -794,10 +839,9 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
 	int queue_id;
 	int ret = 0, r;
 
-	amdgpu_userq_detect_and_reset_queues(uq_mgr);
 	/* Try to unmap all the queues in this process ctx */
 	idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
-		r = amdgpu_userq_unmap_helper(uq_mgr, queue);
+		r = amdgpu_userqueue_preempt_helper(uq_mgr, queue);
 		if (r)
 			ret = r;
 	}
@@ -900,7 +944,6 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)
 	uint32_t queue_id;
 
 	cancel_delayed_work_sync(&userq_mgr->resume_work);
-
 	mutex_lock(&adev->userq_mutex);
 	mutex_lock(&userq_mgr->userq_mutex);
 	amdgpu_userq_detect_and_reset_queues(userq_mgr);
@@ -909,7 +952,6 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)
 		amdgpu_userq_unmap_helper(userq_mgr, queue);
 		amdgpu_userq_cleanup(userq_mgr, queue, queue_id);
 	}
-
 	list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
 		if (uqm == userq_mgr) {
 			list_del(&uqm->list);
-- 
2.49.0
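
For readers unfamiliar with the userq state machine, below is a minimal
standalone C model of the transitions the two new helpers implement. This
is an illustrative sketch only: the state names mirror the driver's
AMDGPU_USERQ_STATE_* values, but the stub backend ops, preempt_helper(),
restore_helper(), and main() are hypothetical stand-ins for
userq_funcs->preempt()/restore() and the driver helpers, not driver code.

/* Minimal userspace model of the per-queue state machine above. */
#include <stdio.h>

enum userq_state { MAPPED, UNMAPPED, PREEMPTED, HUNG };

struct queue {
	enum userq_state state;
};

/* Stubs standing in for the backend preempt/restore ops (0 == success). */
static int backend_preempt(struct queue *q) { (void)q; return 0; }
static int backend_restore(struct queue *q) { (void)q; return 0; }

/* Models amdgpu_userqueue_preempt_helper(): only a MAPPED queue is
 * preempted; success moves it to PREEMPTED, failure marks it HUNG
 * (the driver additionally runs queue detection/reset on failure). */
static int preempt_helper(struct queue *q)
{
	int r = 0;

	if (q->state == MAPPED) {
		r = backend_preempt(q);
		q->state = r ? HUNG : PREEMPTED;
	}
	return r;
}

/* Models amdgpu_userqueue_restore_helper(): only a PREEMPTED queue is
 * restored; success moves it back to MAPPED, failure marks it HUNG
 * (the driver additionally triggers a GPU reset on failure). */
static int restore_helper(struct queue *q)
{
	int r = 0;

	if (q->state == PREEMPTED) {
		r = backend_restore(q);
		q->state = r ? HUNG : MAPPED;
	}
	return r;
}

int main(void)
{
	struct queue q = { .state = MAPPED };

	preempt_helper(&q);	/* MAPPED -> PREEMPTED */
	restore_helper(&q);	/* PREEMPTED -> MAPPED */
	printf("final state: %d (0 == MAPPED)\n", q.state);
	return 0;
}

Note how amdgpu_userq_evict_all()/amdgpu_userq_restore_all() can now
round-trip a process's queues through PREEMPTED and back without ever
deregistering them from the scheduler, which is the point of the
suspend/resume semantics.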