On Mon, Aug 4, 2025 at 4:41 AM Jesse.Zhang <jesse.zh...@amd.com> wrote:
>
> Replace the queue remove/add approach with suspend/resume semantics
> for user queue preemption. This change:
>
> 1. Maintains queue scheduling registration while only preempting execution
>    - The previous remove_queue/add_queue approach fully deregistered queues
>    - The new suspend/resume approach keeps scheduler state while preempting
>
> 2. Introduces proper preemption helpers:
>    - amdgpu_userqueue_preempt_helper(): suspends queue execution
>      - Transitions MAPPED→PREEMPTED state on success
>      - Marks the queue as HUNG and triggers a reset on failure
>    - amdgpu_userqueue_restore_helper(): resumes queue execution
>      - Transitions PREEMPTED→MAPPED state on success
>      - Triggers a GPU reset on failure

I would move the preempt/restore patches to the start of the series.
Use preempt/restore for all of the cases where we need to preempt the
queues, and only use map/unmap for device init/fini and system
suspend/resume.

Alex

>
> Signed-off-by: Alex Deucher <alexander.deuc...@amd.com>
> Signed-off-by: Jesse Zhang <jesse.zh...@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 64 +++++++++++++++++++----
>  1 file changed, 53 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> index 0c91302162fa..3a8da1f47159 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> @@ -72,7 +72,7 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
>         bool has_gfx = false, has_compute = false, has_sdma = false;
>         struct amdgpu_usermode_queue *userq;
>         bool gpu_reset = false;
> -       int gpu_suspend, id, r = 0;
> +       int id, r = 0;
>
>         if (idr_is_empty(&uq_mgr->userq_idr))
>                 return false;
> @@ -98,7 +98,6 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
>                         dev_err(adev->dev, "userq reset disabled by debug mask\n");
>                 } else if (amdgpu_gpu_recovery) {
>                         if (has_compute && userq_compute_funcs->detect_and_reset) {
> -                               gpu_suspend = amdgpu_mes_suspend(adev);
>                                 r = userq_compute_funcs->detect_and_reset(adev, AMDGPU_RING_TYPE_COMPUTE);
>                                 if (r) {
>                                         gpu_reset = true;
> @@ -127,9 +126,6 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
>         if (gpu_reset)
>                 amdgpu_userq_gpu_reset(adev);
>
> -       if ((!gpu_suspend) && has_compute)
> -               amdgpu_mes_resume(adev);
> -
>         return r;
>  }
>
> @@ -143,7 +139,8 @@ amdgpu_userq_unmap_helper(struct amdgpu_userq_mgr *uq_mgr,
>         bool gpu_reset = false;
>         int r = 0;
>
> -       if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
> +       if ((queue->state == AMDGPU_USERQ_STATE_MAPPED) ||
> +           (queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) {
>                 r = userq_funcs->unmap(uq_mgr, queue);
>                 if (r) {
>                         queue->state = AMDGPU_USERQ_STATE_HUNG;
> @@ -185,6 +182,54 @@ amdgpu_userq_map_helper(struct amdgpu_userq_mgr *uq_mgr,
>         return r;
>  }
>
> +static int
> +amdgpu_userqueue_preempt_helper(struct amdgpu_userq_mgr *uq_mgr,
> +                               struct amdgpu_usermode_queue *queue)
> +{
> +       struct amdgpu_device *adev = uq_mgr->adev;
> +       const struct amdgpu_userq_funcs *userq_funcs =
> +               adev->userq_funcs[queue->queue_type];
> +       int r = 0;
> +
> +       if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
> +               r = userq_funcs->preempt(uq_mgr, queue);
> +               if (r) {
> +                       amdgpu_userq_detect_and_reset_queues(uq_mgr);
> +                       queue->state = AMDGPU_USERQ_STATE_HUNG;
> +               } else {
> +                       queue->state = AMDGPU_USERQ_STATE_PREEMPTED;
> +               }
> +       }
> +
> +       return r;
> +}
> +
> +static int
> +amdgpu_userqueue_restore_helper(struct amdgpu_userq_mgr *uq_mgr,
> +                               struct amdgpu_usermode_queue *queue)
> +{
> +       struct amdgpu_device *adev = uq_mgr->adev;
> +       const struct amdgpu_userq_funcs *userq_funcs =
> +               adev->userq_funcs[queue->queue_type];
> +       bool gpu_reset = false;
> +       int r = 0;
> +
> +       if (queue->state == AMDGPU_USERQ_STATE_PREEMPTED) {
> +               r = userq_funcs->restore(uq_mgr, queue);
> +               if (r) {
> +                       queue->state = AMDGPU_USERQ_STATE_HUNG;
> +                       gpu_reset = true;
> +               } else {
> +                       queue->state = AMDGPU_USERQ_STATE_MAPPED;
> +               }
> +       }
> +
> +       if (gpu_reset)
> +               amdgpu_userq_gpu_reset(adev);
> +
> +       return r;
> +}
> +
>  static void
>  amdgpu_userq_wait_for_last_fence(struct amdgpu_userq_mgr *uq_mgr,
>                                  struct amdgpu_usermode_queue *queue)
> @@ -639,7 +684,7 @@ amdgpu_userq_restore_all(struct amdgpu_userq_mgr *uq_mgr)
>
>         /* Resume all the queues for this process */
>         idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
> -               r = amdgpu_userq_map_helper(uq_mgr, queue);
> +               r = amdgpu_userqueue_restore_helper(uq_mgr, queue);
>                 if (r)
>                         ret = r;
>         }
> @@ -794,10 +839,9 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
>         int queue_id;
>         int ret = 0, r;
>
> -       amdgpu_userq_detect_and_reset_queues(uq_mgr);
>         /* Try to unmap all the queues in this process ctx */
>         idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
> -               r = amdgpu_userq_unmap_helper(uq_mgr, queue);
> +               r = amdgpu_userqueue_preempt_helper(uq_mgr, queue);
>                 if (r)
>                         ret = r;
>         }
> @@ -900,7 +944,6 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)
>         uint32_t queue_id;
>
>         cancel_delayed_work_sync(&userq_mgr->resume_work);
> -
>         mutex_lock(&adev->userq_mutex);
>         mutex_lock(&userq_mgr->userq_mutex);
>         amdgpu_userq_detect_and_reset_queues(userq_mgr);
> @@ -909,7 +952,6 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)
>                 amdgpu_userq_unmap_helper(userq_mgr, queue);
>                 amdgpu_userq_cleanup(userq_mgr, queue, queue_id);
>         }
> -
>         list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
>                 if (uqm == userq_mgr) {
>                         list_del(&uqm->list);
> --
> 2.49.0
>
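
To spell out the state handling the two new helpers add, as read from the
hunks above:

        amdgpu_userqueue_preempt_helper():
                MAPPED    -> PREEMPTED on success
                MAPPED    -> HUNG on failure, after running
                             amdgpu_userq_detect_and_reset_queues()

        amdgpu_userqueue_restore_helper():
                PREEMPTED -> MAPPED on success
                PREEMPTED -> HUNG on failure, followed by
                             amdgpu_userq_gpu_reset()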
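
For illustration, a rough sketch (not part of the patch) of the call-site
split suggested above. The eviction loop mirrors the amdgpu_userq_evict_all()
hunk in the patch; amdgpu_userq_suspend_queues() is a hypothetical stand-in
for the system suspend path, not an existing function:

/* Sketch only, not part of the patch. */
static int amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
{
        struct amdgpu_usermode_queue *queue;
        int queue_id;
        int ret = 0, r;

        /* Runtime eviction: preempt execution, keep scheduler registration. */
        idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
                r = amdgpu_userqueue_preempt_helper(uq_mgr, queue);
                if (r)
                        ret = r;
        }

        return ret;
}

/* Hypothetical system suspend path: a full unmap is still the right call. */
static int amdgpu_userq_suspend_queues(struct amdgpu_userq_mgr *uq_mgr)
{
        struct amdgpu_usermode_queue *queue;
        int queue_id;
        int ret = 0, r;

        idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
                r = amdgpu_userq_unmap_helper(uq_mgr, queue);
                if (r)
                        ret = r;
        }

        return ret;
}

The idea being that only device init/fini and system suspend/resume need to
drop the scheduler registration; every other preemption point can leave the
queues registered and just stop their execution.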