-----Original Message-----
From: amd-gfx <amd-gfx-boun...@lists.freedesktop.org> On Behalf Of Alex Deucher
Sent: Wednesday, September 3, 2025 5:39 AM
To: Zhang, Jesse(Jie) <jesse.zh...@amd.com>
Cc: amd-gfx@lists.freedesktop.org; Deucher, Alexander <alexander.deuc...@amd.com>; Koenig, Christian <christian.koe...@amd.com>
Subject: Re: [v13 11/11] drm/amdgpu: Implement user queue reset functionality
On Tue, Sep 2, 2025 at 4:0 AM Jesse.Zhang <jesse.zh...@amd.com> wrote:
>
> From: Alex Deucher <alexander.deuc...@amd.com>
>
> This patch adds robust reset handling for user queues (userq) to
> improve recovery from queue failures. The key components include:
>
> 1. Queue detection and reset logic:
>    - amdgpu_userq_detect_and_reset_queues() identifies failed queues
>    - Per-IP detect_and_reset callbacks for targeted recovery
>    - Falls back to full GPU reset when needed
>
> 2. Reset infrastructure:
>    - Adds userq_reset_work workqueue for async reset handling
>    - Implements pre/post reset handlers for queue state management
>    - Integrates with existing GPU reset framework
>
> 3. Error handling improvements:
>    - Enhanced state tracking with HUNG state
>    - Automatic reset triggering on critical failures
>    - VRAM loss handling during recovery
>
> 4. Integration points:
>    - Added to device init/reset paths
>    - Called during queue destroy, suspend, and isolation events
>    - Handles both individual queue and full GPU resets
>
> The reset functionality works with both compute and graphics queues,
> providing better resilience against queue failures while minimizing
> disruption to unaffected queues.
>
> v2: add detection and reset calls when preemption/unmap fails.
>     add a per device userq counter for each user queue type. (Alex)
>
> Signed-off-by: Alex Deucher <alexander.deuc...@amd.com>
> Signed-off-by: Jesse Zhang <jesse.zh...@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h        |   1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |   8 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c  | 203 +++++++++++++++++++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h  |   8 +
>  4 files changed, 209 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index ef3af170dda4..9db05cdc7304 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1302,6 +1302,7 @@ struct amdgpu_device {
>          struct list_head userq_mgr_list;
>          struct mutex userq_mutex;
>          bool userq_halt_for_enforce_isolation;
> +        struct work_struct userq_reset_work;
>  };
>
>  static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 3757634613c3..1dc88b0055dd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4475,6 +4475,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>          }
>
>          INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
> +        INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);
>
>          adev->gfx.gfx_off_req_count = 1;
>          adev->gfx.gfx_off_residency = 0;
> @@ -5880,6 +5881,10 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
>                          if (r)
>                                  goto out;
>
> +                        r = amdgpu_userq_post_reset(tmp_adev, vram_lost);
> +                        if (r)
> +                                goto out;
> +
>                          drm_client_dev_resume(adev_to_drm(tmp_adev), false);
>
>                          /*
> @@ -6102,6 +6107,7 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
>          if (!amdgpu_sriov_vf(adev))
>                  cancel_work(&adev->reset_work);
>  #endif
> +        cancel_work(&adev->userq_reset_work);
>
>          if (adev->kfd.dev)
>                  cancel_work(&adev->kfd.reset_work);
> @@ -6232,6 +6238,8 @@ static void amdgpu_device_halt_activities(struct amdgpu_device *adev,
>                      amdgpu_device_ip_need_full_reset(tmp_adev))
>                          amdgpu_ras_suspend(tmp_adev);
>
> +                amdgpu_userq_pre_reset(tmp_adev);
> +
>                  for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>                          struct amdgpu_ring *ring = tmp_adev->rings[i];
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> index 54851ba8756a..87672b33102e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> @@ -25,8 +25,10 @@
>  #include <drm/drm_auth.h>
>  #include <drm/drm_exec.h>
>  #include <linux/pm_runtime.h>
> +#include <drm/drm_drv.h>
>
>  #include "amdgpu.h"
> +#include "amdgpu_reset.h"
>  #include "amdgpu_vm.h"
>  #include "amdgpu_userq.h"
>  #include "amdgpu_userq_fence.h"
> @@ -44,6 +46,64 @@ u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev)
>          return userq_ip_mask;
>  }
>
> +static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev)
> +{
> +        if (amdgpu_device_should_recover_gpu(adev)) {
> +                amdgpu_reset_domain_schedule(adev->reset_domain,
> +                                             &adev->userq_reset_work);
> +                /* Wait for the reset job to complete */
> +                flush_work(&adev->userq_reset_work);
> +        }
> +}
> +
> +static int
> +amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)

> We need to make sure we hold the adev->userq_mutex as well as the per
> process user queue mutexes when we call this function because the
> detect_and_reset() functions can mess with queues from any process.
> We may want to get rid of the per process userq mutex and just use the
> global one for all of these cases.

Thanks, Alex.

I'll update the patch and ensure all callers hold adev->userq_mutex.
Also, I'll add a check in this function that the global lock is held,
like this:

        WARN_ON(!mutex_is_locked(&adev->userq_mutex));
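Or, if we prefer a lockdep annotation that points at the offending
caller, something like this at the top of
amdgpu_userq_detect_and_reset_queues() (untested, just a sketch on top
of this patch, assuming every caller takes the global lock first):

static int
amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
{
        struct amdgpu_device *adev = uq_mgr->adev;

        /* The per-IP detect_and_reset() callbacks can touch queues
         * owned by any process, so callers must hold the global
         * adev->userq_mutex, not just uq_mgr->userq_mutex.
         */
        lockdep_assert_held(&adev->userq_mutex);
        ...

Note that WARN_ON(!mutex_is_locked()) only checks that somebody holds
the mutex, while lockdep_assert_held() checks that the current task
holds it; on the other hand it compiles away when lockdep is disabled,
so keeping the WARN_ON() is also fine if we want the check in normal
builds.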
Regards,
Jesse

> Alex

> +{
> +        struct amdgpu_device *adev = uq_mgr->adev;
> +        const struct amdgpu_userq_funcs *userq_gfx_funcs =
> +                adev->userq_funcs[AMDGPU_RING_TYPE_GFX];
> +        const struct amdgpu_userq_funcs *userq_compute_funcs =
> +                adev->userq_funcs[AMDGPU_RING_TYPE_COMPUTE];
> +        const struct amdgpu_userq_funcs *userq_sdma_funcs =
> +                adev->userq_funcs[AMDGPU_RING_TYPE_SDMA];
> +        bool gpu_reset = false;
> +        int r = 0;
> +
> +        if (unlikely(adev->debug_disable_gpu_ring_reset)) {
> +                dev_err(adev->dev, "userq reset disabled by debug mask\n");
> +        } else if (amdgpu_gpu_recovery) {
> +                if ((atomic_read(&uq_mgr->userq_compute_count) > 0) && userq_compute_funcs->detect_and_reset) {
> +                        r = userq_compute_funcs->detect_and_reset(adev, AMDGPU_RING_TYPE_COMPUTE);
> +                        if (r) {
> +                                gpu_reset = true;
> +                                goto gpu_reset;
> +                        }
> +                }
> +
> +                if ((atomic_read(&uq_mgr->userq_gfx_count) > 0) && userq_gfx_funcs->detect_and_reset) {
> +                        r = userq_gfx_funcs->detect_and_reset(adev, AMDGPU_RING_TYPE_GFX);
> +                        if (r) {
> +                                gpu_reset = true;
> +                                goto gpu_reset;
> +                        }
> +                }
> +
> +                if ((atomic_read(&uq_mgr->userq_sdma_count) > 0) && userq_sdma_funcs->detect_and_reset) {
> +                        r = userq_sdma_funcs->detect_and_reset(adev, AMDGPU_RING_TYPE_SDMA);
> +                        if (r) {
> +                                gpu_reset = true;
> +                                goto gpu_reset;
> +                        }
> +                }
> +        }
> +
> +gpu_reset:
> +        if (gpu_reset)
> +                amdgpu_userq_gpu_reset(adev);
> +
> +        return r;
> +}
> +
>  static int
>  amdgpu_userq_preempt_helper(struct amdgpu_userq_mgr *uq_mgr,
>                              struct amdgpu_usermode_queue *queue)
> @@ -51,17 +111,22 @@ amdgpu_userq_preempt_helper(struct amdgpu_userq_mgr *uq_mgr,
>          struct amdgpu_device *adev = uq_mgr->adev;
>          const struct amdgpu_userq_funcs *userq_funcs =
>                  adev->userq_funcs[queue->queue_type];
> +        bool found_hung_queue = false;
>          int r = 0;
>
>          if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
>                  r = userq_funcs->preempt(uq_mgr, queue);
>                  if (r) {
>                          queue->state = AMDGPU_USERQ_STATE_HUNG;
> +                        found_hung_queue = true;
>                  } else {
>                          queue->state = AMDGPU_USERQ_STATE_PREEMPTED;
>                  }
>          }
>
> +        if (found_hung_queue)
> +                amdgpu_userq_detect_and_reset_queues(uq_mgr);
> +
>          return r;
>  }
>
> @@ -93,16 +158,23 @@ amdgpu_userq_unmap_helper(struct amdgpu_userq_mgr *uq_mgr,
>          struct amdgpu_device *adev = uq_mgr->adev;
>          const struct amdgpu_userq_funcs *userq_funcs =
>                  adev->userq_funcs[queue->queue_type];
> +        bool found_hung_queue = false;
>          int r = 0;
>
>          if ((queue->state == AMDGPU_USERQ_STATE_MAPPED) ||
>              (queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) {
>                  r = userq_funcs->unmap(uq_mgr, queue);
> -                if (r)
> +                if (r) {
>                          queue->state = AMDGPU_USERQ_STATE_HUNG;
> -                else
> +                        found_hung_queue = true;
> +                } else {
>                          queue->state = AMDGPU_USERQ_STATE_UNMAPPED;
> +                }
>          }
> +
> +        if (found_hung_queue)
> +                amdgpu_userq_detect_and_reset_queues(uq_mgr);
> +
>          return r;
>  }
>
> @@ -113,16 +185,22 @@ amdgpu_userq_map_helper(struct amdgpu_userq_mgr *uq_mgr,
>          struct amdgpu_device *adev = uq_mgr->adev;
>          const struct amdgpu_userq_funcs *userq_funcs =
>                  adev->userq_funcs[queue->queue_type];
> +        bool gpu_reset = false;
>          int r = 0;
>
>          if (queue->state == AMDGPU_USERQ_STATE_UNMAPPED) {
>                  r = userq_funcs->map(uq_mgr, queue);
>                  if (r) {
>                          queue->state = AMDGPU_USERQ_STATE_HUNG;
> +                        gpu_reset = true;
>                  } else {
>                          queue->state = AMDGPU_USERQ_STATE_MAPPED;
>                  }
>          }
> +
> +        if (gpu_reset)
> +                amdgpu_userq_gpu_reset(adev);
> +
>          return r;
>  }
>
> @@ -361,6 +439,18 @@ amdgpu_userq_destroy(struct drm_file *filp, int queue_id)
>                  amdgpu_bo_unreserve(queue->db_obj.obj);
>          }
>          amdgpu_bo_unref(&queue->db_obj.obj);
> +        switch (queue->queue_type) {
> +        case AMDGPU_RING_TYPE_GFX:
> +                atomic_dec(&uq_mgr->userq_gfx_count);
> +                break;
> +        case AMDGPU_RING_TYPE_COMPUTE:
> +                atomic_dec(&uq_mgr->userq_compute_count);
> +                break;
> +        case AMDGPU_RING_TYPE_SDMA:
> +                atomic_dec(&uq_mgr->userq_sdma_count);
> +                break;
> +        }
> +        amdgpu_userq_detect_and_reset_queues(uq_mgr);
>          r = amdgpu_userq_unmap_helper(uq_mgr, queue);
>          amdgpu_userq_cleanup(uq_mgr, queue, queue_id);
>          mutex_unlock(&uq_mgr->userq_mutex);
> @@ -520,6 +610,19 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
>
>
>          args->out.queue_id = qid;
> +        switch (queue->queue_type) {
> +        case AMDGPU_RING_TYPE_GFX:
> +                atomic_inc(&uq_mgr->userq_gfx_count);
> +                break;
> +        case AMDGPU_RING_TYPE_COMPUTE:
> +                atomic_inc(&uq_mgr->userq_compute_count);
> +                break;
> +        case AMDGPU_RING_TYPE_SDMA:
> +                atomic_inc(&uq_mgr->userq_sdma_count);
> +                break;
> +        default:
> +                break;
> +        }
>
>  unlock:
>          mutex_unlock(&uq_mgr->userq_mutex);
> @@ -734,6 +837,7 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
>          int queue_id;
>          int ret = 0, r;
>
> +        amdgpu_userq_detect_and_reset_queues(uq_mgr);
>          /* Try to unmap all the queues in this process ctx */
>          idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
>                  r = amdgpu_userq_preempt_helper(uq_mgr, queue);
> @@ -746,6 +850,23 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
>          return ret;
>  }
>
> +void amdgpu_userq_reset_work(struct work_struct *work)
> +{
> +        struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
> +                                                  userq_reset_work);
> +        struct amdgpu_reset_context reset_context;
> +
> +        memset(&reset_context, 0, sizeof(reset_context));
> +
> +        reset_context.method = AMD_RESET_METHOD_NONE;
> +        reset_context.reset_req_dev = adev;
> +        reset_context.src = AMDGPU_RESET_SRC_USERQ;
> +        set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
> +        /*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
> +
> +        amdgpu_device_gpu_recover(adev, NULL, &reset_context);
> +}
> +
>  static int
>  amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
>  {
> @@ -772,22 +893,19 @@ void
>  amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr,
>                     struct amdgpu_eviction_fence *ev_fence)
>  {
> -        int ret;
>          struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
>          struct amdgpu_eviction_fence_mgr *evf_mgr = &fpriv->evf_mgr;
> +        struct amdgpu_device *adev = uq_mgr->adev;
> +        int ret;
>
>          /* Wait for any pending userqueue fence work to finish */
>          ret = amdgpu_userq_wait_for_signal(uq_mgr);
> -        if (ret) {
> -                drm_file_err(uq_mgr->file, "Not evicting userqueue, timeout waiting for work\n");
> -                return;
> -        }
> +        if (ret)
> +                dev_err(adev->dev, "Not evicting userqueue, timeout waiting for work\n");
>
>          ret = amdgpu_userq_evict_all(uq_mgr);
> -        if (ret) {
> -                drm_file_err(uq_mgr->file, "Failed to evict userqueue\n");
> -                return;
> -        }
> +        if (ret)
> +                dev_err(adev->dev, "Failed to evict userqueue\n");
>
>          /* Signal current eviction fence */
>          amdgpu_eviction_fence_signal(evf_mgr, ev_fence);
> @@ -811,6 +929,9 @@ int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct drm_file *f
>
>          mutex_lock(&adev->userq_mutex);
>          list_add(&userq_mgr->list, &adev->userq_mgr_list);
> +        atomic_set(&userq_mgr->userq_gfx_count, 0);
> +        atomic_set(&userq_mgr->userq_compute_count, 0);
> +        atomic_set(&userq_mgr->userq_sdma_count, 0);
>          mutex_unlock(&adev->userq_mutex);
>
>          INIT_DELAYED_WORK(&userq_mgr->resume_work, amdgpu_userq_restore_worker);
> @@ -828,6 +949,7 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)
>
>          mutex_lock(&adev->userq_mutex);
>          mutex_lock(&userq_mgr->userq_mutex);
> +        amdgpu_userq_detect_and_reset_queues(userq_mgr);
>          idr_for_each_entry(&userq_mgr->userq_idr, queue, queue_id) {
>                  amdgpu_userq_wait_for_last_fence(userq_mgr, queue);
>                  amdgpu_userq_unmap_helper(userq_mgr, queue);
> @@ -861,6 +983,7 @@ int amdgpu_userq_suspend(struct amdgpu_device *adev)
>          list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
>                  cancel_delayed_work_sync(&uqm->resume_work);
>                  mutex_lock(&uqm->userq_mutex);
> +                amdgpu_userq_detect_and_reset_queues(uqm);
>                  idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
>                          r = amdgpu_userq_unmap_helper(uqm, queue);
>                          if (r)
> @@ -917,6 +1040,7 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
>          list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
>                  cancel_delayed_work_sync(&uqm->resume_work);
>                  mutex_lock(&uqm->userq_mutex);
> +                amdgpu_userq_detect_and_reset_queues(uqm);
>                  idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
>                          if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
>                               (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) &&
> @@ -965,3 +1089,60 @@ int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
>          mutex_unlock(&adev->userq_mutex);
>          return ret;
>  }
> +
> +void amdgpu_userq_pre_reset(struct amdgpu_device *adev)
> +{
> +        const struct amdgpu_userq_funcs *userq_funcs;
> +        struct amdgpu_usermode_queue *queue;
> +        struct amdgpu_userq_mgr *uqm, *tmp;
> +        int queue_id;
> +
> +        list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
> +                cancel_delayed_work_sync(&uqm->resume_work);
> +                idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> +                        if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
> +                                amdgpu_userq_wait_for_last_fence(uqm, queue);
> +                                userq_funcs = adev->userq_funcs[queue->queue_type];
> +                                userq_funcs->unmap(uqm, queue);
> +                                /* just mark all queues as hung at this point.
> +                                 * if unmap succeeds, we could map again
> +                                 * in amdgpu_userq_post_reset() if vram is not lost
> +                                 */
> +                                queue->state = AMDGPU_USERQ_STATE_HUNG;
> +                                amdgpu_userq_fence_driver_force_completion(queue);
> +                        }
> +                }
> +        }
> +}
> +
> +int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost)
> +{
> +        /* if any queue state is AMDGPU_USERQ_STATE_UNMAPPED
> +         * at this point, we should be able to map it again
> +         * and continue if vram is not lost.
> +         */
> +        struct amdgpu_userq_mgr *uqm;
> +        struct amdgpu_usermode_queue *queue;
> +        const struct amdgpu_userq_funcs *userq_funcs;
> +        int queue_id, r = 0;
> +
> +        list_for_each_entry(uqm, &adev->userq_mgr_list, list) {
> +                idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
> +                        if (queue->state == AMDGPU_USERQ_STATE_HUNG && !vram_lost) {
> +                                userq_funcs = adev->userq_funcs[queue->queue_type];
> +
> +                                r = userq_funcs->map(uqm, queue); // Re-map queue
> +                                if (r) {
> +                                        dev_err(adev->dev, "Failed to remap queue %d\n", queue_id);
> +                                        continue;
> +                                }
> +                                queue->state = AMDGPU_USERQ_STATE_MAPPED;
> +                        }
> +                }
> +
> +                /* Restart resume work after reset */
> +                //queue_delayed_work(system_wq, &uqm->resume_work, msecs_to_jiffies(100));
> +        }
> +
> +        return r;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> index 9fa0d1a88d71..ff9aa41c4ff8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
> @@ -93,6 +93,9 @@ struct amdgpu_userq_mgr {
>          struct delayed_work resume_work;
>          struct list_head list;
>          struct drm_file *file;
> +        atomic_t userq_gfx_count;
> +        atomic_t userq_compute_count;
> +        atomic_t userq_sdma_count;
>  };
>
>  struct amdgpu_db_info {
> @@ -138,4 +141,9 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
>  int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
>                                                     u32 idx);
>
> +void amdgpu_userq_reset_work(struct work_struct *work);
> +
> +void amdgpu_userq_pre_reset(struct amdgpu_device *adev);
> +int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost);
> +
>  #endif
> --
> 2.49.0
>