On Mon, Aug 18, 2025 at 10:58 PM Zhang, Jesse(Jie) <jesse.zh...@amd.com> wrote:
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
> Ping.. on this series.

See my comments on patch 2 from last week. Once that is fixed, I think this series is good to go.

Alex

>
> -----Original Message-----
> From: Jesse.Zhang <jesse.zh...@amd.com>
> Sent: Monday, August 11, 2025 5:57 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander <alexander.deuc...@amd.com>; Koenig, Christian
> <christian.koe...@amd.com>; Deucher, Alexander <alexander.deuc...@amd.com>;
> Zhang, Jesse(Jie) <jesse.zh...@amd.com>
> Subject: [v8 11/11] drm/amdgpu: Implement user queue reset functionality
>
> From: Alex Deucher <alexander.deuc...@amd.com>
>
> This patch adds robust reset handling for user queues (userq) to improve
> recovery from queue failures. The key components include:
>
> 1. Queue detection and reset logic:
>    - amdgpu_userq_detect_and_reset_queues() identifies failed queues
>    - Per-IP detect_and_reset callbacks for targeted recovery
>    - Falls back to full GPU reset when needed
>
> 2. Reset infrastructure:
>    - Adds userq_reset_work workqueue for async reset handling
>    - Implements pre/post reset handlers for queue state management
>    - Integrates with existing GPU reset framework
>
> 3. Error handling improvements:
>    - Enhanced state tracking with HUNG state
>    - Automatic reset triggering on critical failures
>    - VRAM loss handling during recovery
>
> 4. Integration points:
>    - Added to device init/reset paths
>    - Called during queue destroy, suspend, and isolation events
>    - Handles both individual queue and full GPU resets
>
> The reset functionality works with both compute and graphics queues,
> providing better resilience against queue failures while minimizing
> disruption to unaffected queues.
> > Signed-off-by: Alex Deucher <alexander.deuc...@amd.com> > Signed-off-by: Jesse Zhang <jesse.zh...@amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 8 + > drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 199 +++++++++++++++++++-- > drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h | 5 + > 4 files changed, 202 insertions(+), 11 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index ef3af170dda4..9db05cdc7304 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -1302,6 +1302,7 @@ struct amdgpu_device { > struct list_head userq_mgr_list; > struct mutex userq_mutex; > bool userq_halt_for_enforce_isolation; > + struct work_struct userq_reset_work; > }; > > static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev, > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 3757634613c3..1dc88b0055dd 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -4475,6 +4475,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, > } > > INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); > + INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work); > > adev->gfx.gfx_off_req_count = 1; > adev->gfx.gfx_off_residency = 0; > @@ -5880,6 +5881,10 @@ int amdgpu_device_reinit_after_reset(struct > amdgpu_reset_context *reset_context) > if (r) > goto out; > > + r = amdgpu_userq_post_reset(tmp_adev, > vram_lost); > + if (r) > + goto out; > + > drm_client_dev_resume(adev_to_drm(tmp_adev), > false); > > /* > @@ -6102,6 +6107,7 @@ static inline void > amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) > if (!amdgpu_sriov_vf(adev)) > cancel_work(&adev->reset_work); > #endif > + cancel_work(&adev->userq_reset_work); > > if (adev->kfd.dev) > cancel_work(&adev->kfd.reset_work); > @@ -6232,6 
+6238,8 @@ static void amdgpu_device_halt_activities(struct > amdgpu_device *adev, > amdgpu_device_ip_need_full_reset(tmp_adev)) > amdgpu_ras_suspend(tmp_adev); > > + amdgpu_userq_pre_reset(tmp_adev); > + > for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { > struct amdgpu_ring *ring = tmp_adev->rings[i]; > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c > index decedf8057ac..028de887df26 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c > @@ -25,8 +25,10 @@ > #include <drm/drm_auth.h> > #include <drm/drm_exec.h> > #include <linux/pm_runtime.h> > +#include <drm/drm_drv.h> > > #include "amdgpu.h" > +#include "amdgpu_reset.h" > #include "amdgpu_vm.h" > #include "amdgpu_userq.h" > #include "amdgpu_userq_fence.h" > @@ -44,6 +46,88 @@ u32 amdgpu_userq_get_supported_ip_mask(struct > amdgpu_device *adev) > return userq_ip_mask; > } > > +static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev) { > + > + if (amdgpu_device_should_recover_gpu(adev)) { > + amdgpu_reset_domain_schedule(adev->reset_domain, > + &adev->userq_reset_work); > + /* Wait for the reset job to complete */ > + flush_work(&adev->userq_reset_work); > + } > +} > + > +static int > +amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr) { > + struct amdgpu_device *adev = uq_mgr->adev; > + const struct amdgpu_userq_funcs *userq_gfx_funcs = > + adev->userq_funcs[AMDGPU_RING_TYPE_GFX]; > + const struct amdgpu_userq_funcs *userq_compute_funcs = > + adev->userq_funcs[AMDGPU_RING_TYPE_COMPUTE]; > + const struct amdgpu_userq_funcs *userq_sdma_funcs = > + adev->userq_funcs[AMDGPU_RING_TYPE_SDMA]; > + > + bool has_gfx = false, has_compute = false, has_sdma = false; > + struct amdgpu_usermode_queue *userq; > + bool gpu_reset = false; > + int id, r = 0; > + > + if (idr_is_empty(&uq_mgr->userq_idr)) > + return false; > + > + /* Detect which types of queues are present */ > + 
idr_for_each_entry(&uq_mgr->userq_idr, userq, id) { > + switch (userq->queue_type) { > + case AMDGPU_RING_TYPE_GFX: > + has_gfx = true; > + break; > + case AMDGPU_RING_TYPE_COMPUTE: > + has_compute = true; > + break; > + case AMDGPU_RING_TYPE_SDMA: > + has_sdma = true; > + break; > + default: > + break; > + } > + } > + > + if (unlikely(adev->debug_disable_gpu_ring_reset)) { > + dev_err(adev->dev, "userq reset disabled by debug mask\n"); > + } else if (amdgpu_gpu_recovery) { > + if (has_compute && userq_compute_funcs->detect_and_reset) { > + r = userq_compute_funcs->detect_and_reset(adev, > AMDGPU_RING_TYPE_COMPUTE); > + if (r) { > + gpu_reset = true; > + goto gpu_reset; > + } > + } > + > + if (has_gfx && userq_gfx_funcs->detect_and_reset) { > + r = userq_gfx_funcs->detect_and_reset(adev, > AMDGPU_RING_TYPE_GFX); > + if (r) { > + gpu_reset = true; > + goto gpu_reset; > + } > + } > + > + if (has_sdma && userq_sdma_funcs && > userq_sdma_funcs->detect_and_reset) { > + r = userq_sdma_funcs->detect_and_reset(adev, > AMDGPU_RING_TYPE_SDMA); > + if (r) { > + gpu_reset = true; > + goto gpu_reset; > + } > + } > + } > + > +gpu_reset: > + if (gpu_reset) > + amdgpu_userq_gpu_reset(adev); > + > + return r; > +} > + > static int > amdgpu_userqueue_preempt_helper(struct amdgpu_userq_mgr *uq_mgr, > struct amdgpu_usermode_queue *queue) @@ -56,6 > +140,7 @@ amdgpu_userqueue_preempt_helper(struct amdgpu_userq_mgr *uq_mgr, > if (queue->state == AMDGPU_USERQ_STATE_MAPPED) { > r = userq_funcs->preempt(uq_mgr, queue); > if (r) { > + amdgpu_userq_detect_and_reset_queues(uq_mgr); > queue->state = AMDGPU_USERQ_STATE_HUNG; > } else { > queue->state = AMDGPU_USERQ_STATE_PREEMPTED; @@ > -72,17 +157,21 @@ amdgpu_userqueue_restore_helper(struct amdgpu_userq_mgr > *uq_mgr, > struct amdgpu_device *adev = uq_mgr->adev; > const struct amdgpu_userq_funcs *userq_funcs = > adev->userq_funcs[queue->queue_type]; > + bool gpu_reset = false; > int r = 0; > > if (queue->state == AMDGPU_USERQ_STATE_PREEMPTED) 
{ > r = userq_funcs->restore(uq_mgr, queue); > if (r) { > queue->state = AMDGPU_USERQ_STATE_HUNG; > + gpu_reset = true; > } else { > queue->state = AMDGPU_USERQ_STATE_MAPPED; > } > } > > + if (gpu_reset) > + amdgpu_userq_gpu_reset(adev); > return r; > } > > @@ -93,16 +182,23 @@ amdgpu_userq_unmap_helper(struct amdgpu_userq_mgr > *uq_mgr, > struct amdgpu_device *adev = uq_mgr->adev; > const struct amdgpu_userq_funcs *userq_funcs = > adev->userq_funcs[queue->queue_type]; > + bool gpu_reset = false; > int r = 0; > > if ((queue->state == AMDGPU_USERQ_STATE_MAPPED) || > (queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) { > r = userq_funcs->unmap(uq_mgr, queue); > - if (r) > + if (r) { > queue->state = AMDGPU_USERQ_STATE_HUNG; > - else > + gpu_reset = true; > + } else { > queue->state = AMDGPU_USERQ_STATE_UNMAPPED; > + } > } > + > + if (gpu_reset) > + amdgpu_userq_gpu_reset(adev); > + > return r; > } > > @@ -113,16 +209,22 @@ amdgpu_userq_map_helper(struct amdgpu_userq_mgr *uq_mgr, > struct amdgpu_device *adev = uq_mgr->adev; > const struct amdgpu_userq_funcs *userq_funcs = > adev->userq_funcs[queue->queue_type]; > + bool gpu_reset = false; > int r = 0; > > if (queue->state == AMDGPU_USERQ_STATE_UNMAPPED) { > r = userq_funcs->map(uq_mgr, queue); > if (r) { > queue->state = AMDGPU_USERQ_STATE_HUNG; > + gpu_reset = true; > } else { > queue->state = AMDGPU_USERQ_STATE_MAPPED; > } > } > + > + if (gpu_reset) > + amdgpu_userq_gpu_reset(adev); > + > return r; > } > > @@ -361,6 +463,7 @@ amdgpu_userq_destroy(struct drm_file *filp, int queue_id) > amdgpu_bo_unreserve(queue->db_obj.obj); > } > amdgpu_bo_unref(&queue->db_obj.obj); > + amdgpu_userq_detect_and_reset_queues(uq_mgr); > r = amdgpu_userq_unmap_helper(uq_mgr, queue); > amdgpu_userq_cleanup(uq_mgr, queue, queue_id); > mutex_unlock(&uq_mgr->userq_mutex); > @@ -746,6 +849,23 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr) > return ret; > } > > +void amdgpu_userq_reset_work(struct work_struct *work) { > + struct 
amdgpu_device *adev = container_of(work, struct amdgpu_device, > + userq_reset_work); > + struct amdgpu_reset_context reset_context; > + > + memset(&reset_context, 0, sizeof(reset_context)); > + > + reset_context.method = AMD_RESET_METHOD_NONE; > + reset_context.reset_req_dev = adev; > + reset_context.src = AMDGPU_RESET_SRC_USERQ; > + set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); > + /*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/ > + > + amdgpu_device_gpu_recover(adev, NULL, &reset_context); } > + > static int > amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr) { @@ -772,22 > +892,19 @@ void amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr, > struct amdgpu_eviction_fence *ev_fence) { > - int ret; > struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr); > struct amdgpu_eviction_fence_mgr *evf_mgr = &fpriv->evf_mgr; > + struct amdgpu_device *adev = uq_mgr->adev; > + int ret; > > /* Wait for any pending userqueue fence work to finish */ > ret = amdgpu_userq_wait_for_signal(uq_mgr); > - if (ret) { > - drm_file_err(uq_mgr->file, "Not evicting userqueue, timeout > waiting for work\n"); > - return; > - } > + if (ret) > + dev_err(adev->dev, "Not evicting userqueue, timeout waiting > for > +work\n"); > > ret = amdgpu_userq_evict_all(uq_mgr); > - if (ret) { > - drm_file_err(uq_mgr->file, "Failed to evict userqueue\n"); > - return; > - } > + if (ret) > + dev_err(adev->dev, "Failed to evict userqueue\n"); > > /* Signal current eviction fence */ > amdgpu_eviction_fence_signal(evf_mgr, ev_fence); @@ -828,6 +945,7 @@ > void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr) > > mutex_lock(&adev->userq_mutex); > mutex_lock(&userq_mgr->userq_mutex); > + amdgpu_userq_detect_and_reset_queues(userq_mgr); > idr_for_each_entry(&userq_mgr->userq_idr, queue, queue_id) { > amdgpu_userq_wait_for_last_fence(userq_mgr, queue); > amdgpu_userq_unmap_helper(userq_mgr, queue); @@ -861,6 +979,7 > @@ int amdgpu_userq_suspend(struct amdgpu_device *adev) > 
list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) { > cancel_delayed_work_sync(&uqm->resume_work); > mutex_lock(&uqm->userq_mutex); > + amdgpu_userq_detect_and_reset_queues(uqm); > idr_for_each_entry(&uqm->userq_idr, queue, queue_id) { > r = amdgpu_userq_unmap_helper(uqm, queue); > if (r) > @@ -917,6 +1036,7 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct > amdgpu_device *adev, > list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) { > cancel_delayed_work_sync(&uqm->resume_work); > mutex_lock(&uqm->userq_mutex); > + amdgpu_userq_detect_and_reset_queues(uqm); > idr_for_each_entry(&uqm->userq_idr, queue, queue_id) { > if (((queue->queue_type == AMDGPU_HW_IP_GFX) || > (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) && > @@ -965,3 +1085,60 @@ int > amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev, > mutex_unlock(&adev->userq_mutex); > return ret; > } > + > +void amdgpu_userq_pre_reset(struct amdgpu_device *adev) { > + const struct amdgpu_userq_funcs *userq_funcs; > + struct amdgpu_usermode_queue *queue; > + struct amdgpu_userq_mgr *uqm, *tmp; > + int queue_id; > + > + list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) { > + cancel_delayed_work_sync(&uqm->resume_work); > + idr_for_each_entry(&uqm->userq_idr, queue, queue_id) { > + if (queue->state == AMDGPU_USERQ_STATE_MAPPED) { > + amdgpu_userq_wait_for_last_fence(uqm, queue); > + userq_funcs = > adev->userq_funcs[queue->queue_type]; > + userq_funcs->unmap(uqm, queue); > + /* just mark all queues as hung at this point. 
> + * if unmap succeeds, we could map again > + * in amdgpu_userq_post_reset() if vram is > not lost > + */ > + queue->state = AMDGPU_USERQ_STATE_HUNG; > + > amdgpu_userq_fence_driver_force_completion(queue); > + } > + } > + } > +} > + > +int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost) > +{ > + /* if any queue state is AMDGPU_USERQ_STATE_UNMAPPED > + * at this point, we should be able to map it again > + * and continue if vram is not lost. > + */ > + struct amdgpu_userq_mgr *uqm; > + struct amdgpu_usermode_queue *queue; > + const struct amdgpu_userq_funcs *userq_funcs; > + int queue_id, r = 0; > + > + list_for_each_entry(uqm, &adev->userq_mgr_list, list) { > + idr_for_each_entry(&uqm->userq_idr, queue, queue_id) { > + if (queue->state == AMDGPU_USERQ_STATE_HUNG && > !vram_lost) { > + userq_funcs = > adev->userq_funcs[queue->queue_type]; > + > + r = userq_funcs->map(uqm, queue); // Re-map > queue > + if (r) { > + dev_err(adev->dev, "Failed to remap > queue %d\n", queue_id); > + continue; > + } > + queue->state = AMDGPU_USERQ_STATE_MAPPED; > + } > + } > + > + /* Restart resume work after reset */ > + //queue_delayed_work(system_wq, &uqm->resume_work, > msecs_to_jiffies(100)); > + } > + > + return r; > +} > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h > index 9fa0d1a88d71..e68bb144b26f 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h > @@ -138,4 +138,9 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct > amdgpu_device *adev, int > amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev, > u32 idx); > > +void amdgpu_userq_reset_work(struct work_struct *work); > + > +void amdgpu_userq_pre_reset(struct amdgpu_device *adev); int > +amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost); > + > #endif > -- > 2.49.0 >