This patch adds robust reset handling for user queues (userq) to improve recovery from queue failures. The key components include:
1. Queue detection and reset logic:
   - amdgpu_userq_detect_and_reset_queues() identifies failed queues
   - Per-IP detect_and_reset callbacks for targeted recovery
   - Falls back to a full GPU reset when needed

2. Reset infrastructure:
   - Adds a userq_reset_work workqueue for async reset handling
   - Implements pre/post reset handlers for queue state management
   - Integrates with the existing GPU reset framework

3. Error handling improvements:
   - Enhanced state tracking with a HUNG state
   - Automatic reset triggering on critical failures
   - VRAM loss handling during recovery

4. Integration points:
   - Added to the device init/reset paths
   - Called during queue destroy, suspend, and isolation events
   - Handles both individual queue and full GPU resets

The reset functionality works with both gfx/compute and sdma queues, providing
better resilience against queue failures while minimizing disruption to
unaffected queues.

v2: add detection and reset calls when preemption/unmap fails.
    add a per-device userq counter for each user queue type. (Alex)
v3: make sure we hold the adev->userq_mutex when we call
    amdgpu_userq_detect_and_reset_queues. (Alex)
    Warn if the adev->userq_mutex is not held.
v4: make sure all of the uqm->userq_mutex locks are held.
    Warn if the uqm->userq_mutex is not held.
v5: Use an array for the user queue type counters. (Alex)
    All of the uqm->userq_mutex locks need to be held when calling
    detect and reset. (Alex)
v6: simplify the userq locking. (Alex)

Signed-off-by: Alex Deucher <alexander.deuc...@amd.com>
Signed-off-by: Jesse Zhang <jesse.zh...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |   1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |   8 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h   |   1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c  | 192 +++++++++++++++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h  |   5 +
 5 files changed, 193 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index c56839528843..93c255d1ddfa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1311,6 +1311,7 @@ struct amdgpu_device {
 	struct list_head		userq_mgr_list;
 	struct mutex			userq_mutex;
 	bool				userq_halt_for_enforce_isolation;
+	struct work_struct		userq_reset_work;
 	struct amdgpu_uid *uid_info;
 
 	/* KFD
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index b20383021b50..f1d3c16c67cd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4534,6 +4534,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	}
 
 	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
+	INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);
 
 	adev->gfx.gfx_off_req_count = 1;
 	adev->gfx.gfx_off_residency = 0;
@@ -5951,6 +5952,10 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
 		if (r)
 			goto out;
 
+		r = amdgpu_userq_post_reset(tmp_adev, vram_lost);
+		if (r)
+			goto out;
+
 		drm_client_dev_resume(adev_to_drm(tmp_adev), false);
 
 		/*
@@ -6173,6 +6178,7 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
 	if (!amdgpu_sriov_vf(adev))
 		cancel_work(&adev->reset_work);
 #endif
+	cancel_work(&adev->userq_reset_work);
 
 	if (adev->kfd.dev)
 		cancel_work(&adev->kfd.reset_work);
@@ -6293,6 +6299,8 @@ static void amdgpu_device_halt_activities(struct amdgpu_device *adev,
 		    amdgpu_device_ip_need_full_reset(tmp_adev))
 			amdgpu_ras_suspend(tmp_adev);
 
+		amdgpu_userq_pre_reset(tmp_adev);
+
 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 			struct amdgpu_ring *ring = tmp_adev->rings[i];
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 80b85547c810..cf21f36b5761 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -83,6 +83,7 @@ enum amdgpu_ring_type {
 	AMDGPU_RING_TYPE_MES,
 	AMDGPU_RING_TYPE_UMSCH_MM,
 	AMDGPU_RING_TYPE_CPER,
+	AMDGPU_RING_TYPE_MAX,
 };
 
 enum amdgpu_ib_pool_type {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index af92450ea6eb..adfacfa495c8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -25,8 +25,10 @@
 #include <drm/drm_auth.h>
 #include <drm/drm_exec.h>
 #include <linux/pm_runtime.h>
+#include <drm/drm_drv.h>
 
 #include "amdgpu.h"
+#include "amdgpu_reset.h"
 #include "amdgpu_vm.h"
 #include "amdgpu_userq.h"
 #include "amdgpu_userq_fence.h"
@@ -44,6 +46,67 @@ u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev)
 	return userq_ip_mask;
 }
 
+static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev)
+{
+	if (amdgpu_device_should_recover_gpu(adev)) {
+		amdgpu_reset_domain_schedule(adev->reset_domain,
+					     &adev->userq_reset_work);
+		/* Wait for the reset job to complete */
+		flush_work(&adev->userq_reset_work);
+	}
+}
+
+static int
+amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
+{
+	struct amdgpu_device *adev = uq_mgr->adev;
+	const struct amdgpu_userq_funcs *userq_gfx_funcs =
+		adev->userq_funcs[AMDGPU_RING_TYPE_GFX];
+	const struct amdgpu_userq_funcs *userq_compute_funcs =
+		adev->userq_funcs[AMDGPU_RING_TYPE_COMPUTE];
+	const struct amdgpu_userq_funcs *userq_sdma_funcs =
+		adev->userq_funcs[AMDGPU_RING_TYPE_SDMA];
+	bool gpu_reset = false;
+	int r = 0;
+
+	/* warning if global mutex is not held */
+	WARN_ON(!mutex_is_locked(&adev->userq_mutex));
+
+	if (unlikely(adev->debug_disable_gpu_ring_reset)) {
+		dev_err(adev->dev, "userq reset disabled by debug mask\n");
+	} else if (amdgpu_gpu_recovery) {
+		if ((atomic_read(&uq_mgr->userq_count[AMDGPU_RING_TYPE_COMPUTE]) > 0) && userq_compute_funcs->detect_and_reset) {
+			r = userq_compute_funcs->detect_and_reset(adev, AMDGPU_RING_TYPE_COMPUTE);
+			if (r) {
+				gpu_reset = true;
+				goto gpu_reset;
+			}
+		}
+
+		if ((atomic_read(&uq_mgr->userq_count[AMDGPU_RING_TYPE_GFX]) > 0) && userq_gfx_funcs->detect_and_reset) {
+			r = userq_gfx_funcs->detect_and_reset(adev, AMDGPU_RING_TYPE_GFX);
+			if (r) {
+				gpu_reset = true;
+				goto gpu_reset;
+			}
+		}
+
+		if ((atomic_read(&uq_mgr->userq_count[AMDGPU_RING_TYPE_SDMA]) > 0) && userq_sdma_funcs->detect_and_reset) {
+			r = userq_sdma_funcs->detect_and_reset(adev, AMDGPU_RING_TYPE_SDMA);
+			if (r) {
+				gpu_reset = true;
+				goto gpu_reset;
+			}
+		}
+	}
+
+gpu_reset:
+	if (gpu_reset)
+		amdgpu_userq_gpu_reset(adev);
+
+	return r;
+}
+
 int amdgpu_userq_input_va_validate(struct amdgpu_vm *vm, u64 addr,
 				   u64 expected_size)
 {
@@ -83,17 +146,22 @@ amdgpu_userq_preempt_helper(struct amdgpu_userq_mgr *uq_mgr,
 	struct amdgpu_device *adev = uq_mgr->adev;
 	const struct amdgpu_userq_funcs *userq_funcs =
 		adev->userq_funcs[queue->queue_type];
+	bool found_hung_queue = false;
 	int r = 0;
 
 	if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
 		r = userq_funcs->preempt(uq_mgr, queue);
 		if (r) {
 			queue->state = AMDGPU_USERQ_STATE_HUNG;
+			found_hung_queue = true;
 		} else {
 			queue->state = AMDGPU_USERQ_STATE_PREEMPTED;
 		}
 	}
 
+	if (found_hung_queue)
+		amdgpu_userq_detect_and_reset_queues(uq_mgr);
+
 	return r;
 }
 
@@ -125,16 +193,23 @@ amdgpu_userq_unmap_helper(struct amdgpu_userq_mgr *uq_mgr,
 	struct amdgpu_device *adev = uq_mgr->adev;
 	const struct amdgpu_userq_funcs *userq_funcs =
 		adev->userq_funcs[queue->queue_type];
+	bool found_hung_queue = false;
 	int r = 0;
 
 	if ((queue->state == AMDGPU_USERQ_STATE_MAPPED) ||
 	    (queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) {
 		r = userq_funcs->unmap(uq_mgr, queue);
-		if (r)
+		if (r) {
 			queue->state = AMDGPU_USERQ_STATE_HUNG;
-		else
+			found_hung_queue = true;
+		} else {
 			queue->state = AMDGPU_USERQ_STATE_UNMAPPED;
+		}
 	}
+
+	if (found_hung_queue)
+		amdgpu_userq_detect_and_reset_queues(uq_mgr);
+
 	return r;
 }
 
@@ -145,16 +220,22 @@ amdgpu_userq_map_helper(struct amdgpu_userq_mgr *uq_mgr,
 	struct amdgpu_device *adev = uq_mgr->adev;
 	const struct amdgpu_userq_funcs *userq_funcs =
 		adev->userq_funcs[queue->queue_type];
+	bool gpu_reset = false;
 	int r = 0;
 
 	if (queue->state == AMDGPU_USERQ_STATE_UNMAPPED) {
 		r = userq_funcs->map(uq_mgr, queue);
 		if (r) {
 			queue->state = AMDGPU_USERQ_STATE_HUNG;
+			gpu_reset = true;
 		} else {
 			queue->state = AMDGPU_USERQ_STATE_MAPPED;
 		}
 	}
+
+	if (gpu_reset)
+		amdgpu_userq_gpu_reset(adev);
+
 	return r;
 }
 
@@ -378,10 +459,11 @@ amdgpu_userq_destroy(struct drm_file *filp, int queue_id)
 		amdgpu_bo_unreserve(queue->db_obj.obj);
 	}
 	amdgpu_bo_unref(&queue->db_obj.obj);
-
+	atomic_dec(&uq_mgr->userq_count[queue->queue_type]);
 #if defined(CONFIG_DEBUG_FS)
 	debugfs_remove_recursive(queue->debugfs_queue);
 #endif
+	amdgpu_userq_detect_and_reset_queues(uq_mgr);
 	r = amdgpu_userq_unmap_helper(uq_mgr, queue);
 	/*TODO: It requires a reset for userq hw unmap error*/
 	if (unlikely(r != AMDGPU_USERQ_STATE_UNMAPPED)) {
@@ -587,6 +669,7 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
 	kfree(queue_name);
 
 	args->out.queue_id = qid;
+	atomic_inc(&uq_mgr->userq_count[queue->queue_type]);
 
 unlock:
 	mutex_unlock(&adev->userq_mutex);
@@ -847,6 +930,7 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
 	int queue_id;
 	int ret = 0, r;
 
+	amdgpu_userq_detect_and_reset_queues(uq_mgr);
 	/* Try to unmap all the queues in this process ctx */
 	idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
 		r = amdgpu_userq_preempt_helper(uq_mgr, queue);
@@ -859,6 +943,23 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
 	return ret;
 }
 
+void amdgpu_userq_reset_work(struct work_struct *work)
+{
+	struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
+						  userq_reset_work);
+	struct amdgpu_reset_context reset_context;
+
+	memset(&reset_context, 0, sizeof(reset_context));
+
+	reset_context.method = AMD_RESET_METHOD_NONE;
+	reset_context.reset_req_dev = adev;
+	reset_context.src = AMDGPU_RESET_SRC_USERQ;
+	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+	/*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
+
+	amdgpu_device_gpu_recover(adev, NULL, &reset_context);
+}
+
 static int
 amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
 {
@@ -885,22 +986,19 @@ void
 amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr,
 		   struct amdgpu_eviction_fence *ev_fence)
 {
-	int ret;
 	struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
 	struct amdgpu_eviction_fence_mgr *evf_mgr = &fpriv->evf_mgr;
+	struct amdgpu_device *adev = uq_mgr->adev;
+	int ret;
 
 	/* Wait for any pending userqueue fence work to finish */
 	ret = amdgpu_userq_wait_for_signal(uq_mgr);
-	if (ret) {
-		drm_file_err(uq_mgr->file, "Not evicting userqueue, timeout waiting for work\n");
-		return;
-	}
+	if (ret)
+		dev_err(adev->dev, "Not evicting userqueue, timeout waiting for work\n");
 
 	ret = amdgpu_userq_evict_all(uq_mgr);
-	if (ret) {
-		drm_file_err(uq_mgr->file, "Failed to evict userqueue\n");
-		return;
-	}
+	if (ret)
+		dev_err(adev->dev, "Failed to evict userqueue\n");
 
 	/* Signal current eviction fence */
 	amdgpu_eviction_fence_signal(evf_mgr, ev_fence);
@@ -909,7 +1007,6 @@ amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr,
 		cancel_delayed_work_sync(&uq_mgr->resume_work);
 		return;
 	}
-
 	/* Schedule a resume work */
 	schedule_delayed_work(&uq_mgr->resume_work, 0);
 }
@@ -917,12 +1014,18 @@ amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr,
 int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct drm_file *file_priv,
 			  struct amdgpu_device *adev)
 {
+	int i;
+
 	idr_init_base(&userq_mgr->userq_idr, 1);
 	userq_mgr->adev = adev;
 	userq_mgr->file = file_priv;
 
 	mutex_lock(&adev->userq_mutex);
 	list_add(&userq_mgr->list, &adev->userq_mgr_list);
+	/* Initialize all queue type counters to zero */
+	for (i = 0; i < AMDGPU_RING_TYPE_MAX; i++) {
+		atomic_set(&userq_mgr->userq_count[i], 0);
+	}
 	mutex_unlock(&adev->userq_mutex);
 
 	INIT_DELAYED_WORK(&userq_mgr->resume_work, amdgpu_userq_restore_worker);
@@ -939,6 +1042,7 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)
 	cancel_delayed_work_sync(&userq_mgr->resume_work);
 
 	mutex_lock(&adev->userq_mutex);
+	amdgpu_userq_detect_and_reset_queues(userq_mgr);
 	idr_for_each_entry(&userq_mgr->userq_idr, queue, queue_id) {
 		amdgpu_userq_wait_for_last_fence(userq_mgr, queue);
 		amdgpu_userq_unmap_helper(userq_mgr, queue);
@@ -967,6 +1071,7 @@ int amdgpu_userq_suspend(struct amdgpu_device *adev)
 		return 0;
 
 	mutex_lock(&adev->userq_mutex);
+	amdgpu_userq_detect_and_reset_queues(uqm);
 	list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
 		cancel_delayed_work_sync(&uqm->resume_work);
 		idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
@@ -1021,13 +1126,15 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
 	adev->userq_halt_for_enforce_isolation = true;
 	list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
 		cancel_delayed_work_sync(&uqm->resume_work);
+		amdgpu_userq_detect_and_reset_queues(uqm);
 		idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
 			if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
 			     (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) &&
 			    (queue->xcp_id == idx)) {
 				r = amdgpu_userq_preempt_helper(uqm, queue);
-				if (r)
+				if (r) {
 					ret = r;
+				}
 			}
 		}
 	}
@@ -1066,3 +1173,60 @@ int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
 	mutex_unlock(&adev->userq_mutex);
 	return ret;
 }
+
+void amdgpu_userq_pre_reset(struct amdgpu_device *adev)
+{
+	const struct amdgpu_userq_funcs *userq_funcs;
+	struct amdgpu_usermode_queue *queue;
+	struct amdgpu_userq_mgr *uqm, *tmp;
+	int queue_id;
+
+	list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
+		cancel_delayed_work_sync(&uqm->resume_work);
+		idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
+			if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
+				amdgpu_userq_wait_for_last_fence(uqm, queue);
+				userq_funcs = adev->userq_funcs[queue->queue_type];
+				userq_funcs->unmap(uqm, queue);
+				/* just mark all queues as hung at this point.
+				 * if unmap succeeds, we could map again
+				 * in amdgpu_userq_post_reset() if vram is not lost
+				 */
+				queue->state = AMDGPU_USERQ_STATE_HUNG;
+				amdgpu_userq_fence_driver_force_completion(queue);
+			}
+		}
+	}
+}
+
+int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost)
+{
+	/* if any queue state is AMDGPU_USERQ_STATE_UNMAPPED
+	 * at this point, we should be able to map it again
+	 * and continue if vram is not lost.
+	 */
+	struct amdgpu_userq_mgr *uqm;
+	struct amdgpu_usermode_queue *queue;
+	const struct amdgpu_userq_funcs *userq_funcs;
+	int queue_id, r = 0;
+
+	list_for_each_entry(uqm, &adev->userq_mgr_list, list) {
+		idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
+			if (queue->state == AMDGPU_USERQ_STATE_HUNG && !vram_lost) {
+				userq_funcs = adev->userq_funcs[queue->queue_type];
+
+				r = userq_funcs->map(uqm, queue); // Re-map queue
+				if (r) {
+					dev_err(adev->dev, "Failed to remap queue %d\n", queue_id);
+					continue;
+				}
+				queue->state = AMDGPU_USERQ_STATE_MAPPED;
+			}
+		}
+
+		/* Restart resume work after reset */
+		//queue_delayed_work(system_wq, &uqm->resume_work, msecs_to_jiffies(100));
+	}
+
+	return r;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
index 2d63308d55c3..7cd5344c0344 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
@@ -93,6 +93,7 @@ struct amdgpu_userq_mgr {
 	struct delayed_work		resume_work;
 	struct list_head		list;
 	struct drm_file			*file;
+	atomic_t			userq_count[AMDGPU_RING_TYPE_MAX];
 };
 
 struct amdgpu_db_info {
@@ -136,6 +137,10 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
 int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
 						   u32 idx);
 
+void amdgpu_userq_reset_work(struct work_struct *work);
+void amdgpu_userq_pre_reset(struct amdgpu_device *adev);
+int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost);
+
 int amdgpu_userq_input_va_validate(struct amdgpu_vm *vm, u64 addr,
 				   u64 expected_size);
 #endif
-- 
2.49.0