This patch introduces GPU reset support for user queues (userq). It adds detection and recovery logic for GFX, compute, and SDMA user queues when errors occur during queue operations (e.g., map/unmap failures).
Key changes include:

- New function `amdgpu_userq_detect_and_reset_queues()` to scan active
  queues and attempt recovery via queue-specific handlers.
- Integration of userq reset logic into the suspend/resume, eviction,
  and reset paths.
- New pre-reset and post-reset hooks (`amdgpu_userq_pre_reset()` and
  `amdgpu_userq_post_reset()`) for cleaning up and reinitializing user
  queues during a full GPU reset.
- A new work_struct (`userq_reset_work`), used in
  `amdgpu_userq_gpu_reset()` to trigger GPU recovery through the
  standard GPU reset framework.
- Improved robustness and logging during user queue unmap/map
  operations.

This enhances user queue resiliency and ensures graceful recovery from
individual queue failures without immediately requiring a full GPU
reset.

Signed-off-by: Alex Deucher <alexander.deuc...@amd.com>
Signed-off-by: Jesse Zhang <jesse.zh...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h        |   1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |   8 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c  | 200 +++++++++++++++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h  |   5 +
 4 files changed, 203 insertions(+), 11 deletions(-)
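Note (illustrative, not part of this patch): the detection path relies
on per-IP detect_and_reset callbacks in struct amdgpu_userq_funcs. A
minimal sketch of the shape such a backend hook takes, inferred from
how amdgpu_userq_detect_and_reset_queues() invokes it below; the xxx_*
helpers here are hypothetical placeholders, not functions from this
series:

static int xxx_userq_detect_and_reset(struct amdgpu_device *adev,
                                      int queue_type)
{
        int r = 0;

        /* scan HW/FW state for hung user queues of the given type ... */
        if (xxx_userq_queue_is_hung(adev, queue_type))
                /* ... and attempt a per-queue reset first */
                r = xxx_userq_reset_queue(adev, queue_type);

        /* a nonzero return makes the caller escalate to a full
         * GPU reset via amdgpu_userq_gpu_reset()
         */
        return r;
}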
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index ef3af170dda4..9db05cdc7304 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1302,6 +1302,7 @@ struct amdgpu_device {
 	struct list_head		userq_mgr_list;
 	struct mutex			userq_mutex;
 	bool				userq_halt_for_enforce_isolation;
+	struct work_struct		userq_reset_work;
 };
 
 static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 3757634613c3..1dc88b0055dd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4475,6 +4475,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	}
 
 	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
+	INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);
 
 	adev->gfx.gfx_off_req_count = 1;
 	adev->gfx.gfx_off_residency = 0;
@@ -5880,6 +5881,10 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
 			if (r)
 				goto out;
 
+			r = amdgpu_userq_post_reset(tmp_adev, vram_lost);
+			if (r)
+				goto out;
+
 			drm_client_dev_resume(adev_to_drm(tmp_adev), false);
 
 			/*
@@ -6102,6 +6107,7 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
 	if (!amdgpu_sriov_vf(adev))
 		cancel_work(&adev->reset_work);
 #endif
+	cancel_work(&adev->userq_reset_work);
 
 	if (adev->kfd.dev)
 		cancel_work(&adev->kfd.reset_work);
@@ -6232,6 +6238,8 @@ static void amdgpu_device_halt_activities(struct amdgpu_device *adev,
 		    amdgpu_device_ip_need_full_reset(tmp_adev))
 			amdgpu_ras_suspend(tmp_adev);
 
+		amdgpu_userq_pre_reset(tmp_adev);
+
 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 			struct amdgpu_ring *ring = tmp_adev->rings[i];
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index aac0de86f3e8..0c91302162fa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -26,7 +26,10 @@
 #include <drm/drm_exec.h>
 #include <linux/pm_runtime.h>
 
+#include <drm/drm_drv.h>
+
 #include "amdgpu.h"
+#include "amdgpu_reset.h"
 #include "amdgpu_vm.h"
 #include "amdgpu_userq.h"
 #include "amdgpu_userq_fence.h"
@@ -44,6 +47,92 @@ u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev)
 	return userq_ip_mask;
 }
 
+static void
+amdgpu_userq_gpu_reset(struct amdgpu_device *adev)
+{
+
+	if (amdgpu_device_should_recover_gpu(adev)) {
+		amdgpu_reset_domain_schedule(adev->reset_domain,
+					     &adev->userq_reset_work);
+		/* Wait for the reset job to complete */
+		flush_work(&adev->userq_reset_work);
+	}
+}
+
+static int
+amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
+{
+	struct amdgpu_device *adev = uq_mgr->adev;
+	const struct amdgpu_userq_funcs *userq_gfx_funcs =
+		adev->userq_funcs[AMDGPU_RING_TYPE_GFX];
+	const struct amdgpu_userq_funcs *userq_compute_funcs =
+		adev->userq_funcs[AMDGPU_RING_TYPE_COMPUTE];
+	const struct amdgpu_userq_funcs *userq_sdma_funcs =
+		adev->userq_funcs[AMDGPU_RING_TYPE_SDMA];
+
+	bool has_gfx = false, has_compute = false, has_sdma = false;
+	struct amdgpu_usermode_queue *userq;
+	bool gpu_reset = false;
+	int gpu_suspend = -1, id, r = 0; /* -1: MES not suspended */
+
+	if (idr_is_empty(&uq_mgr->userq_idr))
+		return 0;
+
+	/* Detect which types of queues are present */
+	idr_for_each_entry(&uq_mgr->userq_idr, userq, id) {
+		switch (userq->queue_type) {
+		case AMDGPU_RING_TYPE_GFX:
+			has_gfx = true;
+			break;
+		case AMDGPU_RING_TYPE_COMPUTE:
+			has_compute = true;
+			break;
+		case AMDGPU_RING_TYPE_SDMA:
+			has_sdma = true;
+			break;
+		default:
+			break;
+		}
+	}
+
+	if (unlikely(adev->debug_disable_gpu_ring_reset)) {
+		dev_err(adev->dev, "userq reset disabled by debug mask\n");
+	} else if (amdgpu_gpu_recovery) {
+		if (has_compute && userq_compute_funcs->detect_and_reset) {
+			gpu_suspend = amdgpu_mes_suspend(adev);
+			r = userq_compute_funcs->detect_and_reset(adev, AMDGPU_RING_TYPE_COMPUTE);
+			if (r) {
+				gpu_reset = true;
+				goto gpu_reset;
+			}
+		}
+
+		if (has_gfx && userq_gfx_funcs->detect_and_reset) {
+			r = userq_gfx_funcs->detect_and_reset(adev, AMDGPU_RING_TYPE_GFX);
+			if (r) {
+				gpu_reset = true;
+				goto gpu_reset;
+			}
+		}
+
+		if (has_sdma && userq_sdma_funcs && userq_sdma_funcs->detect_and_reset) {
+			r = userq_sdma_funcs->detect_and_reset(adev, AMDGPU_RING_TYPE_SDMA);
+			if (r) {
+				gpu_reset = true;
+				goto gpu_reset;
+			}
+		}
+	}
+
+gpu_reset:
+	if (gpu_reset)
+		amdgpu_userq_gpu_reset(adev);
+
+	if ((!gpu_suspend) && has_compute)
+		amdgpu_mes_resume(adev);
+
+	return r;
+}
+
 static int
 amdgpu_userq_unmap_helper(struct amdgpu_userq_mgr *uq_mgr,
 			  struct amdgpu_usermode_queue *queue)
@@ -51,15 +140,22 @@ amdgpu_userq_unmap_helper(struct amdgpu_userq_mgr *uq_mgr,
 	struct amdgpu_device *adev = uq_mgr->adev;
 	const struct amdgpu_userq_funcs *userq_funcs =
 		adev->userq_funcs[queue->queue_type];
+	bool gpu_reset = false;
 	int r = 0;
 
 	if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
 		r = userq_funcs->unmap(uq_mgr, queue);
-		if (r)
+		if (r) {
 			queue->state = AMDGPU_USERQ_STATE_HUNG;
-		else
+			gpu_reset = true;
+		} else {
 			queue->state = AMDGPU_USERQ_STATE_UNMAPPED;
+		}
 	}
+
+	if (gpu_reset)
+		amdgpu_userq_gpu_reset(adev);
+
 	return r;
 }
 
@@ -70,16 +166,22 @@ amdgpu_userq_map_helper(struct amdgpu_userq_mgr *uq_mgr,
 	struct amdgpu_device *adev = uq_mgr->adev;
 	const struct amdgpu_userq_funcs *userq_funcs =
 		adev->userq_funcs[queue->queue_type];
+	bool gpu_reset = false;
 	int r = 0;
 
 	if (queue->state == AMDGPU_USERQ_STATE_UNMAPPED) {
 		r = userq_funcs->map(uq_mgr, queue);
 		if (r) {
 			queue->state = AMDGPU_USERQ_STATE_HUNG;
+			gpu_reset = true;
 		} else {
 			queue->state = AMDGPU_USERQ_STATE_MAPPED;
 		}
 	}
+
+	if (gpu_reset)
+		amdgpu_userq_gpu_reset(adev);
+
 	return r;
 }
@@ -318,6 +420,7 @@ amdgpu_userq_destroy(struct drm_file *filp, int queue_id)
 		amdgpu_bo_unreserve(queue->db_obj.obj);
 	}
 	amdgpu_bo_unref(&queue->db_obj.obj);
+	amdgpu_userq_detect_and_reset_queues(uq_mgr);
 	r = amdgpu_userq_unmap_helper(uq_mgr, queue);
 	amdgpu_userq_cleanup(uq_mgr, queue, queue_id);
 	mutex_unlock(&uq_mgr->userq_mutex);
@@ -691,6 +794,7 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
 	int queue_id;
 	int ret = 0, r;
 
+	amdgpu_userq_detect_and_reset_queues(uq_mgr);
 	/* Try to unmap all the queues in this process ctx */
 	idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
 		r = amdgpu_userq_unmap_helper(uq_mgr, queue);
@@ -703,6 +807,23 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
 	return ret;
 }
 
+void amdgpu_userq_reset_work(struct work_struct *work)
+{
+	struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
+						  userq_reset_work);
+	struct amdgpu_reset_context reset_context;
+
+	memset(&reset_context, 0, sizeof(reset_context));
+
+	reset_context.method = AMD_RESET_METHOD_NONE;
+	reset_context.reset_req_dev = adev;
+	reset_context.src = AMDGPU_RESET_SRC_USERQ;
+	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+	/*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
+
+	amdgpu_device_gpu_recover(adev, NULL, &reset_context);
+}
+
 static int
 amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
 {
@@ -729,22 +850,19 @@ void amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr,
 		       struct amdgpu_eviction_fence *ev_fence)
 {
-	int ret;
 	struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
 	struct amdgpu_eviction_fence_mgr *evf_mgr = &fpriv->evf_mgr;
+	struct amdgpu_device *adev = uq_mgr->adev;
+	int ret;
 
 	/* Wait for any pending userqueue fence work to finish */
 	ret = amdgpu_userq_wait_for_signal(uq_mgr);
-	if (ret) {
-		drm_file_err(uq_mgr->file, "Not evicting userqueue, timeout waiting for work\n");
-		return;
-	}
+	if (ret)
+		dev_err(adev->dev, "Not evicting userqueue, timeout waiting for work\n");
 
 	ret = amdgpu_userq_evict_all(uq_mgr);
-	if (ret) {
-		drm_file_err(uq_mgr->file, "Failed to evict userqueue\n");
-		return;
-	}
+	if (ret)
+		dev_err(adev->dev, "Failed to evict userqueue\n");
 
 	/* Signal current eviction fence */
 	amdgpu_eviction_fence_signal(evf_mgr, ev_fence);
@@ -785,6 +903,7 @@ void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr)
 
 	mutex_lock(&adev->userq_mutex);
 	mutex_lock(&userq_mgr->userq_mutex);
+	amdgpu_userq_detect_and_reset_queues(userq_mgr);
 	idr_for_each_entry(&userq_mgr->userq_idr, queue, queue_id) {
 		amdgpu_userq_wait_for_last_fence(userq_mgr, queue);
 		amdgpu_userq_unmap_helper(userq_mgr, queue);
@@ -818,6 +937,7 @@ int amdgpu_userq_suspend(struct amdgpu_device *adev)
 	list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
 		cancel_delayed_work_sync(&uqm->resume_work);
 		mutex_lock(&uqm->userq_mutex);
+		amdgpu_userq_detect_and_reset_queues(uqm);
 		idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
 			r = amdgpu_userq_unmap_helper(uqm, queue);
 			if (r)
@@ -874,6 +994,7 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
 	list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
 		cancel_delayed_work_sync(&uqm->resume_work);
 		mutex_lock(&uqm->userq_mutex);
+		amdgpu_userq_detect_and_reset_queues(uqm);
 		idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
 			if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
 			     (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) &&
@@ -922,3 +1043,60 @@ int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
 	mutex_unlock(&adev->userq_mutex);
 	return ret;
 }
+
+void amdgpu_userq_pre_reset(struct amdgpu_device *adev)
+{
+	const struct amdgpu_userq_funcs *userq_funcs;
+	struct amdgpu_usermode_queue *queue;
+	struct amdgpu_userq_mgr *uqm, *tmp;
+	int queue_id;
+
+	list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
+		cancel_delayed_work_sync(&uqm->resume_work);
+		idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
+			if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
+				amdgpu_userq_wait_for_last_fence(uqm, queue);
+				userq_funcs = adev->userq_funcs[queue->queue_type];
+				userq_funcs->unmap(uqm, queue);
+				/* just mark all queues as hung at this point.
+				 * if unmap succeeds, we could map again
+				 * in amdgpu_userq_post_reset() if vram is not lost
+				 */
+				queue->state = AMDGPU_USERQ_STATE_HUNG;
+				amdgpu_userq_fence_driver_force_completion(queue);
+			}
+		}
+	}
+}
+
+int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost)
+{
+	/* Queues were marked AMDGPU_USERQ_STATE_HUNG in
+	 * amdgpu_userq_pre_reset(); if vram is not lost they
+	 * can be mapped again here and continue.
+	 */
+	struct amdgpu_userq_mgr *uqm;
+	struct amdgpu_usermode_queue *queue;
+	const struct amdgpu_userq_funcs *userq_funcs;
+	int queue_id, r = 0;
+
+	list_for_each_entry(uqm, &adev->userq_mgr_list, list) {
+		idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
+			if (queue->state == AMDGPU_USERQ_STATE_HUNG && !vram_lost) {
+				userq_funcs = adev->userq_funcs[queue->queue_type];
+
+				r = userq_funcs->map(uqm, queue); /* re-map the queue */
+				if (r) {
+					dev_err(adev->dev, "Failed to remap queue %d\n", queue_id);
+					continue;
+				}
+				queue->state = AMDGPU_USERQ_STATE_MAPPED;
+			}
+		}
+
+		/* Restart resume work after reset */
+		/* queue_delayed_work(system_wq, &uqm->resume_work, msecs_to_jiffies(100)); */
+	}
+
+	return r;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
index 68e46d01bed2..c136d7e7a763 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
@@ -139,4 +139,9 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
 int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
 						   u32 idx);
 
+void amdgpu_userq_reset_work(struct work_struct *work);
+
+void amdgpu_userq_pre_reset(struct amdgpu_device *adev);
+int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost);
+
 #endif
-- 
2.49.0
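Note (illustrative, not part of the patch): the map/unmap helpers and
the pre/post reset hooks above implement a small state machine on
queue->state. A condensed restatement of those transitions, written as
a sketch for review purposes rather than driver code:

static void userq_state_transitions(struct amdgpu_usermode_queue *q,
                                    bool op_failed, bool vram_lost)
{
        switch (q->state) {
        case AMDGPU_USERQ_STATE_UNMAPPED:
                /* map helper: success -> MAPPED; failure -> HUNG + GPU reset */
                q->state = op_failed ? AMDGPU_USERQ_STATE_HUNG :
                                       AMDGPU_USERQ_STATE_MAPPED;
                break;
        case AMDGPU_USERQ_STATE_MAPPED:
                /* unmap helper: success -> UNMAPPED; failure -> HUNG + GPU
                 * reset; pre-reset also forces MAPPED -> HUNG before a
                 * full reset and force-completes the queue's fences
                 */
                q->state = op_failed ? AMDGPU_USERQ_STATE_HUNG :
                                       AMDGPU_USERQ_STATE_UNMAPPED;
                break;
        case AMDGPU_USERQ_STATE_HUNG:
                /* post-reset: re-mapped only if VRAM contents survived */
                if (!vram_lost)
                        q->state = AMDGPU_USERQ_STATE_MAPPED;
                break;
        default:
                break;
        }
}

The HUNG state is the pivot: per-queue failures park the queue there
and escalate through userq_reset_work, and amdgpu_userq_post_reset()
only re-maps HUNG queues when VRAM survived the reset.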