From: Alex Deucher <[email protected]> Call MES with detect only to get the list of hung queues rather than detecting and resetting. Then loop over the bad queues and reset them individually and finally remove them. Skip queues not owned by KFD.
v2: always call resume_all after queue reset Signed-off-by: Alex Deucher <[email protected]> Reviewed-by: Amber Lin <[email protected]> --- .../drm/amd/amdkfd/kfd_device_queue_manager.c | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 08006ce8126c..e323658f972c 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -445,7 +445,7 @@ static int reset_queues_mes(struct device_queue_manager *dqm) * Passed parameter is for targeting queues not scheduled by MES add_queue. */ r = amdgpu_mes_detect_and_reset_hung_queues(adev, AMDGPU_RING_TYPE_COMPUTE, - false, &num_hung, hung_array, ffs(dqm->dev->xcc_mask) - 1); + true, &num_hung, hung_array, ffs(dqm->dev->xcc_mask) - 1); if (!num_hung || r) { r = -ENOTRECOVERABLE; @@ -467,10 +467,9 @@ static int reset_queues_mes(struct device_queue_manager *dqm) } q = find_queue_by_doorbell_offset(dqm, hung_array[i]); - if (!q) { - r = -ENOTRECOVERABLE; - goto fail; - } + /* skip queues not owned by KFD */ + if (!q) + continue; pdd = kfd_get_process_device_data(q->device, q->process); if (!pdd) { @@ -480,6 +479,10 @@ static int reset_queues_mes(struct device_queue_manager *dqm) pr_warn("Hang detected doorbell %x pipe %d queue %d type %d\n", hung_array[i], pipe, queue, queue_type); + r = amdgpu_mes_reset_user_queue(adev, queue_type, hung_array[i], + ffs(dqm->dev->xcc_mask) - 1); + if (r) + goto fail; /* Proceed remove_queue with reset=true */ remove_queue_mes_on_reset_option(dqm, q, &pdd->qpd, true, false); set_queue_as_reset(dqm, q, &pdd->qpd); @@ -505,13 +508,17 @@ static int suspend_all_queues_mes(struct device_queue_manager *dqm) up_read(&adev->reset_domain->sem); if (r) { - if (!reset_queues_mes(dqm)) - return 0; + if (!reset_queues_mes(dqm)) { + r = 0; + goto out; + } dev_err(adev->dev, "failed to suspend 
gangs from MES\n"); dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n"); kfd_hws_hang(dqm); } +out: + resume_all_queues_mes(dqm); return r; } -- 2.49.0
