On 5/21/26 20:20, Alex Deucher wrote:
Call MES with detect only to get the list of hung queues rather
than detecting and resetting. Then loop over the bad queues
and reset them individually and finally remove them. Skip
queues not owned by KFD.
v2: always call resume_all after queue reset
Signed-off-by: Alex Deucher <[email protected]>
---
.../drm/amd/amdkfd/kfd_device_queue_manager.c | 21 ++++++++++++-------
1 file changed, 14 insertions(+), 7 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 08006ce8126c9..e323658f972cd 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -445,7 +445,7 @@ static int reset_queues_mes(struct device_queue_manager
*dqm)
* Passed parameter is for targeting queues not scheduled by MES
add_queue.
*/
r = amdgpu_mes_detect_and_reset_hung_queues(adev,
AMDGPU_RING_TYPE_COMPUTE,
- false, &num_hung, hung_array, ffs(dqm->dev->xcc_mask) - 1);
+ true, &num_hung, hung_array, ffs(dqm->dev->xcc_mask) - 1);
if (!num_hung || r) {
r = -ENOTRECOVERABLE;
@@ -467,10 +467,9 @@ static int reset_queues_mes(struct device_queue_manager
*dqm)
}
q = find_queue_by_doorbell_offset(dqm, hung_array[i]);
- if (!q) {
- r = -ENOTRECOVERABLE;
- goto fail;
- }
+ /* skip queues not owned by KFD */
+ if (!q)
+ continue;
pdd = kfd_get_process_device_data(q->device, q->process);
if (!pdd) {
@@ -480,6 +479,10 @@ static int reset_queues_mes(struct device_queue_manager
*dqm)
pr_warn("Hang detected doorbell %x pipe %d queue %d type %d\n",
hung_array[i], pipe, queue, queue_type);
+ r = amdgpu_mes_reset_user_queue(adev, queue_type, hung_array[i],
+ ffs(dqm->dev->xcc_mask) - 1);
+ if (r)
+ goto fail;
/* Proceed remove_queue with reset=true */
remove_queue_mes_on_reset_option(dqm, q, &pdd->qpd, true,
false);
set_queue_as_reset(dqm, q, &pdd->qpd);
The following change in suspend_all_queues_mes should not be needed, but
since it is removed in follow-up patches in this series, this patch is
Reviewed-by: Amber Lin <[email protected]>
@@ -505,13 +508,17 @@ static int suspend_all_queues_mes(struct
device_queue_manager *dqm)
up_read(&adev->reset_domain->sem);
if (r) {
- if (!reset_queues_mes(dqm))
- return 0;
+ if (!reset_queues_mes(dqm)) {
+ r = 0;
+ goto out;
+ }
dev_err(adev->dev, "failed to suspend gangs from MES\n");
dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU
reset\n");
kfd_hws_hang(dqm);
}
+out:
+ resume_all_queues_mes(dqm);
return r;
}
--
Regards,
Amber