Identify hung queues by comparing doorbells shown in hqd_info from MES
with doorbells stored in the driver to find matching queues.

Suggested-by: Jonathan Kim <[email protected]>
Signed-off-by: Amber Lin <[email protected]>
Reviewed-by: Alex Deucher <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 38 ++++++++++++++++---------
 1 file changed, 25 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index e639d6c329e9..f1f8bbfc31e0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -465,23 +465,35 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
 
        r = adev->mes.funcs->detect_and_reset_hung_queues(&adev->mes,
                                                          &input);
-       if (r) {
-               dev_err(adev->dev, "failed to detect and reset\n");
-       } else {
-               *hung_db_num = 0;
-               for (i = 0; i < adev->mes.hung_queue_hqd_info_offset; i++) {
-                       if (db_array[i] != AMDGPU_MES_INVALID_DB_OFFSET) {
-                               hung_db_array[i] = db_array[i];
-                               *hung_db_num += 1;
-                       }
+
+       if (r && detect_only) {
+               dev_err(adev->dev, "Failed to detect hung queues\n");
+               return r;
+       }
+
+       *hung_db_num = 0;
+       /* MES passes hung queues' doorbell to driver */
+       for (i = 0; i < adev->mes.hung_queue_hqd_info_offset; i++) {
+               /* Finding hung queues where db_array[i] is a valid doorbell */
+               if (db_array[i] != AMDGPU_MES_INVALID_DB_OFFSET) {
+                       hung_db_array[i] = db_array[i];
+                       *hung_db_num += 1;
                }
+       }
 
-               /*
-                * TODO: return HQD info for MES scheduled user compute queue reset cases
-                * stored in hung_db_array hqd info offset to full array size
-                */
+       if (r && !hung_db_num) {
+               dev_err(adev->dev, "Failed to detect and reset hung queues\n");
+               return r;
        }
 
+       /*
+        * TODO: return HQD info for MES scheduled user compute queue reset cases
+        * stored in hung_db_array hqd info offset to full array size
+        */
+
+       if (r)
+               dev_err(adev->dev, "failed to reset\n");
+
        return r;
 }
 
-- 
2.43.0

Reply via email to