From: Alex Deucher <alexander.deuc...@amd.com>

Add a detect_and_reset callback to amdgpu_userq_funcs and add the
implementation for MES.  The callback detects all hung queues of a
particular IP type (e.g., GFX, compute, or SDMA) and resets them.

v2: increment the reset counter and force completion of the queue fences
v3: remove the userq_mutex locking from mes_userq_detect_and_reset since
    the caller already holds it

Signed-off-by: Alex Deucher <alexander.deuc...@amd.com>
Signed-off-by: Jesse Zhang <jesse.zh...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h  |  2 +
 drivers/gpu/drm/amd/amdgpu/mes_userqueue.c | 49 ++++++++++++++++++++++
 2 files changed, 51 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
index 5111d7dce86f..9fa0d1a88d71 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
@@ -81,6 +81,8 @@ struct amdgpu_userq_funcs {
                   struct amdgpu_usermode_queue *queue);
        int (*restore)(struct amdgpu_userq_mgr *uq_mgr,
                   struct amdgpu_usermode_queue *queue);
+       int (*detect_and_reset)(struct amdgpu_device *adev,
+                 int queue_type);
 };
 
 /* Usermode queues for gfx */
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
index 502fa0a40107..5fac18dfeca3 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
@@ -21,6 +21,7 @@
  * OTHER DEALINGS IN THE SOFTWARE.
  *
  */
+#include <drm/drm_drv.h>
 #include "amdgpu.h"
 #include "amdgpu_gfx.h"
 #include "mes_userqueue.h"
@@ -198,6 +199,53 @@ static int mes_userq_create_ctx_space(struct amdgpu_userq_mgr *uq_mgr,
        return 0;
 }
 
+/*
+ * Have MES detect and reset hung user queues of @queue_type, then mark each
+ * queue whose doorbell was reported as hung and force-complete its fences.
+ * Returns 0, -EINVAL if db_array is too small, or the MES error code.
+ */
+static int mes_userq_detect_and_reset(struct amdgpu_device *adev,
+                                     int queue_type)
+{
+       int db_array_size = amdgpu_mes_get_hung_queue_db_array_size(adev);
+       struct amdgpu_usermode_queue *queue;
+       struct amdgpu_userq_mgr *uqm, *tmp;
+       unsigned int hung_db_num = 0, i;
+       int queue_id, r;
+       u32 db_array[4];
+
+       if (db_array_size > 4) {
+               dev_err(adev->dev, "DB array size (%d vs 4) too small\n",
+                       db_array_size);
+               return -EINVAL;
+       }
+
+       amdgpu_mes_lock(&adev->mes);
+       r = amdgpu_mes_detect_and_reset_hung_queues(adev, queue_type, false,
+                                                   &hung_db_num, db_array);
+       amdgpu_mes_unlock(&adev->mes);
+       if (r) {
+               dev_err(adev->dev, "Failed to detect and reset queues, err (%d)\n", r);
+       } else if (hung_db_num) {
+               list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
+                       idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
+                               if (queue->queue_type == queue_type) {
+                                       for (i = 0; i < hung_db_num; i++) {
+                                               if (queue->doorbell_index == db_array[i]) {
+                                                       queue->state = AMDGPU_USERQ_STATE_HUNG;
+                                                       atomic_inc(&adev->gpu_reset_counter);
+                                                       amdgpu_userq_fence_driver_force_completion(queue);
+                                                       drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE);
+                                               }
+                                       }
+                               }
+                       }
+               }
+       }
+
+       return r;
+}
+
 static int mes_userq_mqd_create(struct amdgpu_userq_mgr *uq_mgr,
                                struct drm_amdgpu_userq_in *args_in,
                                struct amdgpu_usermode_queue *queue)
@@ -423,4 +471,5 @@ const struct amdgpu_userq_funcs userq_mes_funcs = {
        .map = mes_userq_map,
        .preempt = mes_userq_preempt,
        .restore = mes_userq_restore,
+       .detect_and_reset = mes_userq_detect_and_reset,
 };
-- 
2.49.0

Reply via email to