From: Alex Deucher <alexander.deuc...@amd.com>

Add a helper function to detect and reset hung queues.  MES returns
an array of doorbell indices identifying which queues are hung and,
optionally, were reset.

v2:  Clear the doorbell array before detection

Signed-off-by: Alex Deucher <alexander.deuc...@amd.com>
Signed-off-by: Jesse Zhang <jesse.zh...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 65 +++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 19 ++++++++
 2 files changed, 84 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index 135598502c8d..5bf9be073cdd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -191,6 +191,20 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
        if (r)
                goto error_doorbell;
 
+       if (adev->mes.hung_queue_db_array_size) {
+               r = amdgpu_bo_create_kernel(adev,
+                                           adev->mes.hung_queue_db_array_size * sizeof(u32),
+                                           PAGE_SIZE,
+                                           AMDGPU_GEM_DOMAIN_GTT,
+                                           &adev->mes.hung_queue_db_array_gpu_obj,
+                                           &adev->mes.hung_queue_db_array_gpu_addr,
+                                           &adev->mes.hung_queue_db_array_cpu_addr);
+               if (r) {
+                       dev_warn(adev->dev, "failed to create MES hung db array buffer (%d)", r);
+                       goto error_doorbell;
+               }
+       }
+
        return 0;
 
 error_doorbell:
@@ -216,6 +230,10 @@ void amdgpu_mes_fini(struct amdgpu_device *adev)
 {
        int i;
 
+       amdgpu_bo_free_kernel(&adev->mes.hung_queue_db_array_gpu_obj,
+                             &adev->mes.hung_queue_db_array_gpu_addr,
+                             &adev->mes.hung_queue_db_array_cpu_addr);
+
        amdgpu_bo_free_kernel(&adev->mes.event_log_gpu_obj,
                              &adev->mes.event_log_gpu_addr,
                              &adev->mes.event_log_cpu_addr);
@@ -366,6 +384,53 @@ int amdgpu_mes_reset_legacy_queue(struct amdgpu_device *adev,
        return r;
 }
 
+/* Return the hung-queue doorbell array size recorded in adev->mes. */
+int amdgpu_mes_get_hung_queue_db_array_size(struct amdgpu_device *adev)
+{
+       const struct amdgpu_mes *mes = &adev->mes;
+
+       return mes->hung_queue_db_array_size;
+}
+
+/**
+ * amdgpu_mes_detect_and_reset_hung_queues - query MES for hung queues
+ * @adev: amdgpu device
+ * @queue_type: AMDGPU_RING_TYPE_GFX, AMDGPU_RING_TYPE_COMPUTE or
+ *              AMDGPU_RING_TYPE_SDMA; anything else is rejected
+ * @detect_only: forwarded to the MES backend unchanged; presumably true
+ *               means "report only, do not reset" (confirm in the backend)
+ * @hung_db_num: out, number of valid doorbell entries found in the array
+ * @hung_db_array: out, caller buffer with room for at least
+ *                 amdgpu_mes_get_hung_queue_db_array_size() u32 entries
+ *
+ * The shared doorbell array is pre-filled with AMDGPU_MES_INVALID_DB_OFFSET
+ * so that only slots the backend actually writes are counted.  Each valid
+ * entry is copied to the same index in @hung_db_array; slots that remain
+ * invalid are left untouched in the caller's buffer.
+ * NOTE(review): callers that read only the first *@hung_db_num entries rely
+ * on MES packing its results from index 0 - confirm.
+ *
+ * Returns 0 on success, -EINVAL for bad arguments or when the backend hook
+ * or the doorbell array is missing, otherwise the backend's error code.
+ * @hung_db_num and @hung_db_array are not modified on failure.
+ */
+int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
+                                           int queue_type,
+                                           bool detect_only,
+                                           unsigned int *hung_db_num,
+                                           u32 *hung_db_array)
+{
+       struct mes_detect_and_reset_queue_input input = { 0 };
+       u32 *db_array = adev->mes.hung_queue_db_array_cpu_addr;
+       int r, i;
+
+       if (!hung_db_num || !hung_db_array)
+               return -EINVAL;
+
+       if ((queue_type != AMDGPU_RING_TYPE_GFX) &&
+           (queue_type != AMDGPU_RING_TYPE_COMPUTE) &&
+           (queue_type != AMDGPU_RING_TYPE_SDMA))
+               return -EINVAL;
+
+       /* Without a backend hook or an allocated array nothing can be reported. */
+       if (!adev->mes.funcs->detect_and_reset_hung_queues || !db_array)
+               return -EINVAL;
+
+       /*
+        * Mark every slot invalid before detection.  memset() replicates the
+        * low byte (0xff) of the fill value, so each u32 reads back as
+        * AMDGPU_MES_INVALID_DB_OFFSET.  Zero-filling instead would make every
+        * slot the backend does not write look like a valid doorbell below.
+        */
+       memset(db_array, AMDGPU_MES_INVALID_DB_OFFSET,
+              adev->mes.hung_queue_db_array_size * sizeof(u32));
+       input.queue_type = queue_type;
+       input.detect_only = detect_only;
+
+       r = adev->mes.funcs->detect_and_reset_hung_queues(&adev->mes,
+                                                         &input);
+       if (r) {
+               dev_err(adev->dev, "failed to detect and reset\n");
+               return r;
+       }
+
+       *hung_db_num = 0;
+       for (i = 0; i < adev->mes.hung_queue_db_array_size; i++) {
+               if (db_array[i] == AMDGPU_MES_INVALID_DB_OFFSET)
+                       continue;
+
+               hung_db_array[i] = db_array[i];
+               *hung_db_num += 1;
+       }
+
+       return 0;
+}
+
 uint32_t amdgpu_mes_rreg(struct amdgpu_device *adev, uint32_t reg)
 {
        struct mes_misc_op_input op_input;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index c0d2c195fe2e..2c4568951edb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -41,6 +41,7 @@
 #define AMDGPU_MES_API_VERSION_MASK    0x00fff000
 #define AMDGPU_MES_FEAT_VERSION_MASK   0xff000000
 #define AMDGPU_MES_MSCRATCH_SIZE       0x40000
+#define AMDGPU_MES_INVALID_DB_OFFSET   0xffffffff
 
 enum amdgpu_mes_priority_level {
        AMDGPU_MES_PRIORITY_LEVEL_LOW       = 0,
@@ -147,6 +148,10 @@ struct amdgpu_mes {
        uint64_t            resource_1_gpu_addr[AMDGPU_MAX_MES_PIPES];
        void                *resource_1_addr[AMDGPU_MAX_MES_PIPES];
 
+       int                             hung_queue_db_array_size;
+       struct amdgpu_bo                *hung_queue_db_array_gpu_obj;
+       uint64_t                        hung_queue_db_array_gpu_addr;
+       void                            *hung_queue_db_array_cpu_addr;
 };
 
 struct amdgpu_mes_gang {
@@ -280,6 +285,11 @@ struct mes_reset_queue_input {
        bool                               is_kq;
 };
 
+/* Input handed to the amdgpu_mes_funcs detect_and_reset_hung_queues callback. */
+struct mes_detect_and_reset_queue_input {
+       /* Ring type to scan; the amdgpu_mes.c helper only passes GFX, COMPUTE or SDMA. */
+       uint32_t                           queue_type;
+       /* Caller's detect_only flag; presumably "report without resetting" - confirm in the MES backend. */
+       bool                               detect_only;
+};
+
 enum mes_misc_opcode {
        MES_MISC_OP_WRITE_REG,
        MES_MISC_OP_READ_REG,
@@ -367,6 +377,8 @@ struct amdgpu_mes_funcs {
 
        int (*reset_hw_queue)(struct amdgpu_mes *mes,
                              struct mes_reset_queue_input *input);
+       int (*detect_and_reset_hung_queues)(struct amdgpu_mes *mes,
+                                           struct mes_detect_and_reset_queue_input *input);
 };
 
 #define amdgpu_mes_kiq_hw_init(adev) (adev)->mes.kiq_hw_init((adev))
@@ -390,6 +402,13 @@ int amdgpu_mes_reset_legacy_queue(struct amdgpu_device *adev,
                                  unsigned int vmid,
                                  bool use_mmio);
 
+int amdgpu_mes_get_hung_queue_db_array_size(struct amdgpu_device *adev);
+int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
+                                           int queue_type,
+                                           bool detect_only,
+                                           unsigned int *hung_db_num,
+                                           u32 *hung_db_array);
+
 uint32_t amdgpu_mes_rreg(struct amdgpu_device *adev, uint32_t reg);
 int amdgpu_mes_wreg(struct amdgpu_device *adev,
                    uint32_t reg, uint32_t val);
-- 
2.49.0

Reply via email to