From: Alex Deucher <alexander.deuc...@amd.com> Helper function to detect and reset hung queues. MES will return an array of doorbell indices of which queues are hung and were optionally reset.
v2: Clear the doorbell array before detection Signed-off-by: Alex Deucher <alexander.deuc...@amd.com> Signed-off-by: Jesse Zhang <jesse.zh...@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 65 +++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 19 ++++++++ 2 files changed, 84 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c index 135598502c8d..5bf9be073cdd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c @@ -191,6 +191,20 @@ int amdgpu_mes_init(struct amdgpu_device *adev) if (r) goto error_doorbell; + if (adev->mes.hung_queue_db_array_size) { + r = amdgpu_bo_create_kernel(adev, + adev->mes.hung_queue_db_array_size * sizeof(u32), + PAGE_SIZE, + AMDGPU_GEM_DOMAIN_GTT, + &adev->mes.hung_queue_db_array_gpu_obj, + &adev->mes.hung_queue_db_array_gpu_addr, + &adev->mes.hung_queue_db_array_cpu_addr); + if (r) { + dev_warn(adev->dev, "failed to create MES hung db array buffer (%d)", r); + goto error_doorbell; + } + } + return 0; error_doorbell: @@ -216,6 +230,10 @@ void amdgpu_mes_fini(struct amdgpu_device *adev) { int i; + amdgpu_bo_free_kernel(&adev->mes.hung_queue_db_array_gpu_obj, + &adev->mes.hung_queue_db_array_gpu_addr, + &adev->mes.hung_queue_db_array_cpu_addr); + amdgpu_bo_free_kernel(&adev->mes.event_log_gpu_obj, &adev->mes.event_log_gpu_addr, &adev->mes.event_log_cpu_addr); @@ -366,6 +384,53 @@ int amdgpu_mes_reset_legacy_queue(struct amdgpu_device *adev, return r; } +int amdgpu_mes_get_hung_queue_db_array_size(struct amdgpu_device *adev) +{ + return adev->mes.hung_queue_db_array_size; +} + +int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev, + int queue_type, + bool detect_only, + unsigned int *hung_db_num, + u32 *hung_db_array) + +{ + struct mes_detect_and_reset_queue_input input; + u32 *db_array = adev->mes.hung_queue_db_array_cpu_addr; + int r, i; + + if (!hung_db_num || !hung_db_array) + return -EINVAL; + + if ((queue_type != AMDGPU_RING_TYPE_GFX) && + (queue_type != AMDGPU_RING_TYPE_COMPUTE) && + (queue_type != AMDGPU_RING_TYPE_SDMA)) + return -EINVAL; + + /* Clear the doorbell array before detection */ + memset(adev->mes.hung_queue_db_array_cpu_addr, 0, + adev->mes.hung_queue_db_array_size * sizeof(u32)); + input.queue_type = queue_type; + input.detect_only = detect_only; + + r = adev->mes.funcs->detect_and_reset_hung_queues(&adev->mes, + &input); + if (r) { + dev_err(adev->dev, "failed to detect and reset\n"); + } else { + *hung_db_num = 0; + for (i = 0; i < adev->mes.hung_queue_db_array_size; i++) { + if (db_array[i] != AMDGPU_MES_INVALID_DB_OFFSET) { + hung_db_array[i] = db_array[i]; + *hung_db_num += 1; + } + } + } + + return r; +} + uint32_t amdgpu_mes_rreg(struct amdgpu_device *adev, uint32_t reg) { struct mes_misc_op_input op_input; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h index c0d2c195fe2e..2c4568951edb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h @@ -41,6 +41,7 @@ #define AMDGPU_MES_API_VERSION_MASK 0x00fff000 #define AMDGPU_MES_FEAT_VERSION_MASK 0xff000000 #define AMDGPU_MES_MSCRATCH_SIZE 0x40000 +#define AMDGPU_MES_INVALID_DB_OFFSET 0xffffffff enum amdgpu_mes_priority_level { AMDGPU_MES_PRIORITY_LEVEL_LOW = 0, @@ -147,6 +148,10 @@ struct amdgpu_mes { uint64_t resource_1_gpu_addr[AMDGPU_MAX_MES_PIPES]; void *resource_1_addr[AMDGPU_MAX_MES_PIPES]; + int hung_queue_db_array_size; + struct amdgpu_bo *hung_queue_db_array_gpu_obj; + uint64_t hung_queue_db_array_gpu_addr; + void *hung_queue_db_array_cpu_addr; }; struct amdgpu_mes_gang { @@ -280,6 +285,11 @@ struct mes_reset_queue_input { bool is_kq; }; +struct mes_detect_and_reset_queue_input { + uint32_t queue_type; + bool detect_only; +}; + enum mes_misc_opcode { MES_MISC_OP_WRITE_REG, MES_MISC_OP_READ_REG, @@ -367,6 +377,8 @@ struct amdgpu_mes_funcs { int (*reset_hw_queue)(struct amdgpu_mes *mes, struct mes_reset_queue_input *input); + int (*detect_and_reset_hung_queues)(struct amdgpu_mes *mes, + struct mes_detect_and_reset_queue_input *input); }; #define amdgpu_mes_kiq_hw_init(adev) (adev)->mes.kiq_hw_init((adev)) @@ -390,6 +402,13 @@ int amdgpu_mes_reset_legacy_queue(struct amdgpu_device *adev, unsigned int vmid, bool use_mmio); +int amdgpu_mes_get_hung_queue_db_array_size(struct amdgpu_device *adev); +int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev, + int queue_type, + bool detect_only, + unsigned int *hung_db_num, + u32 *hung_db_array); + uint32_t amdgpu_mes_rreg(struct amdgpu_device *adev, uint32_t reg); int amdgpu_mes_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t val); -- 2.49.0