This commit introduces hang detection infrastructure for usermode queues by:
1. Adding userq_hang_info structure to track: - Queue identification (ME/MEC, pipe, queue) - VMID and XCC ID - Queue GPU address 2. Implementing amdgpu_userqueue_detect_hang() which: - Scans active hardware queues to find matching HQD addresses - Supports both GFX and Compute queue types - Properly handles queue identification hierarchy - Stores found queue information in hang_info structure 3. Integrating hang detection with existing queue structures: - Added hang_info to amdgpu_usermode_queue - Maintained compatibility with existing reset mechanisms The implementation: - Matches hardware queue organization (ME->pipe->queue) - Uses existing MQD functions for HQD address lookup - Provides all necessary information for targeted resets Signed-off-by: Jesse Zhang <jesse.zh...@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 79 +++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h | 11 ++++ 2 files changed, 90 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c index aac0de86f3e8..82fef5e3ddea 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c @@ -44,6 +44,85 @@ u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev) return userq_ip_mask; } +/** + * Detect if a given usermode queue is hung by comparing its GPU address + * to existing HQD addresses in the hardware. + * + * @uqm - User queue manager containing context and tracking structures + * @queue - The usermode queue to check for hang + * + * @return - bool, hang detection info is stored in hang_info if match found. + */ +static bool amdgpu_userqueue_detect_hang(struct amdgpu_userq_mgr *uqm, struct amdgpu_usermode_queue *queue) +{ + struct amdgpu_device *adev = uqm->adev; + struct userq_hang_info *hang_info = &queue->hang_info; + struct amdgpu_mqd *mqd_hw_default = &adev->mqds[queue->queue_type]; + + int queue_type = queue->queue_type; + uint64_t hdq_pq_base = queue->userq_prop->hqd_base_gpu_addr; + + uint64_t hqd_addr = 0; + uint32_t mec, me, pipe, q, vmid; + + switch (queue_type) { + case AMDGPU_HW_IP_GFX: + for (me = 0; me < adev->gfx.me.num_me; me++) { + for (q = 0; q < adev->gfx.me.num_queue_per_pipe; q++) { + for (pipe = 0; pipe < adev->gfx.me.num_pipe_per_me; pipe++) { + + hqd_addr = mqd_hw_default->hqd_get_pq_addr(adev, queue_type, + me, pipe, q, 0, &vmid); + if (!hqd_addr) + continue; + /* Check if this HQD matches the target queue */ + if (hqd_addr == hdq_pq_base) { + hang_info->me = me; + hang_info->pipe = pipe; + hang_info->queue = q; + hang_info->queue_address = hqd_addr; + hang_info->vmid = vmid; + return true; + } + } + } + } + break; + case AMDGPU_HW_IP_COMPUTE: + for (mec = 0; mec < adev->gfx.mec.num_mec; ++mec) { + /* mec0 is me1 */ + mec +=1; + for (q = 0; q < adev->gfx.mec.num_queue_per_pipe; q++) { + for (pipe = 0; pipe < adev->gfx.mec.num_pipe_per_mec; pipe++) { + hqd_addr = mqd_hw_default->hqd_get_pq_addr(adev, queue_type, + mec, pipe, q, 0, &vmid); + if (!hqd_addr) + continue; + + /* Check for address match to determine hang */ + if (hqd_addr == hdq_pq_base) { + hang_info->me = mec; + hang_info->pipe = pipe; + hang_info->queue = q; + hang_info->queue_address = hqd_addr; + hang_info->vmid = vmid; + return true; + } + } + } + } + break; + case AMDGPU_HW_IP_DMA: + case AMDGPU_HW_IP_VCN_ENC: + case AMDGPU_HW_IP_VPE: + default: + /* These queue types are not yet supported in hang detection */ + break; + } + + return false; +} + static int amdgpu_userq_unmap_helper(struct amdgpu_userq_mgr *uq_mgr, struct amdgpu_usermode_queue *queue) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h index ec040c2fd6c9..0d44d7a3b7bc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h @@ -47,6 +47,15 @@ struct amdgpu_userq_obj { struct amdgpu_bo *obj; }; +struct userq_hang_info { + int me; + int pipe; + int queue; + int xcc_id; + int vmid; + uint64_t queue_address; +}; + struct amdgpu_usermode_queue { int queue_type; enum amdgpu_userq_state state; @@ -65,6 +74,8 @@ struct amdgpu_usermode_queue { struct dma_fence *last_fence; u32 xcp_id; int priority; + /* for per-queue reset support */ + struct userq_hang_info hang_info; }; struct amdgpu_userq_funcs { -- 2.49.0