This commit introduces hang detection infrastructure for usermode queues by:

1. Adding userq_hang_info structure to track:
   - Queue identification (ME/MEC, pipe, queue)
   - VMID and XCC ID (the XCC ID field is reserved; this patch does not fill it in yet)
   - Queue GPU address

2. Implementing amdgpu_userqueue_detect_hang() which:
   - Scans active hardware queues to find matching HQD addresses
   - Supports both GFX and Compute queue types
   - Identifies each slot by ME, pipe and queue (compute MEC n is recorded as ME n + 1)
   - Stores found queue information in hang_info structure

3. Integrating hang detection with existing queue structures:
   - Added hang_info to amdgpu_usermode_queue
   - Maintained compatibility with existing reset mechanisms

The implementation:
- Matches hardware queue organization (ME->pipe->queue)
- Uses existing MQD functions for HQD address lookup
- Provides all necessary information for targeted resets

 Signed-off-by: Jesse Zhang <jesse.zh...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 79 +++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h | 11 ++++
 2 files changed, 90 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index aac0de86f3e8..82fef5e3ddea 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -44,6 +44,85 @@ u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device 
*adev)
        return userq_ip_mask;
 }
 
+/**
+ * amdgpu_userqueue_detect_hang - locate the HQD slot backing a user queue
+ * @uqm: user queue manager; only its adev pointer is used here
+ * @queue: usermode queue whose HQD base GPU address is searched for
+ *
+ * Compares the PQ address the hqd_get_pq_addr() MQD hook reports for every GFX
+ * or compute (me, pipe, queue) slot with the queue's own HQD base address.
+ * Return: true with queue->hang_info (except xcc_id) filled on a match, else false.
+ */
+static bool amdgpu_userqueue_detect_hang(struct amdgpu_userq_mgr *uqm,
+                                         struct amdgpu_usermode_queue *queue)
+{
+       struct amdgpu_device *adev = uqm->adev;
+       struct userq_hang_info *hang_info = &queue->hang_info;
+       struct amdgpu_mqd *mqd_hw_default = &adev->mqds[queue->queue_type];
+       int queue_type = queue->queue_type;
+       uint64_t hqd_pq_base = queue->userq_prop->hqd_base_gpu_addr;
+       uint64_t hqd_addr;
+       uint32_t mec, me, pipe, q, vmid = 0;
+
+       if (!mqd_hw_default->hqd_get_pq_addr)
+               return false;
+
+       switch (queue_type) {
+       case AMDGPU_HW_IP_GFX:
+               for (me = 0; me < adev->gfx.me.num_me; me++) {
+                       for (q = 0; q < adev->gfx.me.num_queue_per_pipe; q++) {
+                               for (pipe = 0; pipe < adev->gfx.me.num_pipe_per_me; pipe++) {
+                                       hqd_addr = mqd_hw_default->hqd_get_pq_addr(adev,
+                                                       queue_type, me, pipe, q, 0, &vmid);
+                                       if (!hqd_addr)
+                                               continue;
+                                       /* Check if this HQD matches the target queue */
+                                       if (hqd_addr == hqd_pq_base) {
+                                               hang_info->me = me;
+                                               hang_info->pipe = pipe;
+                                               hang_info->queue = q;
+                                               hang_info->queue_address = hqd_addr;
+                                               hang_info->vmid = vmid;
+                                               return true;
+                                       }
+                               }
+                       }
+               }
+               break;
+       case AMDGPU_HW_IP_COMPUTE:
+               for (mec = 0; mec < adev->gfx.mec.num_mec; ++mec) {
+                       /* mec0 is me1; derive it without touching the loop counter */
+                       me = mec + 1;
+                       for (q = 0; q < adev->gfx.mec.num_queue_per_pipe; q++) {
+                               for (pipe = 0; pipe < adev->gfx.mec.num_pipe_per_mec; pipe++) {
+                                       hqd_addr = mqd_hw_default->hqd_get_pq_addr(adev,
+                                                       queue_type, me, pipe, q, 0, &vmid);
+                                       if (!hqd_addr)
+                                               continue;
+                                       /* Check for address match to determine hang */
+                                       if (hqd_addr == hqd_pq_base) {
+                                               hang_info->me = me;
+                                               hang_info->pipe = pipe;
+                                               hang_info->queue = q;
+                                               hang_info->queue_address = hqd_addr;
+                                               hang_info->vmid = vmid;
+                                               return true;
+                                       }
+                               }
+                       }
+               }
+               break;
+       case AMDGPU_HW_IP_DMA:
+       case AMDGPU_HW_IP_VCN_ENC:
+       case AMDGPU_HW_IP_VPE:
+       default:
+               /* These queue types are not yet supported in hang detection */
+               break;
+       }
+
+       return false;
+}
+
 static int
 amdgpu_userq_unmap_helper(struct amdgpu_userq_mgr *uq_mgr,
                          struct amdgpu_usermode_queue *queue)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
index ec040c2fd6c9..0d44d7a3b7bc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
@@ -47,6 +47,15 @@ struct amdgpu_userq_obj {
        struct amdgpu_bo *obj;
 };
 
+struct userq_hang_info {       /* HQD slot found for a user queue by amdgpu_userqueue_detect_hang() */
+       int me;                 /* ME index; for compute queues this is the MEC index + 1 */
+       int pipe;               /* pipe index within that ME/MEC */
+       int queue;              /* queue index within that pipe */
+       int xcc_id;             /* NOTE(review): never written by the detection code in this patch */
+       int vmid;               /* VMID returned by hqd_get_pq_addr() for the matching HQD */
+       uint64_t queue_address; /* PQ address read back from the matching HQD */
+};
+
 struct amdgpu_usermode_queue {
        int                     queue_type;
        enum amdgpu_userq_state state;
@@ -65,6 +74,8 @@ struct amdgpu_usermode_queue {
        struct dma_fence        *last_fence;
        u32                     xcp_id;
        int                     priority;
+       /* for per-queue reset support */
+       struct userq_hang_info hang_info;
 };
 
 struct amdgpu_userq_funcs {
-- 
2.49.0

Reply via email to