The CP priv-fault IRQ carries the offending queue's per-process
doorbell_id in src_data[0][9:0] (the same encoding KFD extracts via
KFD_CTXID0_DOORBELL_ID_MASK), with the pasid in entry->pasid. The
existing lookup keys on the BAR-absolute doorbell_index, which never
matches this payload, so the targeted-reset path is never taken.

Cache args->in.doorbell_offset on the queue at creation time and find
the offending queue by the (vm->pasid, doorbell_offset) pair. Add a
queue->guilty flag that the IRQ path sets and hang_detect_work
consumes via xchg(); on a hit, route the compute reset through
amdgpu_gfx_reset_mes_compute(..., queue, ...). Fire hang_detect
immediately via mod_delayed_work(..., 0), because queue_delayed_work()
is a no-op when the work is already armed at submit time.

Signed-off-by: Jesse Zhang <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 37 ++++++++++++++++++-----
 drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h |  5 ++-
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c    | 10 ++++--
 drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c    | 10 ++++--
 4 files changed, 49 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index 99c5adf3cb24..231ffb29fe5e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -140,9 +140,14 @@ static void amdgpu_userq_hang_detect_work(struct work_struct *work)
                                                 AMDGPU_RESET_TYPE_PER_QUEUE)) {
                int r;
 
-               if (queue->queue_type == AMDGPU_HW_IP_COMPUTE)
-                       r = amdgpu_gfx_reset_mes_compute(adev, NULL, NULL, NULL, NULL, NULL);
-               else
+               if (queue->queue_type == AMDGPU_HW_IP_COMPUTE) {
+                       struct amdgpu_usermode_queue *guilty_uq;
+
+                       /* IRQ-side WRITE_ONCE(guilty,true) hand-off via xchg */
+                       guilty_uq = xchg(&queue->guilty, false) ? queue : NULL;
+                       r = amdgpu_gfx_reset_mes_compute(adev, NULL, NULL,
+                                                        guilty_uq, NULL, NULL);
+               } else
                        r = userq_funcs->reset(queue);
                if (r)
                        gpu_reset = true;
@@ -671,6 +676,7 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
        }
 
        queue->doorbell_index = index;
+       queue->doorbell_offset = (u32)args->in.doorbell_offset;
        r = uq_funcs->mqd_create(queue, &args->in);
        if (r) {
                drm_file_err(uq_mgr->file, "Failed to create Queue\n");
@@ -1111,16 +1117,31 @@ static void amdgpu_userq_restore_worker(struct work_struct *work)
        dma_fence_put(ev_fence);
 }
 
-void amdgpu_userq_process_reset_irq(struct amdgpu_device *adev, u32 doorbell)
+void amdgpu_userq_process_reset_irq(struct amdgpu_device *adev,
+                                   u32 pasid, u32 doorbell_offset)
 {
        struct xarray *xa = &adev->userq_doorbell_xa;
        struct amdgpu_usermode_queue *queue;
-       unsigned long flags;
+       unsigned long flags, idx;
 
+       /*
+        * CP priv-fault payload is (pasid, src_data[0] & 0x3ff) — the same
+        * per-process doorbell encoding KFD extracts via
+        * KFD_CTXID0_DOORBELL_ID_MASK. Find the offending queue by the
+        * (vm->pasid, doorbell_offset) pair, mark it guilty and fire
+        * hang_detect immediately (queue_delayed_work() would no-op if the
+        * work is already armed at submit time).
+        */
        xa_lock_irqsave(xa, flags);
-       queue = xa_load(xa, doorbell);
-       if (queue)
-               amdgpu_userq_start_hang_detect_work(queue);
+       xa_for_each(xa, idx, queue) {
+               if (queue->vm && queue->vm->pasid == pasid &&
+                   queue->doorbell_offset == doorbell_offset) {
+                       WRITE_ONCE(queue->guilty, true);
+                       mod_delayed_work(adev->reset_domain->wq,
+                                        &queue->hang_detect_work, 0);
+                       break;
+               }
+       }
        xa_unlock_irqrestore(xa, flags);
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
index 5a2ae33135da..1e765cd765ab 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
@@ -53,6 +53,7 @@ struct amdgpu_usermode_queue {
        enum amdgpu_userq_state state;
        uint64_t                doorbell_handle;
        uint64_t                doorbell_index;
+       u32                     doorbell_offset;
        uint64_t                flags;
        struct amdgpu_mqd_prop  *userq_prop;
        struct amdgpu_userq_mgr *userq_mgr;
@@ -86,6 +87,7 @@ struct amdgpu_usermode_queue {
         * Delayed work which runs when userq_fences time out.
         */
        struct delayed_work     hang_detect_work;
+       bool                    guilty;
        struct kref             refcount;
 
        union {
@@ -175,7 +177,8 @@ void amdgpu_userq_pre_reset(struct amdgpu_device *adev);
 int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost);
 void amdgpu_userq_start_hang_detect_work(struct amdgpu_usermode_queue *queue);
 void amdgpu_userq_process_fence_irq(struct amdgpu_device *adev, u32 doorbell);
-void amdgpu_userq_process_reset_irq(struct amdgpu_device *adev, u32 doorbell);
+void amdgpu_userq_process_reset_irq(struct amdgpu_device *adev,
+                                   u32 pasid, u32 doorbell_offset);
 
 int amdgpu_userq_input_va_validate(struct amdgpu_device *adev,
                                   struct amdgpu_usermode_queue *queue,
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index ea323d241324..27d0a3dbfce8 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -6681,10 +6681,16 @@ static int gfx_v11_0_set_priv_inst_fault_state(struct amdgpu_device *adev,
 static void gfx_v11_0_handle_priv_fault(struct amdgpu_device *adev,
                                        struct amdgpu_iv_entry *entry)
 {
-       u32 doorbell_offset = entry->src_data[0];
+       /*
+        * CP packs the per-process doorbell_id in src_data[0][9:0]; upper
+        * bits hold other CTXID0 fields. Same mask KFD uses
+        * (KFD_CTXID0_DOORBELL_ID_MASK).
+        */
+       u32 doorbell_offset = entry->src_data[0] & 0x3ff;
 
        if (adev->enable_mes && doorbell_offset) {
-               amdgpu_userq_process_reset_irq(adev, doorbell_offset);
+               amdgpu_userq_process_reset_irq(adev, entry->pasid,
+                                              doorbell_offset);
        } else if (!adev->gfx.disable_kq) {
                u8 me_id = (entry->ring_id & 0x0c) >> 2;
                u8 pipe_id = (entry->ring_id & 0x03) >> 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
index b3f1bbf3fc13..7c269cb75e07 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
@@ -5009,10 +5009,16 @@ static int gfx_v12_0_set_priv_inst_fault_state(struct amdgpu_device *adev,
 static void gfx_v12_0_handle_priv_fault(struct amdgpu_device *adev,
                                        struct amdgpu_iv_entry *entry)
 {
-       u32 doorbell_offset = entry->src_data[0];
+       /*
+        * CP packs the per-process doorbell_id in src_data[0][9:0]; upper
+        * bits hold other CTXID0 fields. Same mask KFD uses
+        * (KFD_CTXID0_DOORBELL_ID_MASK).
+        */
+       u32 doorbell_offset = entry->src_data[0] & 0x3ff;
 
        if (adev->enable_mes && doorbell_offset) {
-               amdgpu_userq_process_reset_irq(adev, doorbell_offset);
+               amdgpu_userq_process_reset_irq(adev, entry->pasid,
+                                              doorbell_offset);
        } else if (!adev->gfx.disable_kq) {
                u8 me_id, pipe_id, queue_id;
                struct amdgpu_ring *ring;
-- 
2.49.0

Reply via email to