[AMD Official Use Only - AMD Internal Distribution Only]

Thanks for the explanation.
You can add Reviewed-by: Shaoyun.liu <[email protected]<mailto:[email protected]>> 
for this series.

From: Lin, Amber <[email protected]>
Sent: Thursday, March 26, 2026 1:31 PM
To: Liu, Shaoyun <[email protected]>; [email protected]; 
[email protected]
Cc: Chen, Michael <[email protected]>; Zhang, Jesse(Jie) 
<[email protected]>; Kim, Jonathan <[email protected]>
Subject: Re: [PATCH v2 09/10] drm/amdkfd: Reset queue/pipe in MES

Yeah I should have made this series' cover letter more clear... This series 
only supports and enables detect-and-reset-in-MES for gfx 12.1 for user compute 
queues. KCQ is not used in gfx 12.1.

Yes, KFD removes hung queues after detect-and-reset and notifies the user by 
sending a reset signal. Please see my inline replies below: 
amdgpu_mes_detect_and_reset_hung_queues is called first, followed by a 
remove_queue_mes_on_reset_option call for each hung queue, and finally 
kfd_signal_reset_event.

Regards,

Amber

On 3/26/26 12:06, Liu, Shaoyun wrote:

[AMD Official Use Only - AMD Internal Distribution Only]



Maybe I missed something. Can you explain a little more about how the driver 
will clean up the hung queues in the failed list after the reset & detect 
call? Will the driver call remove_queue on each of them and notify the user? 
If the hung queue is a KCQ, will the driver create a new KCQ?



Regards

Shaoyun.liu



-----Original Message-----

From: Lin, Amber <[email protected]><mailto:[email protected]>

Sent: Tuesday, March 24, 2026 1:57 PM

To: [email protected]<mailto:[email protected]>; 
[email protected]<mailto:[email protected]>

Cc: Liu, Shaoyun <[email protected]><mailto:[email protected]>; Chen, 
Michael <[email protected]><mailto:[email protected]>; Zhang, Jesse(Jie) 
<[email protected]><mailto:[email protected]>; Lin, Amber 
<[email protected]><mailto:[email protected]>; Kim, Jonathan 
<[email protected]><mailto:[email protected]>

Subject: [PATCH v2 09/10] drm/amdkfd: Reset queue/pipe in MES



When removing queues fails, KFD calls amdgpu_mes to detect and reset hung 
queues, then cleans up those hung queues in KFD.



Suggested-by: Jonathan Kim <[email protected]><mailto:[email protected]>

Signed-off-by: Amber Lin <[email protected]><mailto:[email protected]>

---

 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c       |   6 +

 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h       |   1 +

 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 147 +++++++++++++++++-

 drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |   1 +

 4 files changed, 153 insertions(+), 2 deletions(-)



diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c

index f3a4ae1fd521..7cf4b3d6fc93 100644

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c

+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c

@@ -793,6 +793,12 @@ bool amdgpu_mes_suspend_resume_all_supported(struct 
amdgpu_device *adev)

                amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(12, 0, 0));  }



+bool amdgpu_mes_queue_reset_by_mes_supported(struct amdgpu_device

+*adev) {

+       return (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 1, 0) &&

+               (adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 0x73); }

+

 /* Fix me -- node_id is used to identify the correct MES instances in the 
future */  static int amdgpu_mes_set_enforce_isolation(struct amdgpu_device 
*adev,

                                            uint32_t node_id, bool enable) diff 
--git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h

index 643b4f8d757a..44fa4d73bce8 100644

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h

+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h

@@ -548,6 +548,7 @@ static inline void amdgpu_mes_unlock(struct amdgpu_mes 
*mes)  }



 bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev);

+bool amdgpu_mes_queue_reset_by_mes_supported(struct amdgpu_device

+*adev);



 int amdgpu_mes_update_enforce_isolation(struct amdgpu_device *adev);



diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c

index ec8d7f4be840..1c9c350bfffe 100644

--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c

+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c

@@ -71,6 +71,12 @@ static int allocate_sdma_queue(struct device_queue_manager 
*dqm,

                                struct queue *q, const uint32_t 
*restore_sdma_id);



 static int reset_queues_on_hws_hang(struct device_queue_manager *dqm, bool 
is_sdma);

+static int resume_all_queues_mes(struct device_queue_manager *dqm);

+static int suspend_all_queues_mes(struct device_queue_manager *dqm);

+static struct queue *find_queue_by_doorbell_offset(struct device_queue_manager 
*dqm,

+                                                  uint32_t doorbell_offset);

+static void set_queue_as_reset(struct device_queue_manager *dqm, struct queue 
*q,

+                              struct qcm_process_device *qpd);



 static inline

 enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type) @@ 
-273,13 +279,19 @@ static int add_queue_mes(struct device_queue_manager *dqm, 
struct queue *q,

        return r;

 }



-static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,

-                       struct qcm_process_device *qpd)

+static int remove_queue_mes_on_reset_option(struct device_queue_manager *dqm, 
struct queue *q,

+                                           struct qcm_process_device *qpd,

+                                           bool is_for_reset,

+                                           bool flush_mes_queue)

 {

        struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;

        int r;

        struct mes_remove_queue_input queue_input;



+       /* queue was already removed during reset */

+       if (q->properties.is_reset)

+               return 0;

+

        if (!dqm->sched_running || dqm->sched_halt)

                return 0;

        if (!down_read_trylock(&adev->reset_domain->sem))

@@ -288,6 +300,7 @@ static int remove_queue_mes(struct device_queue_manager 
*dqm, struct queue *q,

        memset(&queue_input, 0x0, sizeof(struct mes_remove_queue_input));

        queue_input.doorbell_offset = q->properties.doorbell_off;

        queue_input.gang_context_addr = q->gang_ctx_gpu_addr;

+       queue_input.remove_queue_after_reset = flush_mes_queue;

        queue_input.xcc_id = ffs(dqm->dev->xcc_mask) - 1;



        amdgpu_mes_lock(&adev->mes);

@@ -295,7 +308,13 @@ static int remove_queue_mes(struct device_queue_manager 
*dqm, struct queue *q,

        amdgpu_mes_unlock(&adev->mes);

        up_read(&adev->reset_domain->sem);



+       if (is_for_reset)

+               return r;

+

        if (r) {

+               if (!suspend_all_queues_mes(dqm))

+                       return resume_all_queues_mes(dqm);

+

                dev_err(adev->dev, "failed to remove hardware queue from MES, 
doorbell=0x%x\n",

                        q->properties.doorbell_off);

                dev_err(adev->dev, "MES might be in unrecoverable state, issue 
a GPU reset\n"); @@ -305,6 +324,12 @@ static int remove_queue_mes(struct 
device_queue_manager *dqm, struct queue *q,

        return r;

 }



+static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,

+                           struct qcm_process_device *qpd)

+{

+       return remove_queue_mes_on_reset_option(dqm, q, qpd, false, false); }

+

 static int remove_all_kfd_queues_mes(struct device_queue_manager *dqm)  {

        struct device_process_node *cur;

@@ -359,6 +384,103 @@ static int add_all_kfd_queues_mes(struct 
device_queue_manager *dqm)

        return retval;

 }



+static int reset_queues_mes(struct device_queue_manager *dqm) {

+       struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;

+       struct amdgpu_mes_hung_queue_hqd_info *hqd_info;

+       int hqd_info_size = adev->mes.hung_queue_hqd_info_offset;

+       int num_hung = 0, r = 0, i, pipe, queue, queue_type;

+       uint32_t *hung_array;

+       struct kfd_process_device *pdd;

+       struct queue *q;

+

+       if (!amdgpu_mes_queue_reset_by_mes_supported(adev)) {

+               r = -ENOTRECOVERABLE;

+               goto fail;

+       }

+

+       /* reset should be used only in dqm locked queue reset */

+       if (WARN_ON(dqm->detect_hang_count > 0))

+               return 0;

+

+       if (!amdgpu_gpu_recovery) {

+               r = -ENOTRECOVERABLE;

+               goto fail;

+       }

+

+       hung_array = kzalloc(adev->mes.hung_queue_db_array_size * 
sizeof(uint32_t), GFP_KERNEL);

+       if (!hung_array) {

+               r = -ENOMEM;

+               goto fail;

+       }

+

+       hqd_info = kzalloc(hqd_info_size * sizeof(struct 
amdgpu_mes_hung_queue_hqd_info), GFP_KERNEL);

+       if (!hqd_info) {

+               r = -ENOMEM;

+               goto free_hung_array;

+       }

+

+       memset(hqd_info, 0, hqd_info_size * sizeof(struct

+amdgpu_mes_hung_queue_hqd_info));

+

+       /*

+        * AMDGPU_RING_TYPE_COMPUTE parameter does not matter if called

+        * post suspend_all as reset & detect will return all hung queue types.

+        *

+        * Passed parameter is for targeting queues not scheduled by MES 
add_queue.

+        */

+       r =  amdgpu_mes_detect_and_reset_hung_queues(adev, 
AMDGPU_RING_TYPE_COMPUTE,

+               false, &num_hung, hung_array, ffs(dqm->dev->xcc_mask) - 1);
This is where KFD calls amdgpu_mes_detect_and_reset_hung_queues with 
detect_only=false. When MES completes the reset successfully, KFD looks up the 
hung queues and calls remove_queue_mes_on_reset_option below.



+

+       if (!num_hung || r) {

+               r = -ENOTRECOVERABLE;

+               goto free_hqd_info;

+       }

+

+       /* MES reset resets queue/pipe and cleans up internally  */

+       for (i = 0; i < num_hung; i++) {

+               hqd_info[i].bit0_31 = hung_array[i + hqd_info_size];

+               pipe = hqd_info[i].pipe_index;

+               queue = hqd_info[i].queue_index;

+               queue_type = hqd_info[i].queue_type;

+

+               if (queue_type != MES_QUEUE_TYPE_COMPUTE &&

+                   queue_type != MES_QUEUE_TYPE_SDMA) {

+                       pr_warn("Unsupported hung queue reset type: %d\n", 
queue_type);

+                       hung_array[i] = AMDGPU_MES_INVALID_DB_OFFSET;

+                       continue;

+               }

+

+               q = find_queue_by_doorbell_offset(dqm, hung_array[i]);

+               if (!q) {

+                       r = -ENOTRECOVERABLE;

+                       goto free_hqd_info;

+               }

+

+               pdd = kfd_get_process_device_data(q->device, q->process);

+               if (!pdd) {

+                       r = -ENODEV;

+                       goto free_hqd_info;

+               }

+

+               pr_warn("Hang detected doorbell %x pipe %d queue %d type %d\n",

+                               hung_array[i], pipe, queue, queue_type);

+               /* Proceed remove_queue with reset=true */

+               remove_queue_mes_on_reset_option(dqm, q, &pdd->qpd, true, 
false);
This is where KFD calls remove_queue_mes_on_reset_option to clean up bad 
queues created on user compute.



+               set_queue_as_reset(dqm, q, &pdd->qpd);

+       }

+

+       dqm->detect_hang_count = num_hung;

+       kfd_signal_reset_event(dqm->dev);
This is where KFD notifies the user process about the reset event: 
kfd_signal_reset_event



+

+free_hqd_info:

+       kfree(hqd_info);

+free_hung_array:

+       kfree(hung_array);

+fail:

+       dqm->detect_hang_count = 0;

+       return r;

+}

+

 static int suspend_all_queues_mes(struct device_queue_manager *dqm)  {

        struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev; @@ 
-371,6 +493,9 @@ static int suspend_all_queues_mes(struct device_queue_manager 
*dqm)

        up_read(&adev->reset_domain->sem);



        if (r) {

+               if (!reset_queues_mes(dqm))

+                       return 0;

+

                dev_err(adev->dev, "failed to suspend gangs from MES\n");

                dev_err(adev->dev, "MES might be in unrecoverable state, issue 
a GPU reset\n");

                kfd_hws_hang(dqm);

@@ -2137,6 +2262,7 @@ static void set_queue_as_reset(struct 
device_queue_manager *dqm, struct queue *q

                q->properties.queue_id, pdd->process->lead_thread->pid);



        pdd->has_reset_queue = true;

+       q->properties.is_reset = true;

        if (q->properties.is_active) {

                q->properties.is_active = false;

                decrement_queue_count(dqm, qpd, q);

@@ -2203,6 +2329,23 @@ static struct queue *find_queue_by_address(struct 
device_queue_manager *dqm, uin

        return NULL;

 }



+static struct queue *find_queue_by_doorbell_offset(struct

+device_queue_manager *dqm, uint32_t doorbell_offset) {

+       struct device_process_node *cur;

+       struct qcm_process_device *qpd;

+       struct queue *q;

+

+       list_for_each_entry(cur, &dqm->queues, list) {

+               qpd = cur->qpd;

+               list_for_each_entry(q, &qpd->queues_list, list) {

+                       if (doorbell_offset == q->properties.doorbell_off)

+                               return q;

+               }

+       }

+

+       return NULL;

+}

+

 static int reset_hung_queues(struct device_queue_manager *dqm)  {

        int r = 0, reset_count = 0, i;

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h

index 27e4859e4ad7..6cb33f6d71e2 100644

--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h

+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h

@@ -523,6 +523,7 @@ struct queue_properties {

        uint32_t pm4_target_xcc;

        bool is_dbg_wa;

        bool is_user_cu_masked;

+       bool is_reset;

        /* Not relevant for user mode queues in cp scheduling */

        unsigned int vmid;

        /* Relevant only for sdma queues*/

--

2.43.0



Reply via email to