On Thu, Mar 26, 2026 at 3:40 PM Amber Lin <[email protected]> wrote:
>
>
> Regards,
> Amber
>
>
> On 3/26/26 14:51, Alex Deucher wrote:
>
> On Tue, Mar 24, 2026 at 1:57 PM Amber Lin <[email protected]> wrote:
>
> When removing queues fails, KFD calls amdgpu_mes to detect and reset
> hung queues, then cleans up those hung queues in KFD.
>
> Suggested-by: Jonathan Kim <[email protected]>
> Signed-off-by: Amber Lin <[email protected]>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 6 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 1 +
> .../drm/amd/amdkfd/kfd_device_queue_manager.c | 147 +++++++++++++++++-
> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 +
> 4 files changed, 153 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> index f3a4ae1fd521..7cf4b3d6fc93 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> @@ -793,6 +793,12 @@ bool amdgpu_mes_suspend_resume_all_supported(struct
> amdgpu_device *adev)
> amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(12, 0, 0));
> }
>
> +bool amdgpu_mes_queue_reset_by_mes_supported(struct amdgpu_device *adev)
> +{
> + return (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 1, 0) &&
> + (adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 0x73);
> +}
> +
> /* Fix me -- node_id is used to identify the correct MES instances in the
> future */
> static int amdgpu_mes_set_enforce_isolation(struct amdgpu_device *adev,
> uint32_t node_id, bool enable)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> index 643b4f8d757a..44fa4d73bce8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> @@ -548,6 +548,7 @@ static inline void amdgpu_mes_unlock(struct amdgpu_mes
> *mes)
> }
>
> bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev);
> +bool amdgpu_mes_queue_reset_by_mes_supported(struct amdgpu_device *adev);
>
> int amdgpu_mes_update_enforce_isolation(struct amdgpu_device *adev);
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index ec8d7f4be840..1c9c350bfffe 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -71,6 +71,12 @@ static int allocate_sdma_queue(struct device_queue_manager
> *dqm,
> struct queue *q, const uint32_t
> *restore_sdma_id);
>
> static int reset_queues_on_hws_hang(struct device_queue_manager *dqm, bool
> is_sdma);
> +static int resume_all_queues_mes(struct device_queue_manager *dqm);
> +static int suspend_all_queues_mes(struct device_queue_manager *dqm);
> +static struct queue *find_queue_by_doorbell_offset(struct
> device_queue_manager *dqm,
> + uint32_t doorbell_offset);
> +static void set_queue_as_reset(struct device_queue_manager *dqm, struct
> queue *q,
> + struct qcm_process_device *qpd);
>
> static inline
> enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type)
> @@ -273,13 +279,19 @@ static int add_queue_mes(struct device_queue_manager
> *dqm, struct queue *q,
> return r;
> }
>
> -static int remove_queue_mes(struct device_queue_manager *dqm, struct queue
> *q,
> - struct qcm_process_device *qpd)
> +static int remove_queue_mes_on_reset_option(struct device_queue_manager
> *dqm, struct queue *q,
> + struct qcm_process_device *qpd,
> + bool is_for_reset,
> + bool flush_mes_queue)
> {
> struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
> int r;
> struct mes_remove_queue_input queue_input;
>
> + /* queue was already removed during reset */
> + if (q->properties.is_reset)
> + return 0;
> +
> if (!dqm->sched_running || dqm->sched_halt)
> return 0;
> if (!down_read_trylock(&adev->reset_domain->sem))
> @@ -288,6 +300,7 @@ static int remove_queue_mes(struct device_queue_manager
> *dqm, struct queue *q,
> memset(&queue_input, 0x0, sizeof(struct mes_remove_queue_input));
> queue_input.doorbell_offset = q->properties.doorbell_off;
> queue_input.gang_context_addr = q->gang_ctx_gpu_addr;
> + queue_input.remove_queue_after_reset = flush_mes_queue;
> queue_input.xcc_id = ffs(dqm->dev->xcc_mask) - 1;
>
> amdgpu_mes_lock(&adev->mes);
> @@ -295,7 +308,13 @@ static int remove_queue_mes(struct device_queue_manager
> *dqm, struct queue *q,
> amdgpu_mes_unlock(&adev->mes);
> up_read(&adev->reset_domain->sem);
>
> + if (is_for_reset)
> + return r;
> +
> if (r) {
> + if (!suspend_all_queues_mes(dqm))
> + return resume_all_queues_mes(dqm);
> +
> dev_err(adev->dev, "failed to remove hardware queue from MES,
> doorbell=0x%x\n",
> q->properties.doorbell_off);
> dev_err(adev->dev, "MES might be in unrecoverable state,
> issue a GPU reset\n");
> @@ -305,6 +324,12 @@ static int remove_queue_mes(struct device_queue_manager
> *dqm, struct queue *q,
> return r;
> }
>
> +static int remove_queue_mes(struct device_queue_manager *dqm, struct queue
> *q,
> + struct qcm_process_device *qpd)
> +{
> + return remove_queue_mes_on_reset_option(dqm, q, qpd, false, false);
> +}
> +
> static int remove_all_kfd_queues_mes(struct device_queue_manager *dqm)
> {
> struct device_process_node *cur;
> @@ -359,6 +384,103 @@ static int add_all_kfd_queues_mes(struct
> device_queue_manager *dqm)
> return retval;
> }
>
> +static int reset_queues_mes(struct device_queue_manager *dqm)
> +{
> + struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
> + struct amdgpu_mes_hung_queue_hqd_info *hqd_info;
> + int hqd_info_size = adev->mes.hung_queue_hqd_info_offset;
> + int num_hung = 0, r = 0, i, pipe, queue, queue_type;
> + uint32_t *hung_array;
> + struct kfd_process_device *pdd;
> + struct queue *q;
> +
> + if (!amdgpu_mes_queue_reset_by_mes_supported(adev)) {
> + r = -ENOTRECOVERABLE;
> + goto fail;
> + }
> +
> + /* reset should be used only in dqm locked queue reset */
> + if (WARN_ON(dqm->detect_hang_count > 0))
> + return 0;
> +
> + if (!amdgpu_gpu_recovery) {
> + r = -ENOTRECOVERABLE;
> + goto fail;
> + }
> +
> + hung_array = kzalloc(adev->mes.hung_queue_db_array_size *
> sizeof(uint32_t), GFP_KERNEL);
> + if (!hung_array) {
> + r = -ENOMEM;
> + goto fail;
> + }
> +
> + hqd_info = kzalloc(hqd_info_size * sizeof(struct
> amdgpu_mes_hung_queue_hqd_info), GFP_KERNEL);
>
> We should avoid allocating memory in the reset paths as they can
> deadlock if the kernel is waiting on the reset to get the memory
> needed for the allocation. Can you preallocate this somehow?
>
> Alex
>
> I probably misunderstood the concern here... When we allocate the needed
> memory here, the reset process hasn't started yet; it only starts when we
> call amdgpu_mes_detect_and_reset_hung_queues below.
> amdgpu_mes_detect_and_reset_hung_queues is where the driver prepares the
> detect_and_reset input and then submits the RESET packet to MES.
Right, but we've detected a problem at this point (failure to remove a
queue). Presumably we tried to remove the queue due to some external
factor such as the kernel asking for memory. Once that happens the
kernel may be stuck until we complete the reset and the memory can be
freed. If you really need to allocate memory, you need to use
GFP_ATOMIC.
Alex
>
> + if (!hqd_info) {
> + r = -ENOMEM;
> + goto free_hung_array;
> + }
> +
> + memset(hqd_info, 0, hqd_info_size * sizeof(struct
> amdgpu_mes_hung_queue_hqd_info));
> +
> + /*
> + * AMDGPU_RING_TYPE_COMPUTE parameter does not matter if called
> + * post suspend_all as reset & detect will return all hung queue
> types.
> + *
> + * Passed parameter is for targeting queues not scheduled by MES
> add_queue.
> + */
> + r = amdgpu_mes_detect_and_reset_hung_queues(adev,
> AMDGPU_RING_TYPE_COMPUTE,
> + false, &num_hung, hung_array, ffs(dqm->dev->xcc_mask) - 1);
>
> Here is the amdgpu_mes_detect_and_reset_hung_queues
>
> +
> + if (!num_hung || r) {
> + r = -ENOTRECOVERABLE;
> + goto free_hqd_info;
> + }
> +
> + /* MES reset resets queue/pipe and cleans up internally */
> + for (i = 0; i < num_hung; i++) {
> + hqd_info[i].bit0_31 = hung_array[i + hqd_info_size];
> + pipe = hqd_info[i].pipe_index;
> + queue = hqd_info[i].queue_index;
> + queue_type = hqd_info[i].queue_type;
> +
> + if (queue_type != MES_QUEUE_TYPE_COMPUTE &&
> + queue_type != MES_QUEUE_TYPE_SDMA) {
> + pr_warn("Unsupported hung queue reset type: %d\n",
> queue_type);
> + hung_array[i] = AMDGPU_MES_INVALID_DB_OFFSET;
> + continue;
> + }
> +
> + q = find_queue_by_doorbell_offset(dqm, hung_array[i]);
> + if (!q) {
> + r = -ENOTRECOVERABLE;
> + goto free_hqd_info;
> + }
> +
> + pdd = kfd_get_process_device_data(q->device, q->process);
> + if (!pdd) {
> + r = -ENODEV;
> + goto free_hqd_info;
> + }
> +
> + pr_warn("Hang detected doorbell %x pipe %d queue %d type
> %d\n",
> + hung_array[i], pipe, queue, queue_type);
> + /* Proceed remove_queue with reset=true */
> + remove_queue_mes_on_reset_option(dqm, q, &pdd->qpd, true,
> false);
> + set_queue_as_reset(dqm, q, &pdd->qpd);
> + }
> +
> + dqm->detect_hang_count = num_hung;
> + kfd_signal_reset_event(dqm->dev);
> +
> +free_hqd_info:
> + kfree(hqd_info);
> +free_hung_array:
> + kfree(hung_array);
> +fail:
> + dqm->detect_hang_count = 0;
> + return r;
> +}
> +
> static int suspend_all_queues_mes(struct device_queue_manager *dqm)
> {
> struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
> @@ -371,6 +493,9 @@ static int suspend_all_queues_mes(struct
> device_queue_manager *dqm)
> up_read(&adev->reset_domain->sem);
>
> if (r) {
> + if (!reset_queues_mes(dqm))
> + return 0;
> +
> dev_err(adev->dev, "failed to suspend gangs from MES\n");
> dev_err(adev->dev, "MES might be in unrecoverable state,
> issue a GPU reset\n");
> kfd_hws_hang(dqm);
> @@ -2137,6 +2262,7 @@ static void set_queue_as_reset(struct
> device_queue_manager *dqm, struct queue *q
> q->properties.queue_id, pdd->process->lead_thread->pid);
>
> pdd->has_reset_queue = true;
> + q->properties.is_reset = true;
> if (q->properties.is_active) {
> q->properties.is_active = false;
> decrement_queue_count(dqm, qpd, q);
> @@ -2203,6 +2329,23 @@ static struct queue *find_queue_by_address(struct
> device_queue_manager *dqm, uin
> return NULL;
> }
>
> +static struct queue *find_queue_by_doorbell_offset(struct
> device_queue_manager *dqm, uint32_t doorbell_offset)
> +{
> + struct device_process_node *cur;
> + struct qcm_process_device *qpd;
> + struct queue *q;
> +
> + list_for_each_entry(cur, &dqm->queues, list) {
> + qpd = cur->qpd;
> + list_for_each_entry(q, &qpd->queues_list, list) {
> + if (doorbell_offset == q->properties.doorbell_off)
> + return q;
> + }
> + }
> +
> + return NULL;
> +}
> +
> static int reset_hung_queues(struct device_queue_manager *dqm)
> {
> int r = 0, reset_count = 0, i;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 27e4859e4ad7..6cb33f6d71e2 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -523,6 +523,7 @@ struct queue_properties {
> uint32_t pm4_target_xcc;
> bool is_dbg_wa;
> bool is_user_cu_masked;
> + bool is_reset;
> /* Not relevant for user mode queues in cp scheduling */
> unsigned int vmid;
> /* Relevant only for sdma queues*/
> --
> 2.43.0
>
>