On Fri, Mar 20, 2026 at 4:19 PM Amber Lin <[email protected]> wrote:
>
> When removing queues fails, KFD calls amdgpu_mes to detect and reset
> hung queues, then cleans up those hung queues in KFD.
>
> Suggested-by: Jonathan Kim <[email protected]>
> Signed-off-by: Amber Lin <[email protected]>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c       |   6 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h       |   1 +
>  .../drm/amd/amdkfd/kfd_device_queue_manager.c | 147 +++++++++++++++++-
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |   1 +
>  drivers/gpu/drm/amd/amdkfd/kfd_topology.c     |   1 +
>  5 files changed, 154 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> index 4f44b933e373..fd6b40d9da58 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> @@ -795,6 +795,12 @@ bool amdgpu_mes_suspend_resume_all_supported(struct 
> amdgpu_device *adev)
>                 amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(12, 0, 0));
>  }
>
> +bool amdgpu_mes_queue_reset_by_mes_supported(struct amdgpu_device *adev)
> +{
> +       return (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 1, 0) &&
> +               (adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 0x73);
> +}
> +
>  /* Fix me -- node_id is used to identify the correct MES instances in the 
> future */
>  static int amdgpu_mes_set_enforce_isolation(struct amdgpu_device *adev,
>                                             uint32_t node_id, bool enable)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> index 643b4f8d757a..44fa4d73bce8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> @@ -548,6 +548,7 @@ static inline void amdgpu_mes_unlock(struct amdgpu_mes 
> *mes)
>  }
>
>  bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev);
> +bool amdgpu_mes_queue_reset_by_mes_supported(struct amdgpu_device *adev);
>
>  int amdgpu_mes_update_enforce_isolation(struct amdgpu_device *adev);
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index ec8d7f4be840..1c9c350bfffe 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -71,6 +71,12 @@ static int allocate_sdma_queue(struct device_queue_manager 
> *dqm,
>                                 struct queue *q, const uint32_t 
> *restore_sdma_id);
>
>  static int reset_queues_on_hws_hang(struct device_queue_manager *dqm, bool 
> is_sdma);
> +static int resume_all_queues_mes(struct device_queue_manager *dqm);
> +static int suspend_all_queues_mes(struct device_queue_manager *dqm);
> +static struct queue *find_queue_by_doorbell_offset(struct 
> device_queue_manager *dqm,
> +                                                  uint32_t doorbell_offset);
> +static void set_queue_as_reset(struct device_queue_manager *dqm, struct 
> queue *q,
> +                              struct qcm_process_device *qpd);
>
>  static inline
>  enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type)
> @@ -273,13 +279,19 @@ static int add_queue_mes(struct device_queue_manager 
> *dqm, struct queue *q,
>         return r;
>  }
>
> -static int remove_queue_mes(struct device_queue_manager *dqm, struct queue 
> *q,
> -                       struct qcm_process_device *qpd)
> +static int remove_queue_mes_on_reset_option(struct device_queue_manager 
> *dqm, struct queue *q,
> +                                           struct qcm_process_device *qpd,
> +                                           bool is_for_reset,
> +                                           bool flush_mes_queue)
>  {
>         struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
>         int r;
>         struct mes_remove_queue_input queue_input;
>
> +       /* queue was already removed during reset */
> +       if (q->properties.is_reset)
> +               return 0;
> +
>         if (!dqm->sched_running || dqm->sched_halt)
>                 return 0;
>         if (!down_read_trylock(&adev->reset_domain->sem))
> @@ -288,6 +300,7 @@ static int remove_queue_mes(struct device_queue_manager 
> *dqm, struct queue *q,
>         memset(&queue_input, 0x0, sizeof(struct mes_remove_queue_input));
>         queue_input.doorbell_offset = q->properties.doorbell_off;
>         queue_input.gang_context_addr = q->gang_ctx_gpu_addr;
> +       queue_input.remove_queue_after_reset = flush_mes_queue;
>         queue_input.xcc_id = ffs(dqm->dev->xcc_mask) - 1;
>
>         amdgpu_mes_lock(&adev->mes);
> @@ -295,7 +308,13 @@ static int remove_queue_mes(struct device_queue_manager 
> *dqm, struct queue *q,
>         amdgpu_mes_unlock(&adev->mes);
>         up_read(&adev->reset_domain->sem);
>
> +       if (is_for_reset)
> +               return r;
> +
>         if (r) {
> +               if (!suspend_all_queues_mes(dqm))
> +                       return resume_all_queues_mes(dqm);
> +
>                 dev_err(adev->dev, "failed to remove hardware queue from MES, 
> doorbell=0x%x\n",
>                         q->properties.doorbell_off);
>                 dev_err(adev->dev, "MES might be in unrecoverable state, 
> issue a GPU reset\n");
> @@ -305,6 +324,12 @@ static int remove_queue_mes(struct device_queue_manager 
> *dqm, struct queue *q,
>         return r;
>  }
>
> +static int remove_queue_mes(struct device_queue_manager *dqm, struct queue 
> *q,
> +                           struct qcm_process_device *qpd)
> +{
> +       return remove_queue_mes_on_reset_option(dqm, q, qpd, false, false);
> +}
> +
>  static int remove_all_kfd_queues_mes(struct device_queue_manager *dqm)
>  {
>         struct device_process_node *cur;
> @@ -359,6 +384,103 @@ static int add_all_kfd_queues_mes(struct 
> device_queue_manager *dqm)
>         return retval;
>  }
>
> +static int reset_queues_mes(struct device_queue_manager *dqm)
> +{
> +       struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
> +       struct amdgpu_mes_hung_queue_hqd_info *hqd_info;
> +       int hqd_info_size = adev->mes.hung_queue_hqd_info_offset;
> +       int num_hung = 0, r = 0, i, pipe, queue, queue_type;
> +       uint32_t *hung_array;
> +       struct kfd_process_device *pdd;
> +       struct queue *q;
> +
> +       if (!amdgpu_mes_queue_reset_by_mes_supported(adev)) {
> +               r = -ENOTRECOVERABLE;
> +               goto fail;
> +       }
> +
> +       /* reset should be used only in dqm locked queue reset */
> +       if (WARN_ON(dqm->detect_hang_count > 0))
> +               return 0;
> +
> +       if (!amdgpu_gpu_recovery) {
> +               r = -ENOTRECOVERABLE;
> +               goto fail;
> +       }
> +
> +       hung_array = kzalloc(adev->mes.hung_queue_db_array_size * 
> sizeof(uint32_t), GFP_KERNEL);
> +       if (!hung_array) {
> +               r = -ENOMEM;
> +               goto fail;
> +       }
> +
> +       hqd_info = kzalloc(hqd_info_size * sizeof(struct 
> amdgpu_mes_hung_queue_hqd_info), GFP_KERNEL);
> +       if (!hqd_info) {
> +               r = -ENOMEM;
> +               goto free_hung_array;
> +       }
> +
> +       memset(hqd_info, 0, hqd_info_size * sizeof(struct 
> amdgpu_mes_hung_queue_hqd_info));
> +
> +       /*
> +        * AMDGPU_RING_TYPE_COMPUTE parameter does not matter if called
> +        * post suspend_all as reset & detect will return all hung queue 
> types.
> +        *
> +        * Passed parameter is for targeting queues not scheduled by MES 
> add_queue.
> +        */
> +       r =  amdgpu_mes_detect_and_reset_hung_queues(adev, 
> AMDGPU_RING_TYPE_COMPUTE,
> +               false, &num_hung, hung_array, ffs(dqm->dev->xcc_mask) - 1);
> +
> +       if (!num_hung || r) {
> +               r = -ENOTRECOVERABLE;
> +               goto free_hqd_info;
> +       }
> +
> +       /* MES reset resets queue/pipe and cleans up internally  */
> +       for (i = 0; i < num_hung; i++) {
> +               hqd_info[i].bit0_31 = hung_array[i + hqd_info_size];
> +               pipe = hqd_info[i].pipe_index;
> +               queue = hqd_info[i].queue_index;
> +               queue_type = hqd_info[i].queue_type;
> +
> +               if (queue_type != MES_QUEUE_TYPE_COMPUTE &&
> +                   queue_type != MES_QUEUE_TYPE_SDMA) {
> +                       pr_warn("Unsupported hung queue reset type: %d\n", 
> queue_type);
> +                       hung_array[i] = AMDGPU_MES_INVALID_DB_OFFSET;
> +                       continue;
> +               }
> +
> +               q = find_queue_by_doorbell_offset(dqm, hung_array[i]);
> +               if (!q) {
> +                       r = -ENOTRECOVERABLE;
> +                       goto free_hqd_info;
> +               }
> +
> +               pdd = kfd_get_process_device_data(q->device, q->process);
> +               if (!pdd) {
> +                       r = -ENODEV;
> +                       goto free_hqd_info;
> +               }
> +
> +               pr_warn("Hang detected doorbell %x pipe %d queue %d type 
> %d\n",
> +                               hung_array[i], pipe, queue, queue_type);
> +               /* Proceed remove_queue with reset=true */
> +               remove_queue_mes_on_reset_option(dqm, q, &pdd->qpd, true, 
> false);
> +               set_queue_as_reset(dqm, q, &pdd->qpd);
> +       }
> +
> +       dqm->detect_hang_count = num_hung;
> +       kfd_signal_reset_event(dqm->dev);
> +
> +free_hqd_info:
> +       kfree(hqd_info);
> +free_hung_array:
> +       kfree(hung_array);
> +fail:
> +       dqm->detect_hang_count = 0;
> +       return r;
> +}
> +
>  static int suspend_all_queues_mes(struct device_queue_manager *dqm)
>  {
>         struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
> @@ -371,6 +493,9 @@ static int suspend_all_queues_mes(struct 
> device_queue_manager *dqm)
>         up_read(&adev->reset_domain->sem);
>
>         if (r) {
> +               if (!reset_queues_mes(dqm))
> +                       return 0;
> +
>                 dev_err(adev->dev, "failed to suspend gangs from MES\n");
>                 dev_err(adev->dev, "MES might be in unrecoverable state, 
> issue a GPU reset\n");
>                 kfd_hws_hang(dqm);
> @@ -2137,6 +2262,7 @@ static void set_queue_as_reset(struct 
> device_queue_manager *dqm, struct queue *q
>                 q->properties.queue_id, pdd->process->lead_thread->pid);
>
>         pdd->has_reset_queue = true;
> +       q->properties.is_reset = true;
>         if (q->properties.is_active) {
>                 q->properties.is_active = false;
>                 decrement_queue_count(dqm, qpd, q);
> @@ -2203,6 +2329,23 @@ static struct queue *find_queue_by_address(struct 
> device_queue_manager *dqm, uin
>         return NULL;
>  }
>
> +static struct queue *find_queue_by_doorbell_offset(struct 
> device_queue_manager *dqm, uint32_t doorbell_offset)
> +{
> +       struct device_process_node *cur;
> +       struct qcm_process_device *qpd;
> +       struct queue *q;
> +
> +       list_for_each_entry(cur, &dqm->queues, list) {
> +               qpd = cur->qpd;
> +               list_for_each_entry(q, &qpd->queues_list, list) {
> +                       if (doorbell_offset == q->properties.doorbell_off)
> +                               return q;
> +               }
> +       }
> +
> +       return NULL;
> +}
> +
>  static int reset_hung_queues(struct device_queue_manager *dqm)
>  {
>         int r = 0, reset_count = 0, i;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 27e4859e4ad7..6cb33f6d71e2 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -523,6 +523,7 @@ struct queue_properties {
>         uint32_t pm4_target_xcc;
>         bool is_dbg_wa;
>         bool is_user_cu_masked;
> +       bool is_reset;
>         /* Not relevant for user mode queues in cp scheduling */
>         unsigned int vmid;
>         /* Relevant only for sdma queues*/
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> index 1ccd4514d3ee..4c52819aef9e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
> @@ -2027,6 +2027,7 @@ static void kfd_topology_set_capabilities(struct 
> kfd_topology_device *dev)
>                 if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(12, 1, 0)) {
>                         dev->node_props.capability |=
>                                 
> HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED;
> +                       dev->node_props.capability |= 
> HSA_CAP_PER_QUEUE_RESET_SUPPORTED;

Should this hunk be a separate patch? Isn't per-queue reset
(HSA_CAP_PER_QUEUE_RESET_SUPPORTED) already supported on existing parts
using MES?

Alex

>                         dev->node_props.capability2 |=
>                                 
> HSA_CAP2_TRAP_DEBUG_LDS_OUT_OF_ADDR_RANGE_SUPPORTED;
>                 }
> --
> 2.43.0
>

Reply via email to