On Fri, Mar 20, 2026 at 4:19 PM Amber Lin <[email protected]> wrote: > > When removing queues fails, KFD calls amdgpu_mes to detect and reset > hung queues, then cleans up those hung queues in KFD. > > Suggested-by: Jonathan Kim <[email protected]> > Signed-off-by: Amber Lin <[email protected]> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 6 + > drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 1 + > .../drm/amd/amdkfd/kfd_device_queue_manager.c | 147 +++++++++++++++++- > drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 + > drivers/gpu/drm/amd/amdkfd/kfd_topology.c | 1 + > 5 files changed, 154 insertions(+), 2 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c > index 4f44b933e373..fd6b40d9da58 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c > @@ -795,6 +795,12 @@ bool amdgpu_mes_suspend_resume_all_supported(struct > amdgpu_device *adev) > amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(12, 0, 0)); > } > > +bool amdgpu_mes_queue_reset_by_mes_supported(struct amdgpu_device *adev) > +{ > + return (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(12, 1, 0) && > + (adev->mes.sched_version & AMDGPU_MES_VERSION_MASK) >= 0x73); > +} > + > /* Fix me -- node_id is used to identify the correct MES instances in the > future */ > static int amdgpu_mes_set_enforce_isolation(struct amdgpu_device *adev, > uint32_t node_id, bool enable) > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h > index 643b4f8d757a..44fa4d73bce8 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h > @@ -548,6 +548,7 @@ static inline void amdgpu_mes_unlock(struct amdgpu_mes > *mes) > } > > bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev); > +bool amdgpu_mes_queue_reset_by_mes_supported(struct amdgpu_device *adev); > > int amdgpu_mes_update_enforce_isolation(struct amdgpu_device 
*adev); > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > index ec8d7f4be840..1c9c350bfffe 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > @@ -71,6 +71,12 @@ static int allocate_sdma_queue(struct device_queue_manager > *dqm, > struct queue *q, const uint32_t > *restore_sdma_id); > > static int reset_queues_on_hws_hang(struct device_queue_manager *dqm, bool > is_sdma); > +static int resume_all_queues_mes(struct device_queue_manager *dqm); > +static int suspend_all_queues_mes(struct device_queue_manager *dqm); > +static struct queue *find_queue_by_doorbell_offset(struct > device_queue_manager *dqm, > + uint32_t doorbell_offset); > +static void set_queue_as_reset(struct device_queue_manager *dqm, struct > queue *q, > + struct qcm_process_device *qpd); > > static inline > enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type) > @@ -273,13 +279,19 @@ static int add_queue_mes(struct device_queue_manager > *dqm, struct queue *q, > return r; > } > > -static int remove_queue_mes(struct device_queue_manager *dqm, struct queue > *q, > - struct qcm_process_device *qpd) > +static int remove_queue_mes_on_reset_option(struct device_queue_manager > *dqm, struct queue *q, > + struct qcm_process_device *qpd, > + bool is_for_reset, > + bool flush_mes_queue) > { > struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev; > int r; > struct mes_remove_queue_input queue_input; > > + /* queue was already removed during reset */ > + if (q->properties.is_reset) > + return 0; > + > if (!dqm->sched_running || dqm->sched_halt) > return 0; > if (!down_read_trylock(&adev->reset_domain->sem)) > @@ -288,6 +300,7 @@ static int remove_queue_mes(struct device_queue_manager > *dqm, struct queue *q, > memset(&queue_input, 0x0, sizeof(struct mes_remove_queue_input)); > queue_input.doorbell_offset = 
q->properties.doorbell_off; > queue_input.gang_context_addr = q->gang_ctx_gpu_addr; > + queue_input.remove_queue_after_reset = flush_mes_queue; > queue_input.xcc_id = ffs(dqm->dev->xcc_mask) - 1; > > amdgpu_mes_lock(&adev->mes); > @@ -295,7 +308,13 @@ static int remove_queue_mes(struct device_queue_manager > *dqm, struct queue *q, > amdgpu_mes_unlock(&adev->mes); > up_read(&adev->reset_domain->sem); > > + if (is_for_reset) > + return r; > + > if (r) { > + if (!suspend_all_queues_mes(dqm)) > + return resume_all_queues_mes(dqm); > + > dev_err(adev->dev, "failed to remove hardware queue from MES, > doorbell=0x%x\n", > q->properties.doorbell_off); > dev_err(adev->dev, "MES might be in unrecoverable state, > issue a GPU reset\n"); > @@ -305,6 +324,12 @@ static int remove_queue_mes(struct device_queue_manager > *dqm, struct queue *q, > return r; > } > > +static int remove_queue_mes(struct device_queue_manager *dqm, struct queue > *q, > + struct qcm_process_device *qpd) > +{ > + return remove_queue_mes_on_reset_option(dqm, q, qpd, false, false); > +} > + > static int remove_all_kfd_queues_mes(struct device_queue_manager *dqm) > { > struct device_process_node *cur; > @@ -359,6 +384,103 @@ static int add_all_kfd_queues_mes(struct > device_queue_manager *dqm) > return retval; > } > > +static int reset_queues_mes(struct device_queue_manager *dqm) > +{ > + struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev; > + struct amdgpu_mes_hung_queue_hqd_info *hqd_info; > + int hqd_info_size = adev->mes.hung_queue_hqd_info_offset; > + int num_hung = 0, r = 0, i, pipe, queue, queue_type; > + uint32_t *hung_array; > + struct kfd_process_device *pdd; > + struct queue *q; > + > + if (!amdgpu_mes_queue_reset_by_mes_supported(adev)) { > + r = -ENOTRECOVERABLE; > + goto fail; > + } > + > + /* reset should be used only in dqm locked queue reset */ > + if (WARN_ON(dqm->detect_hang_count > 0)) > + return 0; > + > + if (!amdgpu_gpu_recovery) { > + r = -ENOTRECOVERABLE; > + goto 
fail; > + } > + > + hung_array = kzalloc(adev->mes.hung_queue_db_array_size * > sizeof(uint32_t), GFP_KERNEL); > + if (!hung_array) { > + r = -ENOMEM; > + goto fail; > + } > + > + hqd_info = kzalloc(hqd_info_size * sizeof(struct > amdgpu_mes_hung_queue_hqd_info), GFP_KERNEL); > + if (!hqd_info) { > + r = -ENOMEM; > + goto free_hung_array; > + } > + > + memset(hqd_info, 0, hqd_info_size * sizeof(struct > amdgpu_mes_hung_queue_hqd_info)); > + > + /* > + * AMDGPU_RING_TYPE_COMPUTE parameter does not matter if called > + * post suspend_all as reset & detect will return all hung queue > types. > + * > + * Passed parameter is for targeting queues not scheduled by MES > add_queue. > + */ > + r = amdgpu_mes_detect_and_reset_hung_queues(adev, > AMDGPU_RING_TYPE_COMPUTE, > + false, &num_hung, hung_array, ffs(dqm->dev->xcc_mask) - 1); > + > + if (!num_hung || r) { > + r = -ENOTRECOVERABLE; > + goto free_hqd_info; > + } > + > + /* MES reset resets queue/pipe and cleans up internally */ > + for (i = 0; i < num_hung; i++) { > + hqd_info[i].bit0_31 = hung_array[i + hqd_info_size]; > + pipe = hqd_info[i].pipe_index; > + queue = hqd_info[i].queue_index; > + queue_type = hqd_info[i].queue_type; > + > + if (queue_type != MES_QUEUE_TYPE_COMPUTE && > + queue_type != MES_QUEUE_TYPE_SDMA) { > + pr_warn("Unsupported hung queue reset type: %d\n", > queue_type); > + hung_array[i] = AMDGPU_MES_INVALID_DB_OFFSET; > + continue; > + } > + > + q = find_queue_by_doorbell_offset(dqm, hung_array[i]); > + if (!q) { > + r = -ENOTRECOVERABLE; > + goto free_hqd_info; > + } > + > + pdd = kfd_get_process_device_data(q->device, q->process); > + if (!pdd) { > + r = -ENODEV; > + goto free_hqd_info; > + } > + > + pr_warn("Hang detected doorbell %x pipe %d queue %d type > %d\n", > + hung_array[i], pipe, queue, queue_type); > + /* Proceed remove_queue with reset=true */ > + remove_queue_mes_on_reset_option(dqm, q, &pdd->qpd, true, > false); > + set_queue_as_reset(dqm, q, &pdd->qpd); > + } > + > + 
dqm->detect_hang_count = num_hung; > + kfd_signal_reset_event(dqm->dev); > + > +free_hqd_info: > + kfree(hqd_info); > +free_hung_array: > + kfree(hung_array); > +fail: > + dqm->detect_hang_count = 0; > + return r; > +} > + > static int suspend_all_queues_mes(struct device_queue_manager *dqm) > { > struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev; > @@ -371,6 +493,9 @@ static int suspend_all_queues_mes(struct > device_queue_manager *dqm) > up_read(&adev->reset_domain->sem); > > if (r) { > + if (!reset_queues_mes(dqm)) > + return 0; > + > dev_err(adev->dev, "failed to suspend gangs from MES\n"); > dev_err(adev->dev, "MES might be in unrecoverable state, > issue a GPU reset\n"); > kfd_hws_hang(dqm); > @@ -2137,6 +2262,7 @@ static void set_queue_as_reset(struct > device_queue_manager *dqm, struct queue *q > q->properties.queue_id, pdd->process->lead_thread->pid); > > pdd->has_reset_queue = true; > + q->properties.is_reset = true; > if (q->properties.is_active) { > q->properties.is_active = false; > decrement_queue_count(dqm, qpd, q); > @@ -2203,6 +2329,23 @@ static struct queue *find_queue_by_address(struct > device_queue_manager *dqm, uin > return NULL; > } > > +static struct queue *find_queue_by_doorbell_offset(struct > device_queue_manager *dqm, uint32_t doorbell_offset) > +{ > + struct device_process_node *cur; > + struct qcm_process_device *qpd; > + struct queue *q; > + > + list_for_each_entry(cur, &dqm->queues, list) { > + qpd = cur->qpd; > + list_for_each_entry(q, &qpd->queues_list, list) { > + if (doorbell_offset == q->properties.doorbell_off) > + return q; > + } > + } > + > + return NULL; > +} > + > static int reset_hung_queues(struct device_queue_manager *dqm) > { > int r = 0, reset_count = 0, i; > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > index 27e4859e4ad7..6cb33f6d71e2 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > @@ -523,6 +523,7 
@@ struct queue_properties { > uint32_t pm4_target_xcc; > bool is_dbg_wa; > bool is_user_cu_masked; > + bool is_reset; > /* Not relevant for user mode queues in cp scheduling */ > unsigned int vmid; > /* Relevant only for sdma queues*/ > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c > b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c > index 1ccd4514d3ee..4c52819aef9e 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c > @@ -2027,6 +2027,7 @@ static void kfd_topology_set_capabilities(struct > kfd_topology_device *dev) > if (KFD_GC_VERSION(dev->gpu) >= IP_VERSION(12, 1, 0)) { > dev->node_props.capability |= > > HSA_CAP_TRAP_DEBUG_PRECISE_MEMORY_OPERATIONS_SUPPORTED; > + dev->node_props.capability |= > HSA_CAP_PER_QUEUE_RESET_SUPPORTED;
Should this hunk be a separate patch? Isn't this already supported on existing parts using MES?

Alex

> dev->node_props.capability2 |=
>
> HSA_CAP2_TRAP_DEBUG_LDS_OUT_OF_ADDR_RANGE_SUPPORTED;
> }
> --
> 2.43.0
>
