[AMD Official Use Only - AMD Internal Distribution Only] Reviewed-by: Harish Kasiviswanathan <[email protected]>
-----Original Message----- From: Alex Deucher <[email protected]> Sent: Thursday, October 9, 2025 3:10 PM To: Kim, Jonathan <[email protected]> Cc: [email protected]; Deucher, Alexander <[email protected]>; Liu, Shaoyun <[email protected]>; Kasiviswanathan, Harish <[email protected]>; Lin, Amber <[email protected]> Subject: Re: [PATCH 5/6] drm/amdkfd: fix suspend/resume all calls in mes based eviction path On Thu, Oct 9, 2025 at 2:50 PM Jonathan Kim <[email protected]> wrote: > > Suspend/resume all gangs should be done while the device lock is held. > > Signed-off-by: Jonathan Kim <[email protected]> Acked-by: Alex Deucher <[email protected]> > --- > .../drm/amd/amdkfd/kfd_device_queue_manager.c | 73 ++++++------------- > 1 file changed, 21 insertions(+), 52 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > index 6c5c7c1bf5ed..6e7bc983fc0b 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > @@ -1209,6 +1209,15 @@ static int evict_process_queues_cpsch(struct > device_queue_manager *dqm, > pr_debug_ratelimited("Evicting process pid %d queues\n", > pdd->process->lead_thread->pid); > > + if (dqm->dev->kfd->shared_resources.enable_mes) { > + pdd->last_evict_timestamp = get_jiffies_64(); > + retval = suspend_all_queues_mes(dqm); > + if (retval) { > + dev_err(dev, "Suspending all queues failed"); > + goto out; > + } > + } > + > /* Mark all queues as evicted. Deactivate all active queues on > * the qpd. 
> */ > @@ -1221,23 +1230,27 @@ static int evict_process_queues_cpsch(struct > device_queue_manager *dqm, > decrement_queue_count(dqm, qpd, q); > > if (dqm->dev->kfd->shared_resources.enable_mes) { > - int err; > - > - err = remove_queue_mes(dqm, q, qpd); > - if (err) { > + retval = remove_queue_mes(dqm, q, qpd); > + if (retval) { > dev_err(dev, "Failed to evict queue %d\n", > q->properties.queue_id); > - retval = err; > + goto out; > } > } > } > - pdd->last_evict_timestamp = get_jiffies_64(); > - if (!dqm->dev->kfd->shared_resources.enable_mes) > + > + if (!dqm->dev->kfd->shared_resources.enable_mes) { > + pdd->last_evict_timestamp = get_jiffies_64(); > retval = execute_queues_cpsch(dqm, > qpd->is_debug ? > > KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES : > > KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, > USE_DEFAULT_GRACE_PERIOD); > + } else { > + retval = resume_all_queues_mes(dqm); > + if (retval) > + dev_err(dev, "Resuming all queues failed"); > + } > > out: > dqm_unlock(dqm); > @@ -3098,61 +3111,17 @@ int kfd_dqm_suspend_bad_queue_mes(struct kfd_node > *knode, u32 pasid, u32 doorbel > return ret; > } > > -static int kfd_dqm_evict_pasid_mes(struct device_queue_manager *dqm, > - struct qcm_process_device *qpd) > -{ > - struct device *dev = dqm->dev->adev->dev; > - int ret = 0; > - > - /* Check if process is already evicted */ > - dqm_lock(dqm); > - if (qpd->evicted) { > - /* Increment the evicted count to make sure the > - * process stays evicted before its terminated. 
> - */ > - qpd->evicted++; > - dqm_unlock(dqm); > - goto out; > - } > - dqm_unlock(dqm); > - > - ret = suspend_all_queues_mes(dqm); > - if (ret) { > - dev_err(dev, "Suspending all queues failed"); > - goto out; > - } > - > - ret = dqm->ops.evict_process_queues(dqm, qpd); > - if (ret) { > - dev_err(dev, "Evicting process queues failed"); > - goto out; > - } > - > - ret = resume_all_queues_mes(dqm); > - if (ret) > - dev_err(dev, "Resuming all queues failed"); > - > -out: > - return ret; > -} > - > int kfd_evict_process_device(struct kfd_process_device *pdd) > { > struct device_queue_manager *dqm; > struct kfd_process *p; > - int ret = 0; > > p = pdd->process; > dqm = pdd->dev->dqm; > > WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid); > > - if (dqm->dev->kfd->shared_resources.enable_mes) > - ret = kfd_dqm_evict_pasid_mes(dqm, &pdd->qpd); > - else > - ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd); > - > - return ret; > + return dqm->ops.evict_process_queues(dqm, &pdd->qpd); > } > > int reserve_debug_trap_vmid(struct device_queue_manager *dqm, > -- > 2.34.1 >
