Okay Can you send me the patch (in attachment) once you finished it, I need to verify it on SRIOV
Thanks _____________________________________ Monk Liu|GPU Virtualization Team |AMD -----Original Message----- From: Kuehling, Felix <[email protected]> Sent: Friday, December 20, 2019 3:56 PM To: Liu, Monk <[email protected]>; [email protected] Cc: Liu, Shaoyun <[email protected]> Subject: Re: [PATCH 1/1] drm/amdkfd: Don't touch the hardware in pre_reset callback You should be able to save the plain text email and pass that to "git am". It's trivial with Thunderbird on a Linux system. If you're using outlook, I'm not sure. Anyway, I'm already reworking the patch based on Shaoyun's suggestion and some ideas it gave me. Regards, Felix On 2019-12-19 23:23, Liu, Monk wrote: > Hi Felix > > Do you know how I can get a "xxx.patch" file from the email from you ?? > > _____________________________________ > Monk Liu|GPU Virtualization Team |AMD > > > -----Original Message----- > From: Kuehling, Felix <[email protected]> > Sent: Friday, December 20, 2019 10:09 AM > To: [email protected] > Cc: Liu, Shaoyun <[email protected]>; Liu, Monk <[email protected]> > Subject: [PATCH 1/1] drm/amdkfd: Don't touch the hardware in pre_reset > callback > > The intention of the pre_reset callback is to update the driver state to > reflect that all user mode queues are preempted and the HIQ is destroyed. > However we should not actually preempt any queues or otherwise touch the > hardware because it is presumably hanging. > The impending reset will take care of actually stopping all queues. > > This should prevent KIQ register access hanging on SRIOV function level reset > (FLR). It should also speed up the reset by avoiding unnecessary timeouts > from a potentially hanging GPU scheduler. > > CC: shaoyunl <[email protected]> > CC: Liu Monk <[email protected]> > Signed-off-by: Felix Kuehling <[email protected]> > --- > drivers/gpu/drm/amd/amdkfd/kfd_device.c | 24 ++++++++++------- > .../drm/amd/amdkfd/kfd_device_queue_manager.c | 27 ++++++++++++------- > .../drm/amd/amdkfd/kfd_device_queue_manager.h | 5 ++-- > drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 8 +++--- > .../gpu/drm/amd/amdkfd/kfd_packet_manager.c | 4 +-- > drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 8 +++--- > drivers/gpu/drm/amd/amdkfd/kfd_process.c | 11 ++++---- > .../amd/amdkfd/kfd_process_queue_manager.c | 2 +- > 8 files changed, 53 insertions(+), 36 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c > b/drivers/gpu/drm/amd/amdkfd/kfd_device.c > index c6b6901bbda3..796996a0d926 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c > @@ -486,6 +486,7 @@ static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned > int buf_size, > unsigned int chunk_size); > static void kfd_gtt_sa_fini(struct kfd_dev *kfd); > > +static void kfd_suspend(struct kfd_dev *kfd, bool pre_reset); > static int kfd_resume(struct kfd_dev *kfd); > > struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, @@ -728,7 +729,7 @@ int > kgd2kfd_pre_reset(struct kfd_dev *kfd) { > if (!kfd->init_complete) > return 0; > - kgd2kfd_suspend(kfd); > + kfd_suspend(kfd, true); > > kfd_signal_reset_event(kfd); > return 0; > @@ -767,13 +768,7 @@ void kgd2kfd_suspend(struct kfd_dev *kfd) > if (!kfd->init_complete) > return; > > - /* For first KFD device suspend all the KFD processes */ > - if (atomic_inc_return(&kfd_locked) == 1) > - kfd_suspend_all_processes(); > - > - kfd->dqm->ops.stop(kfd->dqm); > - > - kfd_iommu_suspend(kfd); > + kfd_suspend(kfd, false); > } > > int kgd2kfd_resume(struct kfd_dev *kfd) @@ -795,6 +790,17 @@ int > kgd2kfd_resume(struct kfd_dev *kfd) > return ret; > } > > +static void kfd_suspend(struct kfd_dev *kfd, bool pre_reset) { > + /* For first KFD device suspend all the KFD processes */ > + if (atomic_inc_return(&kfd_locked) == 1) > + kfd_suspend_all_processes(pre_reset); > + > + kfd->dqm->ops.stop(kfd->dqm, pre_reset); > + > + kfd_iommu_suspend(kfd); > +} > + > static int kfd_resume(struct kfd_dev *kfd) { > int err = 0; > @@ -877,7 +883,7 @@ int kgd2kfd_quiesce_mm(struct mm_struct *mm) > if (!p) > return -ESRCH; > > - r = kfd_process_evict_queues(p); > + r = kfd_process_evict_queues(p, false); > > kfd_unref_process(p); > return r; > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > index f7f6df40875e..3a49944164da 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > @@ -592,7 +592,8 @@ static int update_queue(struct > device_queue_manager *dqm, struct queue *q) } > > static int evict_process_queues_nocpsch(struct device_queue_manager *dqm, > - struct qcm_process_device *qpd) > + struct qcm_process_device *qpd, > + bool pre_reset) > { > struct queue *q; > struct mqd_manager *mqd_mgr; > @@ -622,6 +623,8 @@ static int evict_process_queues_nocpsch(struct > device_queue_manager *dqm, > > if (WARN_ONCE(!dqm->sched_running, "Evict when stopped\n")) > continue; > + if (pre_reset) > + continue; > > retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd, > KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN, @@ -639,7 > +642,8 @@ static int > evict_process_queues_nocpsch(struct device_queue_manager *dqm, } > > static int evict_process_queues_cpsch(struct device_queue_manager *dqm, > - struct qcm_process_device *qpd) > + struct qcm_process_device *qpd, > + bool pre_reset) > { > struct queue *q; > struct kfd_process_device *pdd; > @@ -664,6 +668,10 @@ static int evict_process_queues_cpsch(struct > device_queue_manager *dqm, > q->properties.is_active = false; > dqm->queue_count--; > } > + > + if (pre_reset) > + goto out; > + > retval = execute_queues_cpsch(dqm, > qpd->is_debug ? > KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES : > @@ -944,10 +952,10 @@ static int start_nocpsch(struct device_queue_manager > *dqm) > return 0; > } > > -static int stop_nocpsch(struct device_queue_manager *dqm) > +static int stop_nocpsch(struct device_queue_manager *dqm, bool > +pre_reset) > { > if (dqm->dev->device_info->asic_family == CHIP_HAWAII) > - pm_uninit(&dqm->packets); > + pm_uninit(&dqm->packets, pre_reset); > dqm->sched_running = false; > > return 0; > @@ -1107,20 +1115,21 @@ static int start_cpsch(struct device_queue_manager > *dqm) > return 0; > fail_allocate_vidmem: > fail_set_sched_resources: > - pm_uninit(&dqm->packets); > + pm_uninit(&dqm->packets, false); > fail_packet_manager_init: > return retval; > } > > -static int stop_cpsch(struct device_queue_manager *dqm) > +static int stop_cpsch(struct device_queue_manager *dqm, bool > +pre_reset) > { > dqm_lock(dqm); > - unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0); > + if (!pre_reset) > + unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0); > dqm->sched_running = false; > dqm_unlock(dqm); > > kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); > - pm_uninit(&dqm->packets); > + pm_uninit(&dqm->packets, pre_reset); > > return 0; > } > @@ -1891,7 +1900,7 @@ int kfd_process_vm_fault(struct device_queue_manager > *dqm, > return -EINVAL; > pdd = kfd_get_process_device_data(dqm->dev, p); > if (pdd) > - ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd); > + ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd, false); > kfd_unref_process(p); > > return ret; > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h > b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h > index a8c37e6da027..9f82f95f6a92 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h > @@ -103,7 +103,7 @@ struct device_queue_manager_ops { > > int (*initialize)(struct device_queue_manager *dqm); > int (*start)(struct device_queue_manager *dqm); > - int (*stop)(struct device_queue_manager *dqm); > + int (*stop)(struct device_queue_manager *dqm, bool pre_reset); > void (*uninitialize)(struct device_queue_manager *dqm); > int (*create_kernel_queue)(struct device_queue_manager *dqm, > struct kernel_queue *kq, > @@ -129,7 +129,8 @@ struct device_queue_manager_ops { > struct qcm_process_device *qpd); > > int (*evict_process_queues)(struct device_queue_manager *dqm, > - struct qcm_process_device *qpd); > + struct qcm_process_device *qpd, > + bool pre_reset); > int (*restore_process_queues)(struct device_queue_manager *dqm, > struct qcm_process_device *qpd); > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c > b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c > index 2d56dc534459..dbd2e8e9ae69 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c > @@ -195,9 +195,9 @@ static bool kq_initialize(struct kernel_queue *kq, > struct kfd_dev *dev, } > > /* Uninitialize a kernel queue and free all its memory usages. */ > -static void kq_uninitialize(struct kernel_queue *kq) > +static void kq_uninitialize(struct kernel_queue *kq, bool pre_reset) > { > - if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ) > + if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && !pre_reset) > kq->mqd_mgr->destroy_mqd(kq->mqd_mgr, > kq->queue->mqd, > KFD_PREEMPT_TYPE_WAVEFRONT_RESET, @@ > -337,9 +337,9 @@ struct > kernel_queue *kernel_queue_init(struct kfd_dev *dev, > return NULL; > } > > -void kernel_queue_uninit(struct kernel_queue *kq) > +void kernel_queue_uninit(struct kernel_queue *kq, bool pre_reset) > { > - kq_uninitialize(kq); > + kq_uninitialize(kq, pre_reset); > kfree(kq); > } > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c > b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c > index 6cabed06ef5d..7732a3bbebd6 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c > @@ -264,10 +264,10 @@ int pm_init(struct packet_manager *pm, struct > device_queue_manager *dqm) > return 0; > } > > -void pm_uninit(struct packet_manager *pm) > +void pm_uninit(struct packet_manager *pm, bool pre_reset) > { > mutex_destroy(&pm->lock); > - kernel_queue_uninit(pm->priv_queue); > + kernel_queue_uninit(pm->priv_queue, pre_reset); > } > > int pm_send_set_resources(struct packet_manager *pm, diff --git > a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > index 087e96838997..9bc4ced4acba 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > @@ -762,9 +762,9 @@ struct kfd_process *kfd_get_process(const struct > task_struct *); struct kfd_process > *kfd_lookup_process_by_pasid(unsigned int pasid); struct kfd_process > *kfd_lookup_process_by_mm(const struct mm_struct *mm); void > kfd_unref_process(struct kfd_process *p); -int > kfd_process_evict_queues(struct kfd_process *p); > +int kfd_process_evict_queues(struct kfd_process *p, bool pre_reset); > int kfd_process_restore_queues(struct kfd_process *p); -void > kfd_suspend_all_processes(void); > +void kfd_suspend_all_processes(bool pre_reset); > int kfd_resume_all_processes(void); > > int kfd_process_device_init_vm(struct kfd_process_device *pdd, @@ -883,7 > +883,7 @@ struct device_queue_manager *device_queue_manager_init(struct > kfd_dev *dev); void device_queue_manager_uninit(struct device_queue_manager > *dqm); struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, > enum kfd_queue_type type); > -void kernel_queue_uninit(struct kernel_queue *kq); > +void kernel_queue_uninit(struct kernel_queue *kq, bool pre_reset); > int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned > int pasid); > > /* Process Queue Manager */ > @@ -974,7 +974,7 @@ extern const struct packet_manager_funcs > kfd_vi_pm_funcs; extern const struct packet_manager_funcs > kfd_v9_pm_funcs; > > int pm_init(struct packet_manager *pm, struct device_queue_manager > *dqm); -void pm_uninit(struct packet_manager *pm); > +void pm_uninit(struct packet_manager *pm, bool pre_reset); > int pm_send_set_resources(struct packet_manager *pm, > struct scheduling_resources *res); > int pm_send_runlist(struct packet_manager *pm, struct list_head > *dqm_queues); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c > b/drivers/gpu/drm/amd/amdkfd/kfd_process.c > index 536a153ac9a4..7bcadd3a1e3b 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c > @@ -948,7 +948,7 @@ struct kfd_process *kfd_lookup_process_by_mm(const struct > mm_struct *mm) > * Eviction is reference-counted per process-device. This means multiple > * evictions from different sources can be nested safely. > */ > -int kfd_process_evict_queues(struct kfd_process *p) > +int kfd_process_evict_queues(struct kfd_process *p, bool pre_reset) > { > struct kfd_process_device *pdd; > int r = 0; > @@ -956,7 +956,8 @@ int kfd_process_evict_queues(struct kfd_process > *p) > > list_for_each_entry(pdd, &p->per_device_data, per_device_list) { > r = pdd->dev->dqm->ops.evict_process_queues(pdd->dev->dqm, > - &pdd->qpd); > + &pdd->qpd, > + pre_reset); > if (r) { > pr_err("Failed to evict process queues\n"); > goto fail; > @@ -1026,7 +1027,7 @@ static void evict_process_worker(struct work_struct > *work) > flush_delayed_work(&p->restore_work); > > pr_debug("Started evicting pasid 0x%x\n", p->pasid); > - ret = kfd_process_evict_queues(p); > + ret = kfd_process_evict_queues(p, false); > if (!ret) { > dma_fence_signal(p->ef); > dma_fence_put(p->ef); > @@ -1082,7 +1083,7 @@ static void restore_process_worker(struct work_struct > *work) > pr_err("Failed to restore queues of pasid 0x%x\n", p->pasid); } > > -void kfd_suspend_all_processes(void) > +void kfd_suspend_all_processes(bool pre_reset) > { > struct kfd_process *p; > unsigned int temp; > @@ -1092,7 +1093,7 @@ void kfd_suspend_all_processes(void) > cancel_delayed_work_sync(&p->eviction_work); > cancel_delayed_work_sync(&p->restore_work); > > - if (kfd_process_evict_queues(p)) > + if (kfd_process_evict_queues(p, pre_reset)) > pr_err("Failed to suspend process 0x%x\n", p->pasid); > dma_fence_signal(p->ef); > dma_fence_put(p->ef); > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c > b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c > index d3eacf72e8db..8fa856e6a03f 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c > @@ -374,7 +374,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, > unsigned int qid) > /* destroy kernel queue (DIQ) */ > dqm = pqn->kq->dev->dqm; > dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd); > - kernel_queue_uninit(pqn->kq); > + kernel_queue_uninit(pqn->kq, false); > } > > if (pqn->q) { > -- > 2.24.1 > _______________________________________________ amd-gfx mailing list [email protected] https://lists.freedesktop.org/mailman/listinfo/amd-gfx
