Okay

Can you send me the patch (in attachment) once you finished it, I need to 
verify it on SRIOV 

Thanks 

_____________________________________
Monk Liu|GPU Virtualization Team |AMD


-----Original Message-----
From: Kuehling, Felix <[email protected]> 
Sent: Friday, December 20, 2019 3:56 PM
To: Liu, Monk <[email protected]>; [email protected]
Cc: Liu, Shaoyun <[email protected]>
Subject: Re: [PATCH 1/1] drm/amdkfd: Don't touch the hardware in pre_reset 
callback

You should be able to save the plain text email and pass that to "git am". It's 
trivial with Thunderbird on a Linux system. If you're using outlook, I'm not 
sure.

Anyway, I'm already reworking the patch based on Shaoyun's suggestion and some 
ideas it gave me.

Regards,
   Felix

On 2019-12-19 23:23, Liu, Monk wrote:
> Hi Felix
>
> Do you know how I can get a "xxx.patch" file from the email from you ??
>
> _____________________________________
> Monk Liu|GPU Virtualization Team |AMD
>
>
> -----Original Message-----
> From: Kuehling, Felix <[email protected]>
> Sent: Friday, December 20, 2019 10:09 AM
> To: [email protected]
> Cc: Liu, Shaoyun <[email protected]>; Liu, Monk <[email protected]>
> Subject: [PATCH 1/1] drm/amdkfd: Don't touch the hardware in pre_reset 
> callback
>
> The intention of the pre_reset callback is to update the driver state to 
> reflect that all user mode queues are preempted and the HIQ is destroyed. 
> However we should not actually preempt any queues or otherwise touch the 
> hardware because it is presumably hanging.
> The impending reset will take care of actually stopping all queues.
>
> This should prevent KIQ register access hanging on SRIOV function level reset 
> (FLR). It should also speed up the reset by avoiding unnecessary timeouts 
> from a potentially hanging GPU scheduler.
>
> CC: shaoyunl <[email protected]>
> CC: Liu Monk <[email protected]>
> Signed-off-by: Felix Kuehling <[email protected]>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_device.c       | 24 ++++++++++-------
>   .../drm/amd/amdkfd/kfd_device_queue_manager.c | 27 ++++++++++++-------  
> .../drm/amd/amdkfd/kfd_device_queue_manager.h |  5 ++--  
> drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c |  8 +++---
>   .../gpu/drm/amd/amdkfd/kfd_packet_manager.c   |  4 +--
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  8 +++---
>   drivers/gpu/drm/amd/amdkfd/kfd_process.c      | 11 ++++----
>   .../amd/amdkfd/kfd_process_queue_manager.c    |  2 +-
>   8 files changed, 53 insertions(+), 36 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index c6b6901bbda3..796996a0d926 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -486,6 +486,7 @@ static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned 
> int buf_size,
>                               unsigned int chunk_size);
>   static void kfd_gtt_sa_fini(struct kfd_dev *kfd);
>   
> +static void kfd_suspend(struct kfd_dev *kfd, bool pre_reset);
>   static int kfd_resume(struct kfd_dev *kfd);
>   
>   struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, @@ -728,7 +729,7 @@ int 
> kgd2kfd_pre_reset(struct kfd_dev *kfd)  {
>       if (!kfd->init_complete)
>               return 0;
> -     kgd2kfd_suspend(kfd);
> +     kfd_suspend(kfd, true);
>   
>       kfd_signal_reset_event(kfd);
>       return 0;
> @@ -767,13 +768,7 @@ void kgd2kfd_suspend(struct kfd_dev *kfd)
>       if (!kfd->init_complete)
>               return;
>   
> -     /* For first KFD device suspend all the KFD processes */
> -     if (atomic_inc_return(&kfd_locked) == 1)
> -             kfd_suspend_all_processes();
> -
> -     kfd->dqm->ops.stop(kfd->dqm);
> -
> -     kfd_iommu_suspend(kfd);
> +     kfd_suspend(kfd, false);
>   }
>   
>   int kgd2kfd_resume(struct kfd_dev *kfd) @@ -795,6 +790,17 @@ int 
> kgd2kfd_resume(struct kfd_dev *kfd)
>       return ret;
>   }
>   
> +static void kfd_suspend(struct kfd_dev *kfd, bool pre_reset) {
> +     /* For first KFD device suspend all the KFD processes */
> +     if (atomic_inc_return(&kfd_locked) == 1)
> +             kfd_suspend_all_processes(pre_reset);
> +
> +     kfd->dqm->ops.stop(kfd->dqm, pre_reset);
> +
> +     kfd_iommu_suspend(kfd);
> +}
> +
>   static int kfd_resume(struct kfd_dev *kfd)  {
>       int err = 0;
> @@ -877,7 +883,7 @@ int kgd2kfd_quiesce_mm(struct mm_struct *mm)
>       if (!p)
>               return -ESRCH;
>   
> -     r = kfd_process_evict_queues(p);
> +     r = kfd_process_evict_queues(p, false);
>   
>       kfd_unref_process(p);
>       return r;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index f7f6df40875e..3a49944164da 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -592,7 +592,8 @@ static int update_queue(struct 
> device_queue_manager *dqm, struct queue *q)  }
>   
>   static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
> -                                     struct qcm_process_device *qpd)
> +                                     struct qcm_process_device *qpd,
> +                                     bool pre_reset)
>   {
>       struct queue *q;
>       struct mqd_manager *mqd_mgr;
> @@ -622,6 +623,8 @@ static int evict_process_queues_nocpsch(struct 
> device_queue_manager *dqm,
>   
>               if (WARN_ONCE(!dqm->sched_running, "Evict when stopped\n"))
>                       continue;
> +             if (pre_reset)
> +                     continue;
>   
>               retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
>                               KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN, @@ -639,7 
> +642,8 @@ static int 
> evict_process_queues_nocpsch(struct device_queue_manager *dqm,  }
>   
>   static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
> -                                   struct qcm_process_device *qpd)
> +                                   struct qcm_process_device *qpd,
> +                                   bool pre_reset)
>   {
>       struct queue *q;
>       struct kfd_process_device *pdd;
> @@ -664,6 +668,10 @@ static int evict_process_queues_cpsch(struct 
> device_queue_manager *dqm,
>               q->properties.is_active = false;
>               dqm->queue_count--;
>       }
> +
> +     if (pre_reset)
> +             goto out;
> +
>       retval = execute_queues_cpsch(dqm,
>                               qpd->is_debug ?
>                               KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES :
> @@ -944,10 +952,10 @@ static int start_nocpsch(struct device_queue_manager 
> *dqm)
>       return 0;
>   }
>   
> -static int stop_nocpsch(struct device_queue_manager *dqm)
> +static int stop_nocpsch(struct device_queue_manager *dqm, bool
> +pre_reset)
>   {
>       if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
> -             pm_uninit(&dqm->packets);
> +             pm_uninit(&dqm->packets, pre_reset);
>       dqm->sched_running = false;
>   
>       return 0;
> @@ -1107,20 +1115,21 @@ static int start_cpsch(struct device_queue_manager 
> *dqm)
>       return 0;
>   fail_allocate_vidmem:
>   fail_set_sched_resources:
> -     pm_uninit(&dqm->packets);
> +     pm_uninit(&dqm->packets, false);
>   fail_packet_manager_init:
>       return retval;
>   }
>   
> -static int stop_cpsch(struct device_queue_manager *dqm)
> +static int stop_cpsch(struct device_queue_manager *dqm, bool 
> +pre_reset)
>   {
>       dqm_lock(dqm);
> -     unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
> +     if (!pre_reset)
> +             unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
>       dqm->sched_running = false;
>       dqm_unlock(dqm);
>   
>       kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
> -     pm_uninit(&dqm->packets);
> +     pm_uninit(&dqm->packets, pre_reset);
>   
>       return 0;
>   }
> @@ -1891,7 +1900,7 @@ int kfd_process_vm_fault(struct device_queue_manager 
> *dqm,
>               return -EINVAL;
>       pdd = kfd_get_process_device_data(dqm->dev, p);
>       if (pdd)
> -             ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd);
> +             ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd, false);
>       kfd_unref_process(p);
>   
>       return ret;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h 
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> index a8c37e6da027..9f82f95f6a92 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
> @@ -103,7 +103,7 @@ struct device_queue_manager_ops {
>   
>       int     (*initialize)(struct device_queue_manager *dqm);
>       int     (*start)(struct device_queue_manager *dqm);
> -     int     (*stop)(struct device_queue_manager *dqm);
> +     int     (*stop)(struct device_queue_manager *dqm, bool pre_reset);
>       void    (*uninitialize)(struct device_queue_manager *dqm);
>       int     (*create_kernel_queue)(struct device_queue_manager *dqm,
>                                       struct kernel_queue *kq,
> @@ -129,7 +129,8 @@ struct device_queue_manager_ops {
>                       struct qcm_process_device *qpd);
>   
>       int (*evict_process_queues)(struct device_queue_manager *dqm,
> -                                 struct qcm_process_device *qpd);
> +                                 struct qcm_process_device *qpd,
> +                                 bool pre_reset);
>       int (*restore_process_queues)(struct device_queue_manager *dqm,
>                                     struct qcm_process_device *qpd);
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> index 2d56dc534459..dbd2e8e9ae69 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> @@ -195,9 +195,9 @@ static bool kq_initialize(struct kernel_queue *kq, 
> struct kfd_dev *dev,  }
>   
>   /* Uninitialize a kernel queue and free all its memory usages. */ 
> -static void kq_uninitialize(struct kernel_queue *kq)
> +static void kq_uninitialize(struct kernel_queue *kq, bool pre_reset)
>   {
> -     if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ)
> +     if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && !pre_reset)
>               kq->mqd_mgr->destroy_mqd(kq->mqd_mgr,
>                                       kq->queue->mqd,
>                                       KFD_PREEMPT_TYPE_WAVEFRONT_RESET, @@ 
> -337,9 +337,9 @@ struct 
> kernel_queue *kernel_queue_init(struct kfd_dev *dev,
>       return NULL;
>   }
>   
> -void kernel_queue_uninit(struct kernel_queue *kq)
> +void kernel_queue_uninit(struct kernel_queue *kq, bool pre_reset)
>   {
> -     kq_uninitialize(kq);
> +     kq_uninitialize(kq, pre_reset);
>       kfree(kq);
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> index 6cabed06ef5d..7732a3bbebd6 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> @@ -264,10 +264,10 @@ int pm_init(struct packet_manager *pm, struct 
> device_queue_manager *dqm)
>       return 0;
>   }
>   
> -void pm_uninit(struct packet_manager *pm)
> +void pm_uninit(struct packet_manager *pm, bool pre_reset)
>   {
>       mutex_destroy(&pm->lock);
> -     kernel_queue_uninit(pm->priv_queue);
> +     kernel_queue_uninit(pm->priv_queue, pre_reset);
>   }
>   
>   int pm_send_set_resources(struct packet_manager *pm, diff --git 
> a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 087e96838997..9bc4ced4acba 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -762,9 +762,9 @@ struct kfd_process *kfd_get_process(const struct 
> task_struct *);  struct kfd_process 
> *kfd_lookup_process_by_pasid(unsigned int pasid);  struct kfd_process 
> *kfd_lookup_process_by_mm(const struct mm_struct *mm);  void 
> kfd_unref_process(struct kfd_process *p); -int 
> kfd_process_evict_queues(struct kfd_process *p);
> +int kfd_process_evict_queues(struct kfd_process *p, bool pre_reset);
>   int kfd_process_restore_queues(struct kfd_process *p); -void 
> kfd_suspend_all_processes(void);
> +void kfd_suspend_all_processes(bool pre_reset);
>   int kfd_resume_all_processes(void);
>   
>   int kfd_process_device_init_vm(struct kfd_process_device *pdd, @@ -883,7 
> +883,7 @@ struct device_queue_manager *device_queue_manager_init(struct 
> kfd_dev *dev);  void device_queue_manager_uninit(struct device_queue_manager 
> *dqm);  struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
>                                       enum kfd_queue_type type);
> -void kernel_queue_uninit(struct kernel_queue *kq);
> +void kernel_queue_uninit(struct kernel_queue *kq, bool pre_reset);
>   int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned 
> int pasid);
>   
>   /* Process Queue Manager */
> @@ -974,7 +974,7 @@ extern const struct packet_manager_funcs 
> kfd_vi_pm_funcs;  extern const struct packet_manager_funcs 
> kfd_v9_pm_funcs;
>   
>   int pm_init(struct packet_manager *pm, struct device_queue_manager 
> *dqm); -void pm_uninit(struct packet_manager *pm);
> +void pm_uninit(struct packet_manager *pm, bool pre_reset);
>   int pm_send_set_resources(struct packet_manager *pm,
>                               struct scheduling_resources *res);
>   int pm_send_runlist(struct packet_manager *pm, struct list_head 
> *dqm_queues); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 536a153ac9a4..7bcadd3a1e3b 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -948,7 +948,7 @@ struct kfd_process *kfd_lookup_process_by_mm(const struct 
> mm_struct *mm)
>    * Eviction is reference-counted per process-device. This means multiple
>    * evictions from different sources can be nested safely.
>    */
> -int kfd_process_evict_queues(struct kfd_process *p)
> +int kfd_process_evict_queues(struct kfd_process *p, bool pre_reset)
>   {
>       struct kfd_process_device *pdd;
>       int r = 0;
> @@ -956,7 +956,8 @@ int kfd_process_evict_queues(struct kfd_process 
> *p)
>   
>       list_for_each_entry(pdd, &p->per_device_data, per_device_list) {
>               r = pdd->dev->dqm->ops.evict_process_queues(pdd->dev->dqm,
> -                                                         &pdd->qpd);
> +                                                         &pdd->qpd,
> +                                                         pre_reset);
>               if (r) {
>                       pr_err("Failed to evict process queues\n");
>                       goto fail;
> @@ -1026,7 +1027,7 @@ static void evict_process_worker(struct work_struct 
> *work)
>       flush_delayed_work(&p->restore_work);
>   
>       pr_debug("Started evicting pasid 0x%x\n", p->pasid);
> -     ret = kfd_process_evict_queues(p);
> +     ret = kfd_process_evict_queues(p, false);
>       if (!ret) {
>               dma_fence_signal(p->ef);
>               dma_fence_put(p->ef);
> @@ -1082,7 +1083,7 @@ static void restore_process_worker(struct work_struct 
> *work)
>               pr_err("Failed to restore queues of pasid 0x%x\n", p->pasid);  }
>   
> -void kfd_suspend_all_processes(void)
> +void kfd_suspend_all_processes(bool pre_reset)
>   {
>       struct kfd_process *p;
>       unsigned int temp;
> @@ -1092,7 +1093,7 @@ void kfd_suspend_all_processes(void)
>               cancel_delayed_work_sync(&p->eviction_work);
>               cancel_delayed_work_sync(&p->restore_work);
>   
> -             if (kfd_process_evict_queues(p))
> +             if (kfd_process_evict_queues(p, pre_reset))
>                       pr_err("Failed to suspend process 0x%x\n", p->pasid);
>               dma_fence_signal(p->ef);
>               dma_fence_put(p->ef);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> index d3eacf72e8db..8fa856e6a03f 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> @@ -374,7 +374,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, 
> unsigned int qid)
>               /* destroy kernel queue (DIQ) */
>               dqm = pqn->kq->dev->dqm;
>               dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
> -             kernel_queue_uninit(pqn->kq);
> +             kernel_queue_uninit(pqn->kq, false);
>       }
>   
>       if (pqn->q) {
> --
> 2.24.1
>
_______________________________________________
amd-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to