Am 2021-08-03 um 2:57 p.m. schrieb Eric Huang:
> It is to differentiate the calling scenarios so that evicting queues
> behaves properly in each; for example, a GPU reset doesn't need to roll
> back by restoring partially evicted queues.
>
> Signed-off-by: Eric Huang <jinhuieric.hu...@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c |  2 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  4 ++--
>  drivers/gpu/drm/amd/amdkfd/kfd_device.c    | 10 +++++-----
>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h      |  4 ++--
>  drivers/gpu/drm/amd/amdkfd/kfd_process.c   | 18 ++++++++++--------
>  5 files changed, 20 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index 77044e8ba4e6..59ce5a17a834 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -190,7 +190,7 @@ void amdgpu_amdkfd_interrupt(struct amdgpu_device *adev,
>  void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool run_pm)
>  {
>       if (adev->kfd.dev)
> -             kgd2kfd_suspend(adev->kfd.dev, run_pm);
> +             kgd2kfd_suspend(adev->kfd.dev, run_pm, false);

If suspend fails, this should return an error that is handled in
amdgpu_device_suspend. Maybe this could be fixed in a follow-up patch.
That would mean kgd2kfd_suspend and kfd_suspend_all_processes should no
longer return void, but instead return an error code on failure, at least
when force=false.

Otherwise this patch is

Reviewed-by: Felix Kuehling <felix.kuehl...@amd.com>


>  }
>  
>  int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool run_pm, bool sync)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index 332ccba00e69..b7e46ad0507e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -372,7 +372,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
>                        struct drm_device *ddev,
>                        const struct kgd2kfd_shared_resources *gpu_resources);
>  void kgd2kfd_device_exit(struct kfd_dev *kfd);
> -void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm);
> +void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm, bool force);
>  int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm, bool sync);
>  int kgd2kfd_pre_reset(struct kfd_dev *kfd);
>  int kgd2kfd_post_reset(struct kfd_dev *kfd);
> @@ -407,7 +407,7 @@ static inline void kgd2kfd_device_exit(struct kfd_dev 
> *kfd)
>  {
>  }
>  
> -static inline void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
> +static inline void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm, bool 
> force)
>  {
>  }
>  
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index 24b5e0aa1eac..48e51ee8de56 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -940,7 +940,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
>  void kgd2kfd_device_exit(struct kfd_dev *kfd)
>  {
>       if (kfd->init_complete) {
> -             kgd2kfd_suspend(kfd, false);
> +             kgd2kfd_suspend(kfd, false, true);
>               svm_migrate_fini((struct amdgpu_device *)kfd->kgd);
>               device_queue_manager_uninit(kfd->dqm);
>               kfd_interrupt_exit(kfd);
> @@ -965,7 +965,7 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)
>  
>       kfd->dqm->ops.pre_reset(kfd->dqm);
>  
> -     kgd2kfd_suspend(kfd, false);
> +     kgd2kfd_suspend(kfd, false, true);
>  
>       kfd_signal_reset_event(kfd);
>       return 0;
> @@ -1001,7 +1001,7 @@ bool kfd_is_locked(void)
>       return  (atomic_read(&kfd_locked) > 0);
>  }
>  
> -void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
> +void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm, bool force)
>  {
>       if (!kfd->init_complete)
>               return;
> @@ -1010,7 +1010,7 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
>       if (!run_pm) {
>               /* For first KFD device suspend all the KFD processes */
>               if (atomic_inc_return(&kfd_locked) == 1)
> -                     kfd_suspend_all_processes();
> +                     kfd_suspend_all_processes(force);
>       }
>  
>       kfd->dqm->ops.stop(kfd->dqm);
> @@ -1122,7 +1122,7 @@ int kgd2kfd_quiesce_mm(struct mm_struct *mm)
>               return -ESRCH;
>  
>       WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid);
> -     r = kfd_process_evict_queues(p);
> +     r = kfd_process_evict_queues(p, true);
>  
>       kfd_unref_process(p);
>       return r;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 3d5d3994d8a4..e80fb64a6dcc 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -1042,9 +1042,9 @@ static inline struct kfd_process_device 
> *kfd_process_device_from_gpuidx(
>  }
>  
>  void kfd_unref_process(struct kfd_process *p);
> -int kfd_process_evict_queues(struct kfd_process *p);
> +int kfd_process_evict_queues(struct kfd_process *p, bool force);
>  int kfd_process_restore_queues(struct kfd_process *p);
> -void kfd_suspend_all_processes(void);
> +void kfd_suspend_all_processes(bool force);
>  /*
>   * kfd_resume_all_processes:
>   *   bool sync: If kfd_resume_all_processes() should wait for the
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 38a9dee40785..a41ece37bc3c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -1875,20 +1875,22 @@ struct kfd_process *kfd_lookup_process_by_mm(const 
> struct mm_struct *mm)
>   * Eviction is reference-counted per process-device. This means multiple
>   * evictions from different sources can be nested safely.
>   */
> -int kfd_process_evict_queues(struct kfd_process *p)
> +int kfd_process_evict_queues(struct kfd_process *p, bool force)
>  {
> -     int r = 0;
> +     int r = 0, r_tmp = 0;
>       int i;
>       unsigned int n_evicted = 0;
>  
>       for (i = 0; i < p->n_pdds; i++) {
>               struct kfd_process_device *pdd = p->pdds[i];
>  
> -             r = pdd->dev->dqm->ops.evict_process_queues(pdd->dev->dqm,
> +             r_tmp = pdd->dev->dqm->ops.evict_process_queues(pdd->dev->dqm,
>                                                           &pdd->qpd);
> -             if (r) {
> +             if (r_tmp) {
>                       pr_err("Failed to evict process queues\n");
> -                     goto fail;
> +                     r = r_tmp;
> +                     if (!force)
> +                             goto fail;
>               }
>               n_evicted++;
>       }
> @@ -2079,7 +2081,7 @@ static void evict_process_worker(struct work_struct 
> *work)
>       p->last_evict_timestamp = get_jiffies_64();
>  
>       pr_debug("Started evicting pasid 0x%x\n", p->pasid);
> -     ret = kfd_process_evict_queues(p);
> +     ret = kfd_process_evict_queues(p, false);
>       if (!ret) {
>               dma_fence_signal(p->ef);
>               dma_fence_put(p->ef);
> @@ -2147,7 +2149,7 @@ static void restore_process_worker(struct work_struct 
> *work)
>               pr_err("Failed to restore queues of pasid 0x%x\n", p->pasid);
>  }
>  
> -void kfd_suspend_all_processes(void)
> +void kfd_suspend_all_processes(bool force)
>  {
>       struct kfd_process *p;
>       unsigned int temp;
> @@ -2158,7 +2160,7 @@ void kfd_suspend_all_processes(void)
>               cancel_delayed_work_sync(&p->eviction_work);
>               cancel_delayed_work_sync(&p->restore_work);
>  
> -             if (kfd_process_evict_queues(p))
> +             if (kfd_process_evict_queues(p, force))
>                       pr_err("Failed to suspend process 0x%x\n", p->pasid);
>               dma_fence_signal(p->ef);
>               dma_fence_put(p->ef);

Reply via email to