[AMD Official Use Only - Internal Distribution Only]

I see. Thank you Felix for the explanation.

Regards,
Oak

-----Original Message-----
From: Kuehling, Felix <[email protected]> 
Sent: Friday, December 20, 2019 12:28 PM
To: Zeng, Oak <[email protected]>; [email protected]
Subject: Re: [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch

On 2019-12-20 12:22, Zeng, Oak wrote:
> [AMD Official Use Only - Internal Distribution Only]
>
>
>
> Regards,
> Oak
>
> -----Original Message-----
> From: amd-gfx <[email protected]> On Behalf Of 
> Felix Kuehling
> Sent: Friday, December 20, 2019 3:30 AM
> To: [email protected]
> Subject: [PATCH 4/4] drm/amdkfd: Avoid hanging hardware in stop_cpsch
>
> Don't use the HWS if it's known to be hanging. In a reset also don't try to 
> destroy the HIQ because that may hang on SRIOV if the KIQ is unresponsive.
>
> Signed-off-by: Felix Kuehling <[email protected]>
> ---
>   .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c    | 12 ++++++++----
>   drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c        |  8 ++++----
>   drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c      |  4 ++--
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h                |  4 ++--
>   .../gpu/drm/amd/amdkfd/kfd_process_queue_manager.c   |  2 +-
>   5 files changed, 17 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index a7e9ec1b3ce3..d7eb6ac37f62 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -946,7 +946,7 @@ static int start_nocpsch(struct device_queue_manager 
> *dqm)  static int stop_nocpsch(struct device_queue_manager *dqm)  {
>       if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
> -             pm_uninit(&dqm->packets);
> +             pm_uninit(&dqm->packets, false);
>       dqm->sched_running = false;
>   
>       return 0;
> @@ -1114,20 +1114,24 @@ static int start_cpsch(struct device_queue_manager 
> *dqm)
>       return 0;
>   fail_allocate_vidmem:
>   fail_set_sched_resources:
> -     pm_uninit(&dqm->packets);
> +     pm_uninit(&dqm->packets, false);
>   fail_packet_manager_init:
>       return retval;
>   }
>   
>   static int stop_cpsch(struct device_queue_manager *dqm)  {
> +     bool hanging;
> +
>       dqm_lock(dqm);
> -     unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
> +     if (!dqm->is_hws_hang)
> +             unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
> +     hanging = dqm->is_hws_hang || dqm->is_resetting;
> [Oak] I don't think dqm->is_resetting is necessary. If is_resetting is true, 
> is_hws_hang is always true. The two flags are always the same except during a 
> period in which an HWS hang has been detected but kfd_pre_reset has not yet 
> been called. In that period, is_hws_hang is true but is_resetting is false, so 
> the "|| is_resetting" doesn't help.

This is not necessarily true. A GPU reset can be initiated by amdgpu — for 
example, when the graphics engine is hanging. In that case the HWS isn't 
necessarily hanging. I added "|| resetting" here to avoid touching hardware in 
an unknown state in pm_uninit=>kq_uninitialize in that case.

Regards,
   Felix

>
> Also see my comment on the 3rd patch.
>
>       dqm->sched_running = false;
>       dqm_unlock(dqm);
>   
>       kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
> -     pm_uninit(&dqm->packets);
> +     pm_uninit(&dqm->packets, hanging);
>   
>       return 0;
>   }
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> index 2d56dc534459..bae706462f96 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
> @@ -195,9 +195,9 @@ static bool kq_initialize(struct kernel_queue *kq, 
> struct kfd_dev *dev,  }
>   
>   /* Uninitialize a kernel queue and free all its memory usages. */ 
> -static void kq_uninitialize(struct kernel_queue *kq)
> +static void kq_uninitialize(struct kernel_queue *kq, bool hanging)
>   {
> -     if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ)
> +     if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && !hanging)
>               kq->mqd_mgr->destroy_mqd(kq->mqd_mgr,
>                                       kq->queue->mqd,
>                                       KFD_PREEMPT_TYPE_WAVEFRONT_RESET, @@ 
> -337,9 +337,9 @@ struct 
> kernel_queue *kernel_queue_init(struct kfd_dev *dev,
>       return NULL;
>   }
>   
> -void kernel_queue_uninit(struct kernel_queue *kq)
> +void kernel_queue_uninit(struct kernel_queue *kq, bool hanging)
>   {
> -     kq_uninitialize(kq);
> +     kq_uninitialize(kq, hanging);
>       kfree(kq);
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> index 6cabed06ef5d..dc406e6dee23 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
> @@ -264,10 +264,10 @@ int pm_init(struct packet_manager *pm, struct 
> device_queue_manager *dqm)
>       return 0;
>   }
>   
> -void pm_uninit(struct packet_manager *pm)
> +void pm_uninit(struct packet_manager *pm, bool hanging)
>   {
>       mutex_destroy(&pm->lock);
> -     kernel_queue_uninit(pm->priv_queue);
> +     kernel_queue_uninit(pm->priv_queue, hanging);
>   }
>   
>   int pm_send_set_resources(struct packet_manager *pm, diff --git 
> a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 087e96838997..8ac680dc90f1 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -883,7 +883,7 @@ struct device_queue_manager 
> *device_queue_manager_init(struct kfd_dev *dev);  void 
> device_queue_manager_uninit(struct device_queue_manager *dqm);  struct 
> kernel_queue *kernel_queue_init(struct kfd_dev *dev,
>                                       enum kfd_queue_type type);
> -void kernel_queue_uninit(struct kernel_queue *kq);
> +void kernel_queue_uninit(struct kernel_queue *kq, bool hanging);
>   int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned 
> int pasid);
>   
>   /* Process Queue Manager */
> @@ -974,7 +974,7 @@ extern const struct packet_manager_funcs 
> kfd_vi_pm_funcs;  extern const struct packet_manager_funcs 
> kfd_v9_pm_funcs;
>   
>   int pm_init(struct packet_manager *pm, struct device_queue_manager 
> *dqm); -void pm_uninit(struct packet_manager *pm);
> +void pm_uninit(struct packet_manager *pm, bool hanging);
>   int pm_send_set_resources(struct packet_manager *pm,
>                               struct scheduling_resources *res);
>   int pm_send_runlist(struct packet_manager *pm, struct list_head 
> *dqm_queues); diff --git 
> a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> index d3eacf72e8db..8fa856e6a03f 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> @@ -374,7 +374,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, 
> unsigned int qid)
>               /* destroy kernel queue (DIQ) */
>               dqm = pqn->kq->dev->dqm;
>               dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
> -             kernel_queue_uninit(pqn->kq);
> +             kernel_queue_uninit(pqn->kq, false);
>       }
>   
>       if (pqn->q) {
> --
> 2.24.1
>
> _______________________________________________
> amd-gfx mailing list
> [email protected]
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flist
> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7Coa
> k.zeng%40amd.com%7C7602eef96b0545baac8608d78526dde0%7C3dd8961fe4884e60
> 8e11a82d994e183d%7C0%7C0%7C637124274407587881&amp;sdata=BlPgWgLi%2Frtk
> cPQLu%2FbK0dOrvg6qm4IsGVfuoUo%2B%2B1g%3D&amp;reserved=0
_______________________________________________
amd-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to