Re: [PATCH v5 3/6] drm/scheduler: rework job destruction

2019-05-29 Thread Daniel Vetter
On Thu, Apr 18, 2019 at 5:00 PM Andrey Grodzovsky
 wrote:
>
> From: Christian König 
>
> We now destroy finished jobs from the worker thread to make sure that
> we never destroy a job currently in timeout processing.
> By this we avoid holding the lock around the ring mirror list in
> drm_sched_stop, which should solve a deadlock reported by a user.
>
> v2: Remove unused variable.
> v4: Move guilty job free into sched code.
> v5:
> Move the sched->hw_rq_count increment to drm_sched_start to account for
> the counter decrement in drm_sched_stop even when we don't resubmit
> jobs because the guilty job already signaled.
>
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109692
>
> Signed-off-by: Christian König 
> Signed-off-by: Andrey Grodzovsky 

$ make htmldocs

./drivers/gpu/drm/scheduler/sched_main.c:365: warning: Function
parameter or member 'bad' not described in 'drm_sched_stop'

Please fix, thanks.
-Daniel
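
For reference, the warning points at the new 'bad' parameter this patch adds to
drm_sched_stop(); a short kerneldoc addition along these lines would silence it
(a sketch only, the exact wording here is my own):

/**
 * drm_sched_stop - stop the scheduler
 *
 * @sched: scheduler instance
 * @bad: the guilty job that triggered the timeout handling, if any
 *
 * Stop the scheduler for GPU recovery.
 */
void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad);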

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |   9 +-
>  drivers/gpu/drm/etnaviv/etnaviv_dump.c |   4 -
>  drivers/gpu/drm/etnaviv/etnaviv_sched.c|   2 +-
>  drivers/gpu/drm/lima/lima_sched.c  |   2 +-
>  drivers/gpu/drm/panfrost/panfrost_job.c|   2 +-
>  drivers/gpu/drm/scheduler/sched_main.c | 159 
> +
>  drivers/gpu/drm/v3d/v3d_sched.c|   2 +-
>  include/drm/gpu_scheduler.h|   6 +-
>  8 files changed, 102 insertions(+), 84 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 7cee269..a0e165c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3334,7 +3334,7 @@ static int amdgpu_device_pre_asic_reset(struct 
> amdgpu_device *adev,
> if (!ring || !ring->sched.thread)
> continue;
>
> -   drm_sched_stop(&ring->sched);
> +   drm_sched_stop(&ring->sched, &job->base);
>
> /* after all hw jobs are reset, hw fence is meaningless, so 
> force_completion */
> amdgpu_fence_driver_force_completion(ring);
> @@ -3343,8 +3343,6 @@ static int amdgpu_device_pre_asic_reset(struct 
> amdgpu_device *adev,
> if(job)
> drm_sched_increase_karma(&job->base);
>
> -
> -
> if (!amdgpu_sriov_vf(adev)) {
>
> if (!need_full_reset)
> @@ -3482,8 +3480,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info 
> *hive,
> return r;
>  }
>
> -static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev,
> - struct amdgpu_job *job)
> +static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev)
>  {
> int i;
>
> @@ -3623,7 +3620,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device 
> *adev,
>
> /* Post ASIC reset for all devs .*/
> list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
> -   amdgpu_device_post_asic_reset(tmp_adev, tmp_adev == adev ? 
> job : NULL);
> +   amdgpu_device_post_asic_reset(tmp_adev);
>
> if (r) {
> /* bad news, how to tell it to userspace ? */
> diff --git a/drivers/gpu/drm/etnaviv/etnaviv_dump.c 
> b/drivers/gpu/drm/etnaviv/etnaviv_dump.c
> index 33854c9..5778d9c 100644
> --- a/drivers/gpu/drm/etnaviv/etnaviv_dump.c
> +++ b/drivers/gpu/drm/etnaviv/etnaviv_dump.c
> @@ -135,13 +135,11 @@ void etnaviv_core_dump(struct etnaviv_gpu *gpu)
> mmu_size + gpu->buffer.size;
>
> /* Add in the active command buffers */
> -   spin_lock_irqsave(&gpu->sched.job_list_lock, flags);
> list_for_each_entry(s_job, &gpu->sched.ring_mirror_list, node) {
> submit = to_etnaviv_submit(s_job);
> file_size += submit->cmdbuf.size;
> n_obj++;
> }
> -   spin_unlock_irqrestore(&gpu->sched.job_list_lock, flags);
>
> /* Add in the active buffer objects */
> list_for_each_entry(vram, &gpu->mmu->mappings, mmu_node) {
> @@ -183,14 +181,12 @@ void etnaviv_core_dump(struct etnaviv_gpu *gpu)
>   gpu->buffer.size,
>   etnaviv_cmdbuf_get_va(&gpu->buffer));
>
> -   spin_lock_irqsave(&gpu->sched.job_list_lock, flags);
> list_for_each_entry(s_job, &gpu->sched.ring_mirror_list, node) {
> submit = to_etnaviv_submit(s_job);
> etnaviv_core_dump_mem(&iter, ETDUMP_BUF_CMD,
>   submit->cmdbuf.vaddr, 
> submit->cmdbuf.size,
>   etnaviv_cmdbuf_get_va(&submit->cmdbuf));
> }
> -   spin_unlock_irqrestore(&gpu->sched.job_list_lock, flags);
>
> /* Reserve space for the bomap */
> if (n_bomap_pages) {
> diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c 
> b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
> index 6d24fea..a813c82 100644
> --- a/drivers/gpu

Re: [PATCH v5 3/6] drm/scheduler: rework job destruction

2019-04-23 Thread Grodzovsky, Andrey

On 4/23/19 10:44 AM, Zhou, David(ChunMing) wrote:
This patch is to fix the deadlock between fence->lock and
sched->job_list_lock, right?
So I suggest just moving list_del_init(&s_job->node) from
drm_sched_process_job to the work thread. That will avoid the deadlock
described in the link.


Do you mean going back to scheduling the work from the HW fence interrupt
handler and doing the deletion there? Yes, I suggested this as an option (take
a look at my comment 9 in https://bugs.freedesktop.org/show_bug.cgi?id=109692),
but since we still have to wait for all fences in flight to signal to avoid the
problem fixed in '3741540 drm/sched: Rework HW fence processing.', this becomes
somewhat complicated, so Christian came up with the core idea in this patch,
which is to make all deletions/insertions thread safe by guaranteeing they are
always done from one thread. It does simplify the handling.

Andrey
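
To make the shape of that concrete, here is a rough sketch of the idea (the
helper name sched_reap_finished_jobs and the details are illustrative, not the
exact code in the patch): finished jobs are reaped by the scheduler thread
itself rather than by the HW fence callback, so a job can never be freed while
the timeout handler is still inspecting it.

static void sched_reap_finished_jobs(struct drm_gpu_scheduler *sched)
{
	while (!list_empty(&sched->ring_mirror_list)) {
		struct drm_sched_job *job =
			list_first_entry(&sched->ring_mirror_list,
					 struct drm_sched_job, node);

		/* Jobs complete in order, so stop at the first unfinished one. */
		if (!dma_fence_is_signaled(&job->s_fence->finished))
			break;

		/*
		 * The list lock is kept only for the recovery paths that
		 * also touch the list while this thread is parked.
		 */
		spin_lock(&sched->job_list_lock);
		list_del_init(&job->node);
		spin_unlock(&sched->job_list_lock);

		/* Safe to free: no timeout handler can still be using it. */
		sched->ops->free_job(job);
	}
}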



-------- Original Message --------
Subject: Re: [PATCH v5 3/6] drm/scheduler: rework job destruction
From: "Grodzovsky, Andrey"
To: "Zhou, David(ChunMing)" 
,dri-devel@lists.freedesktop.org,amd-...@lists.freedesktop.org,e...@anholt.net,etna...@lists.freedesktop.org,ckoenig.leichtzumer...@gmail.com
CC: "Kazlauskas, Nicholas" ,"Koenig, Christian"


On 4/22/19 8:48 AM, Chunming Zhou wrote:
> Hi Andrey,
>
> static void drm_sched_process_job(struct dma_fence *f, struct
> dma_fence_cb *cb)
> {
> ...
>   spin_lock_irqsave(&sched->job_list_lock, flags);
>   /* remove job from ring_mirror_list */
>   list_del_init(&s_job->node);
>   spin_unlock_irqrestore(&sched->job_list_lock, flags);
> [David] How about just moving the above from the irq process to the worker?
> Any problem? Maybe I missed your previous discussion, but I think removing
> the lock for the list is a risk for future maintenance, even though you make
> sure it is thread safe currently.
>
> -David


We remove the lock exactly because insertion into and removal from the
list will now be done from exactly one thread at any time. So I am not
sure I understand what you mean.

Andrey


>
> ...
>
>   schedule_work(&s_job->finish_work);
> }
>
> On 2019/4/18 23:00, Andrey Grodzovsky wrote:
>> From: Christian König 
>>
>> We now destroy finished jobs from the worker thread to make sure that
>> we never destroy a job currently in timeout processing.
>> By this we avoid holding the lock around the ring mirror list in
>> drm_sched_stop, which should solve a deadlock reported by a user.
>>
>> v2: Remove unused variable.
>> v4: Move guilty job free into sched code.
>> v5:
>> Move the sched->hw_rq_count increment to drm_sched_start to account for
>> the counter decrement in drm_sched_stop even when we don't resubmit
>> jobs because the guilty job already signaled.
>>
>> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109692
>>
>> Signed-off-by: Christian König 
>> Signed-off-by: Andrey Grodzovsky 
>> ---
>>drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |   9 +-
>>drivers/gpu/drm/etnaviv/etnaviv_dump.c |   4 -
>>drivers/gpu/drm/etnaviv/etnaviv_sched.c|   2 +-
>>drivers/gpu/drm/lima/lima_sched.c  |   2 +-
>>drivers/gpu/drm/panfrost/panfrost_job.c|   2 +-
>>drivers/gpu/drm/scheduler/sched_main.c | 159 
>> +
>>drivers/gpu/drm/v3d/v3d_sched.c|   2 +-
>>include/drm/gpu_scheduler.h|   6 +-
>>8 files changed, 102 insertions(+), 84 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 7cee269..a0e165c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -3334,7 +3334,7 @@ static int amdgpu_device_pre_asic_reset(struct 
>> amdgpu_device *adev,
>>if (!ring || !ring->sched.thread)
>>continue;
>>
>> - drm_sched_stop(&ring->sched);
>> + drm_sched_stop(&ring->sched, &job->base);
>>
>>/* after all hw jobs are reset, hw fence is meaningless, so 
>> force_completion */
>>amdgpu_fence_driver_force_completion(ring);
>> @@ -3343,8 +3343,6 @@ static int amdgpu_device_pre_asic_reset(struct 
>> amdgpu_device *adev,
>>if(job)
>>drm_sched_increase_karma(&am

Re: [PATCH v5 3/6] drm/scheduler: rework job destruction

2019-04-23 Thread Zhou, David(ChunMing)
This patch is to fix the deadlock between fence->lock and
sched->job_list_lock, right?
So I suggest just moving list_del_init(&s_job->node) from
drm_sched_process_job to the work thread. That will avoid the deadlock
described in the link.


-------- Original Message --------
Subject: Re: [PATCH v5 3/6] drm/scheduler: rework job destruction
From: "Grodzovsky, Andrey"
To: "Zhou, David(ChunMing)" 
,dri-devel@lists.freedesktop.org,amd-...@lists.freedesktop.org,e...@anholt.net,etna...@lists.freedesktop.org,ckoenig.leichtzumer...@gmail.com
CC: "Kazlauskas, Nicholas" ,"Koenig, Christian"


On 4/22/19 8:48 AM, Chunming Zhou wrote:
> Hi Andrey,
>
> static void drm_sched_process_job(struct dma_fence *f, struct
> dma_fence_cb *cb)
> {
> ...
>   spin_lock_irqsave(&sched->job_list_lock, flags);
>   /* remove job from ring_mirror_list */
>   list_del_init(&s_job->node);
>   spin_unlock_irqrestore(&sched->job_list_lock, flags);
> [David] How about just moving the above from the irq process to the worker?
> Any problem? Maybe I missed your previous discussion, but I think removing
> the lock for the list is a risk for future maintenance, even though you make
> sure it is thread safe currently.
>
> -David


We remove the lock exactly because insertion into and removal from the
list will now be done from exactly one thread at any time. So I am not
sure I understand what you mean.

Andrey


>
> ...
>
>   schedule_work(&s_job->finish_work);
> }
>
> On 2019/4/18 23:00, Andrey Grodzovsky wrote:
>> From: Christian König 
>>
>> We now destroy finished jobs from the worker thread to make sure that
>> we never destroy a job currently in timeout processing.
>> By this we avoid holding the lock around the ring mirror list in
>> drm_sched_stop, which should solve a deadlock reported by a user.
>>
>> v2: Remove unused variable.
>> v4: Move guilty job free into sched code.
>> v5:
>> Move the sched->hw_rq_count increment to drm_sched_start to account for
>> the counter decrement in drm_sched_stop even when we don't resubmit
>> jobs because the guilty job already signaled.
>>
>> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109692
>>
>> Signed-off-by: Christian König 
>> Signed-off-by: Andrey Grodzovsky 
>> ---
>>drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |   9 +-
>>drivers/gpu/drm/etnaviv/etnaviv_dump.c |   4 -
>>drivers/gpu/drm/etnaviv/etnaviv_sched.c|   2 +-
>>drivers/gpu/drm/lima/lima_sched.c  |   2 +-
>>drivers/gpu/drm/panfrost/panfrost_job.c|   2 +-
>>drivers/gpu/drm/scheduler/sched_main.c | 159 
>> +
>>drivers/gpu/drm/v3d/v3d_sched.c|   2 +-
>>include/drm/gpu_scheduler.h|   6 +-
>>8 files changed, 102 insertions(+), 84 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 7cee269..a0e165c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -3334,7 +3334,7 @@ static int amdgpu_device_pre_asic_reset(struct 
>> amdgpu_device *adev,
>>if (!ring || !ring->sched.thread)
>>continue;
>>
>> - drm_sched_stop(&ring->sched);
>> + drm_sched_stop(&ring->sched, &job->base);
>>
>>/* after all hw jobs are reset, hw fence is meaningless, so 
>> force_completion */
>>amdgpu_fence_driver_force_completion(ring);
>> @@ -3343,8 +3343,6 @@ static int amdgpu_device_pre_asic_reset(struct 
>> amdgpu_device *adev,
>>if(job)
>>drm_sched_increase_karma(&job->base);
>>
>> -
>> -
>>if (!amdgpu_sriov_vf(adev)) {
>>
>>if (!need_full_reset)
>> @@ -3482,8 +3480,7 @@ static int amdgpu_do_asic_reset(struct 
>> amdgpu_hive_info *hive,
>>return r;
>>}
>>
>> -static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev,
>> -   struct amdgpu_job *job)
>> +static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev)
>>{
>>int i;
>>
>> @@ -3623,7 +3620,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device 
>> *adev,
>>
>>/* Post ASIC reset for all devs .*/
>>list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>> - amdgpu_device_post_asic_reset(tmp_adev, tmp_adev == adev ? job : NULL);
>> + amdgpu_device_post_asic_reset(tmp_adev);
>>
>>if (r) {
>>/* bad news, how to tell it to userspace ? */
>

Re: [PATCH v5 3/6] drm/scheduler: rework job destruction

2019-04-23 Thread Grodzovsky, Andrey

On 4/22/19 8:48 AM, Chunming Zhou wrote:
> Hi Andrey,
>
> static void drm_sched_process_job(struct dma_fence *f, struct
> dma_fence_cb *cb)
> {
> ...
>       spin_lock_irqsave(&sched->job_list_lock, flags);
>       /* remove job from ring_mirror_list */
>       list_del_init(&s_job->node);
>       spin_unlock_irqrestore(&sched->job_list_lock, flags);
> [David] How about just moving the above from the irq process to the worker?
> Any problem? Maybe I missed your previous discussion, but I think removing
> the lock for the list is a risk for future maintenance, even though you make
> sure it is thread safe currently.
>
> -David


We remove the lock exactly because insertion into and removal from the
list will now be done from exactly one thread at any time. So I am not
sure I understand what you mean.

Andrey


>
> ...
>
>       schedule_work(&s_job->finish_work);
> }
>
> On 2019/4/18 23:00, Andrey Grodzovsky wrote:
>> From: Christian König 
>>
>> We now destroy finished jobs from the worker thread to make sure that
>> we never destroy a job currently in timeout processing.
>> By this we avoid holding the lock around the ring mirror list in
>> drm_sched_stop, which should solve a deadlock reported by a user.
>>
>> v2: Remove unused variable.
>> v4: Move guilty job free into sched code.
>> v5:
>> Move the sched->hw_rq_count increment to drm_sched_start to account for
>> the counter decrement in drm_sched_stop even when we don't resubmit
>> jobs because the guilty job already signaled.
>>
>> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109692
>>
>> Signed-off-by: Christian König 
>> Signed-off-by: Andrey Grodzovsky 
>> ---
>>drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |   9 +-
>>drivers/gpu/drm/etnaviv/etnaviv_dump.c |   4 -
>>drivers/gpu/drm/etnaviv/etnaviv_sched.c|   2 +-
>>drivers/gpu/drm/lima/lima_sched.c  |   2 +-
>>drivers/gpu/drm/panfrost/panfrost_job.c|   2 +-
>>drivers/gpu/drm/scheduler/sched_main.c | 159 
>> +
>>drivers/gpu/drm/v3d/v3d_sched.c|   2 +-
>>include/drm/gpu_scheduler.h|   6 +-
>>8 files changed, 102 insertions(+), 84 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 7cee269..a0e165c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -3334,7 +3334,7 @@ static int amdgpu_device_pre_asic_reset(struct 
>> amdgpu_device *adev,
>>  if (!ring || !ring->sched.thread)
>>  continue;
>>
>> -drm_sched_stop(&ring->sched);
>> +drm_sched_stop(&ring->sched, &job->base);
>>
>>  /* after all hw jobs are reset, hw fence is meaningless, so 
>> force_completion */
>>  amdgpu_fence_driver_force_completion(ring);
>> @@ -3343,8 +3343,6 @@ static int amdgpu_device_pre_asic_reset(struct 
>> amdgpu_device *adev,
>>  if(job)
>>  drm_sched_increase_karma(&job->base);
>>
>> -
>> -
>>  if (!amdgpu_sriov_vf(adev)) {
>>
>>  if (!need_full_reset)
>> @@ -3482,8 +3480,7 @@ static int amdgpu_do_asic_reset(struct 
>> amdgpu_hive_info *hive,
>>  return r;
>>}
>>
>> -static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev,
>> -  struct amdgpu_job *job)
>> +static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev)
>>{
>>  int i;
>>
>> @@ -3623,7 +3620,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device 
>> *adev,
>>
>>  /* Post ASIC reset for all devs .*/
>>  list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
>> -amdgpu_device_post_asic_reset(tmp_adev, tmp_adev == adev ? job 
>> : NULL);
>> +amdgpu_device_post_asic_reset(tmp_adev);
>>
>>  if (r) {
>>  /* bad news, how to tell it to userspace ? */
>> diff --git a/drivers/gpu/drm/etnaviv/etnaviv_dump.c 
>> b/drivers/gpu/drm/etnaviv/etnaviv_dump.c
>> index 33854c9..5778d9c 100644
>> --- a/drivers/gpu/drm/etnaviv/etnaviv_dump.c
>> +++ b/drivers/gpu/drm/etnaviv/etnaviv_dump.c
>> @@ -135,13 +135,11 @@ void etnaviv_core_dump(struct etnaviv_gpu *gpu)
>>  mmu_size + gpu->buffer.size;
>>
>>  /* Add in the active command buffers */
>> -spin_lock_irqsave(&gpu->sched.job_list_lock, flags);
>>  list_for_each_entry(s_job, &gpu->sched.ring_mirror_list, node) {
>>  submit = to_etnaviv_submit(s_job);
>>  file_size += submit->cmdbuf.size;
>>  n_obj++;
>>  }
>> -spin_unlock_irqrestore(&gpu->sched.job_list_lock, flags);
>>
>>  /* Add in the active buffer objects */
>>  list_for_each_entry(vram, &gpu->mmu->mappings, mmu_node) {
>> @@ -183,14 +181,12 @@ void etnaviv_core_dump(struct etnaviv_gpu *gpu)
>>gpu->buffer.size,
>>etn

Re: [PATCH v5 3/6] drm/scheduler: rework job destruction

2019-04-22 Thread Chunming Zhou
Hi Andrey,

static void drm_sched_process_job(struct dma_fence *f, struct 
dma_fence_cb *cb)
{
...
     spin_lock_irqsave(&sched->job_list_lock, flags);
     /* remove job from ring_mirror_list */
     list_del_init(&s_job->node);
     spin_unlock_irqrestore(&sched->job_list_lock, flags);
[David] How about just moving the above from the irq process to the worker?
Any problem? Maybe I missed your previous discussion, but I think removing
the lock for the list is a risk for future maintenance, even though you make
sure it is thread safe currently. (A rough sketch of this alternative follows
the snippet below.)

-David

...

     schedule_work(&s_job->finish_work);
}
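
For illustration, a rough sketch of the alternative suggested above, as I read
it (hypothetical helper name sched_job_finish_work, not code from the patch):
keep job_list_lock, but defer the list removal from the fence callback, which
runs in irq context, into the finish_work worker.

static void sched_job_finish_work(struct work_struct *work)
{
	struct drm_sched_job *s_job =
		container_of(work, struct drm_sched_job, finish_work);
	struct drm_gpu_scheduler *sched = s_job->sched;
	unsigned long flags;

	/*
	 * list_del_init() moved here from drm_sched_process_job(), so it no
	 * longer runs in irq context while the HW fence's lock is held.
	 */
	spin_lock_irqsave(&sched->job_list_lock, flags);
	list_del_init(&s_job->node);
	spin_unlock_irqrestore(&sched->job_list_lock, flags);

	sched->ops->free_job(s_job);
}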

On 2019/4/18 23:00, Andrey Grodzovsky wrote:
> From: Christian König 
>
> We now destroy finished jobs from the worker thread to make sure that
> we never destroy a job currently in timeout processing.
> By this we avoid holding the lock around the ring mirror list in
> drm_sched_stop, which should solve a deadlock reported by a user.
>
> v2: Remove unused variable.
> v4: Move guilty job free into sched code.
> v5:
> Move the sched->hw_rq_count increment to drm_sched_start to account for
> the counter decrement in drm_sched_stop even when we don't resubmit
> jobs because the guilty job already signaled.
>
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109692
>
> Signed-off-by: Christian König 
> Signed-off-by: Andrey Grodzovsky 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |   9 +-
>   drivers/gpu/drm/etnaviv/etnaviv_dump.c |   4 -
>   drivers/gpu/drm/etnaviv/etnaviv_sched.c|   2 +-
>   drivers/gpu/drm/lima/lima_sched.c  |   2 +-
>   drivers/gpu/drm/panfrost/panfrost_job.c|   2 +-
>   drivers/gpu/drm/scheduler/sched_main.c | 159 
> +
>   drivers/gpu/drm/v3d/v3d_sched.c|   2 +-
>   include/drm/gpu_scheduler.h|   6 +-
>   8 files changed, 102 insertions(+), 84 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 7cee269..a0e165c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3334,7 +3334,7 @@ static int amdgpu_device_pre_asic_reset(struct 
> amdgpu_device *adev,
>   if (!ring || !ring->sched.thread)
>   continue;
>   
> - drm_sched_stop(&ring->sched);
> + drm_sched_stop(&ring->sched, &job->base);
>   
>   /* after all hw jobs are reset, hw fence is meaningless, so 
> force_completion */
>   amdgpu_fence_driver_force_completion(ring);
> @@ -3343,8 +3343,6 @@ static int amdgpu_device_pre_asic_reset(struct 
> amdgpu_device *adev,
>   if(job)
>   drm_sched_increase_karma(&job->base);
>   
> -
> -
>   if (!amdgpu_sriov_vf(adev)) {
>   
>   if (!need_full_reset)
> @@ -3482,8 +3480,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info 
> *hive,
>   return r;
>   }
>   
> -static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev,
> -   struct amdgpu_job *job)
> +static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev)
>   {
>   int i;
>   
> @@ -3623,7 +3620,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device 
> *adev,
>   
>   /* Post ASIC reset for all devs .*/
>   list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
> - amdgpu_device_post_asic_reset(tmp_adev, tmp_adev == adev ? job 
> : NULL);
> + amdgpu_device_post_asic_reset(tmp_adev);
>   
>   if (r) {
>   /* bad news, how to tell it to userspace ? */
> diff --git a/drivers/gpu/drm/etnaviv/etnaviv_dump.c 
> b/drivers/gpu/drm/etnaviv/etnaviv_dump.c
> index 33854c9..5778d9c 100644
> --- a/drivers/gpu/drm/etnaviv/etnaviv_dump.c
> +++ b/drivers/gpu/drm/etnaviv/etnaviv_dump.c
> @@ -135,13 +135,11 @@ void etnaviv_core_dump(struct etnaviv_gpu *gpu)
>   mmu_size + gpu->buffer.size;
>   
>   /* Add in the active command buffers */
> - spin_lock_irqsave(&gpu->sched.job_list_lock, flags);
>   list_for_each_entry(s_job, &gpu->sched.ring_mirror_list, node) {
>   submit = to_etnaviv_submit(s_job);
>   file_size += submit->cmdbuf.size;
>   n_obj++;
>   }
> - spin_unlock_irqrestore(&gpu->sched.job_list_lock, flags);
>   
>   /* Add in the active buffer objects */
>   list_for_each_entry(vram, &gpu->mmu->mappings, mmu_node) {
> @@ -183,14 +181,12 @@ void etnaviv_core_dump(struct etnaviv_gpu *gpu)
> gpu->buffer.size,
> etnaviv_cmdbuf_get_va(&gpu->buffer));
>   
> - spin_lock_irqsave(&gpu->sched.job_list_lock, flags);
>   list_for_each_entry(s_job, &gpu->sched.ring_mirror_list, node) {
>   submit = to_etnaviv_submit(s_job);
>   etnaviv_core_dump_mem(&iter, ETDUMP_BUF_CMD,
> submit->cmdbuf.vaddr, submit->cmdbuf.size,
> 

[PATCH v5 3/6] drm/scheduler: rework job destruction

2019-04-18 Thread Andrey Grodzovsky
From: Christian König 

We now destroy finished jobs from the worker thread to make sure that
we never destroy a job currently in timeout processing.
By this we avoid holding the lock around the ring mirror list in
drm_sched_stop, which should solve a deadlock reported by a user.

v2: Remove unused variable.
v4: Move guilty job free into sched code.
v5:
Move the sched->hw_rq_count increment to drm_sched_start to account for
the counter decrement in drm_sched_stop even when we don't resubmit
jobs because the guilty job already signaled.
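
For illustration, roughly what this v5 note amounts to (a simplified sketch,
not the exact hunk): drm_sched_start now does the hw_rq_count re-increment
itself for every job still on the mirror list, so the counter stays balanced
even when drm_sched_resubmit_jobs is skipped because the guilty job already
signaled.

void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
{
	struct drm_sched_job *s_job, *tmp;

	/*
	 * No list lock needed here: the scheduler thread is parked, so
	 * nothing is inserted into or removed from ring_mirror_list
	 * concurrently.
	 */
	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
		/* Re-increment for every job drm_sched_stop() accounted for. */
		atomic_inc(&sched->hw_rq_count);

		if (!full_recovery)
			continue;

		/* ... re-arm the HW fence callback for this job ... */
	}

	/* ... restart the timeout and unpark the scheduler thread ... */
}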

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109692

Signed-off-by: Christian König 
Signed-off-by: Andrey Grodzovsky 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |   9 +-
 drivers/gpu/drm/etnaviv/etnaviv_dump.c |   4 -
 drivers/gpu/drm/etnaviv/etnaviv_sched.c|   2 +-
 drivers/gpu/drm/lima/lima_sched.c  |   2 +-
 drivers/gpu/drm/panfrost/panfrost_job.c|   2 +-
 drivers/gpu/drm/scheduler/sched_main.c | 159 +
 drivers/gpu/drm/v3d/v3d_sched.c|   2 +-
 include/drm/gpu_scheduler.h|   6 +-
 8 files changed, 102 insertions(+), 84 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 7cee269..a0e165c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3334,7 +3334,7 @@ static int amdgpu_device_pre_asic_reset(struct 
amdgpu_device *adev,
if (!ring || !ring->sched.thread)
continue;
 
-   drm_sched_stop(&ring->sched);
+   drm_sched_stop(&ring->sched, &job->base);
 
/* after all hw jobs are reset, hw fence is meaningless, so 
force_completion */
amdgpu_fence_driver_force_completion(ring);
@@ -3343,8 +3343,6 @@ static int amdgpu_device_pre_asic_reset(struct 
amdgpu_device *adev,
if(job)
drm_sched_increase_karma(&job->base);
 
-
-
if (!amdgpu_sriov_vf(adev)) {
 
if (!need_full_reset)
@@ -3482,8 +3480,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info 
*hive,
return r;
 }
 
-static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev,
- struct amdgpu_job *job)
+static void amdgpu_device_post_asic_reset(struct amdgpu_device *adev)
 {
int i;
 
@@ -3623,7 +3620,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 
/* Post ASIC reset for all devs .*/
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
-   amdgpu_device_post_asic_reset(tmp_adev, tmp_adev == adev ? job 
: NULL);
+   amdgpu_device_post_asic_reset(tmp_adev);
 
if (r) {
/* bad news, how to tell it to userspace ? */
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_dump.c 
b/drivers/gpu/drm/etnaviv/etnaviv_dump.c
index 33854c9..5778d9c 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_dump.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_dump.c
@@ -135,13 +135,11 @@ void etnaviv_core_dump(struct etnaviv_gpu *gpu)
mmu_size + gpu->buffer.size;
 
/* Add in the active command buffers */
-   spin_lock_irqsave(&gpu->sched.job_list_lock, flags);
list_for_each_entry(s_job, &gpu->sched.ring_mirror_list, node) {
submit = to_etnaviv_submit(s_job);
file_size += submit->cmdbuf.size;
n_obj++;
}
-   spin_unlock_irqrestore(&gpu->sched.job_list_lock, flags);
 
/* Add in the active buffer objects */
list_for_each_entry(vram, &gpu->mmu->mappings, mmu_node) {
@@ -183,14 +181,12 @@ void etnaviv_core_dump(struct etnaviv_gpu *gpu)
  gpu->buffer.size,
  etnaviv_cmdbuf_get_va(&gpu->buffer));
 
-   spin_lock_irqsave(&gpu->sched.job_list_lock, flags);
list_for_each_entry(s_job, &gpu->sched.ring_mirror_list, node) {
submit = to_etnaviv_submit(s_job);
etnaviv_core_dump_mem(&iter, ETDUMP_BUF_CMD,
  submit->cmdbuf.vaddr, submit->cmdbuf.size,
  etnaviv_cmdbuf_get_va(&submit->cmdbuf));
}
-   spin_unlock_irqrestore(&gpu->sched.job_list_lock, flags);
 
/* Reserve space for the bomap */
if (n_bomap_pages) {
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c 
b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
index 6d24fea..a813c82 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
@@ -109,7 +109,7 @@ static void etnaviv_sched_timedout_job(struct drm_sched_job 
*sched_job)
}
 
/* block scheduler */
-   drm_sched_stop(&gpu->sched);
+   drm_sched_stop(&gpu->sched, sched_job);
 
if(sched_job)
drm_sched_increase_karma(sched_job);
diff --git a/drivers/gpu/drm/lima/lima_sched.c 
b/drivers/gpu/drm/li