Re: [PATCH 1/2] drm/sched: Refactor ring mirror list handling.

2018-12-06 Thread Grodzovsky, Andrey


On 12/06/2018 01:33 PM, Christian König wrote:
> Am 06.12.18 um 18:41 schrieb Andrey Grodzovsky:
>> Decouple sched threads stop and start and ring mirror
>> list handling from the policy of what to do about the
>> guilty jobs.
>> When stopping the sched thread and detaching sched fences
>> from non signaled HW fences wait for all signaled HW fences
>> to complete before rerunning the jobs.
>>
>> Suggested-by: Christian Koenig 
>> Signed-off-by: Andrey Grodzovsky 
>
> Just briefly skimmed over this, but it looks exactly like what I had 
> in mind.
>
> Need to give that a more detailed thought tomorrow,
> Christian.

Please note I've already resent V2 after finding a refactoring error.

Andrey

>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++---
>>   drivers/gpu/drm/etnaviv/etnaviv_sched.c    |  8 +--
>>   drivers/gpu/drm/scheduler/sched_main.c | 86 
>> +++---
>>   drivers/gpu/drm/v3d/v3d_sched.c    | 11 ++--
>>   include/drm/gpu_scheduler.h    | 10 ++--
>>   5 files changed, 83 insertions(+), 49 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index ef36cc5..42111d5 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -3292,17 +3292,16 @@ static int 
>> amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
>>   /* block all schedulers and reset given job's ring */
>>   for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>   struct amdgpu_ring *ring = adev->rings[i];
>> +    bool park_only = job && job->base.sched != >sched;
>>     if (!ring || !ring->sched.thread)
>>   continue;
>>   -    kthread_park(ring->sched.thread);
>> +    drm_sched_stop(>sched, job ? >base : NULL, 
>> park_only);
>>   -    if (job && job->base.sched != >sched)
>> +    if (park_only)
>>   continue;
>>   -    drm_sched_hw_job_reset(>sched, job ? >base : 
>> NULL);
>> -
>>   /* after all hw jobs are reset, hw fence is meaningless, so 
>> force_completion */
>>   amdgpu_fence_driver_force_completion(ring);
>>   }
>> @@ -3445,6 +3444,7 @@ static void 
>> amdgpu_device_post_asic_reset(struct amdgpu_device *adev,
>>     struct amdgpu_job *job)
>>   {
>>   int i;
>> +    bool unpark_only;
>>     for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
>>   struct amdgpu_ring *ring = adev->rings[i];
>> @@ -3456,10 +3456,13 @@ static void 
>> amdgpu_device_post_asic_reset(struct amdgpu_device *adev,
>>    * or all rings (in the case @job is NULL)
>>    * after above amdgpu_reset accomplished
>>    */
>> -    if ((!job || job->base.sched == >sched) && 
>> !adev->asic_reset_res)
>> -    drm_sched_job_recovery(>sched);
>> +    unpark_only = (job && job->base.sched != >sched) ||
>> +   adev->asic_reset_res;
>> +
>> +    if (!unpark_only)
>> +    drm_sched_resubmit_jobs(>sched);
>>   -    kthread_unpark(ring->sched.thread);
>> +    drm_sched_start(>sched, unpark_only);
>>   }
>>     if (!amdgpu_device_has_dc_support(adev)) {
>> diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c 
>> b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
>> index 49a6763..fab3b51 100644
>> --- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
>> +++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
>> @@ -109,16 +109,16 @@ static void etnaviv_sched_timedout_job(struct 
>> drm_sched_job *sched_job)
>>   }
>>     /* block scheduler */
>> -    kthread_park(gpu->sched.thread);
>> -    drm_sched_hw_job_reset(>sched, sched_job);
>> +    drm_sched_stop(>sched, sched_job, false);
>>     /* get the GPU back into the init state */
>>   etnaviv_core_dump(gpu);
>>   etnaviv_gpu_recover_hang(gpu);
>>   +    drm_sched_resubmit_jobs(>sched);
>> +
>>   /* restart scheduler after GPU is usable again */
>> -    drm_sched_job_recovery(>sched);
>> -    kthread_unpark(gpu->sched.thread);
>> +    drm_sched_start(>sched);
>>   }
>>     static void etnaviv_sched_free_job(struct drm_sched_job *sched_job)
>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
>> b/drivers/gpu/drm/scheduler/sched_main.c
>> index dbb6906..8fb7f86 100644
>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>> @@ -60,8 +60,6 @@
>>     static void drm_sched_process_job(struct dma_fence *f, struct 
>> dma_fence_cb *cb);
>>   -static void drm_sched_expel_job_unlocked(struct drm_sched_job 
>> *s_job);
>> -
>>   /**
>>    * drm_sched_rq_init - initialize a given run queue struct
>>    *
>> @@ -342,13 +340,21 @@ static void drm_sched_job_timedout(struct 
>> work_struct *work)
>>    * @bad: bad scheduler job
>>    *
>>    */
>> -void drm_sched_hw_job_reset(struct drm_gpu_scheduler *sched, struct 
>> drm_sched_job *bad)
>> +void drm_sched_stop(struct drm_gpu_scheduler *sched, struct 
>> 

Re: [PATCH 1/2] drm/sched: Refactor ring mirror list handling.

2018-12-06 Thread Christian König

Am 06.12.18 um 18:41 schrieb Andrey Grodzovsky:

Decouple sched threads stop and start and ring mirror
list handling from the policy of what to do about the
guilty jobs.
When stopping the sched thread and detaching sched fences
from non signaled HW fences wait for all signaled HW fences
to complete before rerunning the jobs.

Suggested-by: Christian Koenig 
Signed-off-by: Andrey Grodzovsky 


Just briefly skimmed over this, but it looks exactly like what I had in 
mind.


Need to give that a more detailed thought tomorrow,
Christian.


---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++---
  drivers/gpu/drm/etnaviv/etnaviv_sched.c|  8 +--
  drivers/gpu/drm/scheduler/sched_main.c | 86 +++---
  drivers/gpu/drm/v3d/v3d_sched.c| 11 ++--
  include/drm/gpu_scheduler.h| 10 ++--
  5 files changed, 83 insertions(+), 49 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index ef36cc5..42111d5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3292,17 +3292,16 @@ static int amdgpu_device_pre_asic_reset(struct 
amdgpu_device *adev,
/* block all schedulers and reset given job's ring */
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = adev->rings[i];
+   bool park_only = job && job->base.sched != >sched;
  
  		if (!ring || !ring->sched.thread)

continue;
  
-		kthread_park(ring->sched.thread);

+   drm_sched_stop(>sched, job ? >base : NULL, 
park_only);
  
-		if (job && job->base.sched != >sched)

+   if (park_only)
continue;
  
-		drm_sched_hw_job_reset(>sched, job ? >base : NULL);

-
/* after all hw jobs are reset, hw fence is meaningless, so 
force_completion */
amdgpu_fence_driver_force_completion(ring);
}
@@ -3445,6 +3444,7 @@ static void amdgpu_device_post_asic_reset(struct 
amdgpu_device *adev,
  struct amdgpu_job *job)
  {
int i;
+   bool unpark_only;
  
  	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {

struct amdgpu_ring *ring = adev->rings[i];
@@ -3456,10 +3456,13 @@ static void amdgpu_device_post_asic_reset(struct 
amdgpu_device *adev,
 * or all rings (in the case @job is NULL)
 * after above amdgpu_reset accomplished
 */
-   if ((!job || job->base.sched == >sched) && 
!adev->asic_reset_res)
-   drm_sched_job_recovery(>sched);
+   unpark_only = (job && job->base.sched != >sched) ||
+  adev->asic_reset_res;
+
+   if (!unpark_only)
+   drm_sched_resubmit_jobs(>sched);
  
-		kthread_unpark(ring->sched.thread);

+   drm_sched_start(>sched, unpark_only);
}
  
  	if (!amdgpu_device_has_dc_support(adev)) {

diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c 
b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
index 49a6763..fab3b51 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
@@ -109,16 +109,16 @@ static void etnaviv_sched_timedout_job(struct 
drm_sched_job *sched_job)
}
  
  	/* block scheduler */

-   kthread_park(gpu->sched.thread);
-   drm_sched_hw_job_reset(>sched, sched_job);
+   drm_sched_stop(>sched, sched_job, false);
  
  	/* get the GPU back into the init state */

etnaviv_core_dump(gpu);
etnaviv_gpu_recover_hang(gpu);
  
+	drm_sched_resubmit_jobs(>sched);

+
/* restart scheduler after GPU is usable again */
-   drm_sched_job_recovery(>sched);
-   kthread_unpark(gpu->sched.thread);
+   drm_sched_start(>sched);
  }
  
  static void etnaviv_sched_free_job(struct drm_sched_job *sched_job)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
b/drivers/gpu/drm/scheduler/sched_main.c
index dbb6906..8fb7f86 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -60,8 +60,6 @@
  
  static void drm_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb);
  
-static void drm_sched_expel_job_unlocked(struct drm_sched_job *s_job);

-
  /**
   * drm_sched_rq_init - initialize a given run queue struct
   *
@@ -342,13 +340,21 @@ static void drm_sched_job_timedout(struct work_struct 
*work)
   * @bad: bad scheduler job
   *
   */
-void drm_sched_hw_job_reset(struct drm_gpu_scheduler *sched, struct 
drm_sched_job *bad)
+void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad,
+   bool park_only)
  {
struct drm_sched_job *s_job;
struct drm_sched_entity *entity, *tmp;
unsigned long flags;
+   struct list_head wait_list;
int i;
  
+	kthread_park(sched->thread);

+   if (park_only)
+   return;

[PATCH 1/2] drm/sched: Refactor ring mirror list handling.

2018-12-06 Thread Andrey Grodzovsky
Decouple sched threads stop and start and ring mirror
list handling from the policy of what to do about the
guilty jobs.
When stopping the sched thread and detaching sched fences
from non signaled HW fences wait for all signaled HW fences
to complete before rerunning the jobs.

Suggested-by: Christian Koenig 
Signed-off-by: Andrey Grodzovsky 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 +++---
 drivers/gpu/drm/etnaviv/etnaviv_sched.c|  8 +--
 drivers/gpu/drm/scheduler/sched_main.c | 86 +++---
 drivers/gpu/drm/v3d/v3d_sched.c| 11 ++--
 include/drm/gpu_scheduler.h| 10 ++--
 5 files changed, 83 insertions(+), 49 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index ef36cc5..42111d5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3292,17 +3292,16 @@ static int amdgpu_device_pre_asic_reset(struct 
amdgpu_device *adev,
/* block all schedulers and reset given job's ring */
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = adev->rings[i];
+   bool park_only = job && job->base.sched != &ring->sched;
 
if (!ring || !ring->sched.thread)
continue;
 
-   kthread_park(ring->sched.thread);
+   drm_sched_stop(&ring->sched, job ? &job->base : NULL, 
park_only);
 
-   if (job && job->base.sched != &ring->sched)
+   if (park_only)
continue;
 
-   drm_sched_hw_job_reset(&ring->sched, job ? &job->base : NULL);
-
/* after all hw jobs are reset, hw fence is meaningless, so 
force_completion */
amdgpu_fence_driver_force_completion(ring);
}
@@ -3445,6 +3444,7 @@ static void amdgpu_device_post_asic_reset(struct 
amdgpu_device *adev,
  struct amdgpu_job *job)
 {
int i;
+   bool unpark_only;
 
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = adev->rings[i];
@@ -3456,10 +3456,13 @@ static void amdgpu_device_post_asic_reset(struct 
amdgpu_device *adev,
 * or all rings (in the case @job is NULL)
 * after above amdgpu_reset accomplished
 */
-   if ((!job || job->base.sched == &ring->sched) && 
!adev->asic_reset_res)
-   drm_sched_job_recovery(&ring->sched);
+   unpark_only = (job && job->base.sched != &ring->sched) ||
+  adev->asic_reset_res;
+
+   if (!unpark_only)
+   drm_sched_resubmit_jobs(&ring->sched);
 
-   kthread_unpark(ring->sched.thread);
+   drm_sched_start(&ring->sched, unpark_only);
}
 
if (!amdgpu_device_has_dc_support(adev)) {
diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c 
b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
index 49a6763..fab3b51 100644
--- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c
+++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c
@@ -109,16 +109,16 @@ static void etnaviv_sched_timedout_job(struct 
drm_sched_job *sched_job)
}
 
/* block scheduler */
-   kthread_park(gpu->sched.thread);
-   drm_sched_hw_job_reset(&gpu->sched, sched_job);
+   drm_sched_stop(&gpu->sched, sched_job, false);
 
/* get the GPU back into the init state */
etnaviv_core_dump(gpu);
etnaviv_gpu_recover_hang(gpu);
 
+   drm_sched_resubmit_jobs(&gpu->sched);
+
/* restart scheduler after GPU is usable again */
-   drm_sched_job_recovery(&gpu->sched);
-   kthread_unpark(gpu->sched.thread);
+   drm_sched_start(&gpu->sched);
 }
 
 static void etnaviv_sched_free_job(struct drm_sched_job *sched_job)
diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
b/drivers/gpu/drm/scheduler/sched_main.c
index dbb6906..8fb7f86 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -60,8 +60,6 @@
 
 static void drm_sched_process_job(struct dma_fence *f, struct dma_fence_cb 
*cb);
 
-static void drm_sched_expel_job_unlocked(struct drm_sched_job *s_job);
-
 /**
  * drm_sched_rq_init - initialize a given run queue struct
  *
@@ -342,13 +340,21 @@ static void drm_sched_job_timedout(struct work_struct 
*work)
  * @bad: bad scheduler job
  *
  */
-void drm_sched_hw_job_reset(struct drm_gpu_scheduler *sched, struct 
drm_sched_job *bad)
+void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad,
+   bool park_only)
 {
struct drm_sched_job *s_job;
struct drm_sched_entity *entity, *tmp;
unsigned long flags;
+   struct list_head wait_list;
int i;
 
+   kthread_park(sched->thread);
+   if (park_only)
+   return;
+
+   INIT_LIST_HEAD(&wait_list);
+
spin_lock_irqsave(&sched->job_list_lock, flags);
list_for_each_entry_reverse(s_job,