Am 25.10.2017 um 11:22 schrieb Monk Liu:
jobs are skipped under two cases
1)when the entity behind this job marked guilty, the job
poped from this entity's queue will be dropped in sched_main loop.

2)in job_recovery(), skip the scheduling job if its karma detected
above limit, and also skipped as well for other jobs sharing the
same fence context. this approach is becuase job_recovery() cannot
access job->entity due to entity may already dead.

with this feature we can introduce new gpu recover feature.

Sorry for the delay, totally swamped once more at the moment.


Change-Id: I268b1c752c94e6ecd4ea78c87eb226ea3f52908a
Signed-off-by: Monk Liu <[email protected]>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c       |  9 +++---
  drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 46 +++++++++++++++++++--------
  drivers/gpu/drm/amd/scheduler/gpu_scheduler.h |  2 +-
  3 files changed, 39 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index a58e3c5..f08fde9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -177,7 +177,7 @@ static struct dma_fence *amdgpu_job_dependency(struct 
amd_sched_job *sched_job)
        return fence;
  }
-static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job)
+static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job, bool 
skip)
  {
        struct dma_fence *fence = NULL;
        struct amdgpu_device *adev;
@@ -194,10 +194,11 @@ static struct dma_fence *amdgpu_job_run(struct 
amd_sched_job *sched_job)
        BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL));
trace_amdgpu_sched_run_job(job);
-       /* skip ib schedule when vram is lost */
-       if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter)) {
+
+       if (skip || job->vram_lost_counter != 
atomic_read(&adev->vram_lost_counter)) {
+               /* skip ib schedule if looks needed, and set error */
                dma_fence_set_error(&job->base.s_fence->finished, -ECANCELED);
-               DRM_ERROR("Skip scheduling IBs!\n");
+               DRM_INFO("Skip scheduling IBs!\n");

Not really needed, we could just use amd_sched_job_fake_signal() here as well.

        } else {
                r = amdgpu_ib_schedule(job->ring, job->num_ibs, job->ibs, job,
                                       &fence);
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c 
b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
index 9cbeade..29841ba 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
@@ -330,6 +330,20 @@ static bool amd_sched_entity_add_dependency_cb(struct 
amd_sched_entity *entity)
        return false;
  }
+static void amd_sched_job_fake_signal(struct amd_sched_job *job)
+{
+       struct amd_sched_fence *s_fence = job->s_fence;
+
+       dma_fence_set_error(&job->s_fence->finished, -ECANCELED);
+       /* fake signaling the scheduled fence */
+       amd_sched_fence_scheduled(s_fence);
+       /* fake signaling the finished fence */
+       dma_fence_put(&job->s_fence->finished);
+       job->sched->ops->free_job(job);

Well signaling the finished fence will free the job as well, won't it?

+       /* call CB on the s_fence, e.g. wake up WAIT_CS */
+       amd_sched_fence_finished(s_fence);
+}
+
  static struct amd_sched_job *
  amd_sched_entity_pop_job(struct amd_sched_entity *entity)
  {
@@ -345,6 +359,12 @@ amd_sched_entity_pop_job(struct amd_sched_entity *entity)
                        return NULL;
sched_job->s_entity = NULL;
+
+       if (entity->guilty && atomic_read(entity->guilty)) {
+               amd_sched_job_fake_signal(sched_job);
+               sched_job = NULL;
+       }
+

This must come after the spsc_queue_pop() below, otherwise you mess up the spsc queue.

        spsc_queue_pop(&entity->job_queue);
        return sched_job;
  }
@@ -441,13 +461,6 @@ static void amd_sched_job_timedout(struct work_struct 
*work)
        job->sched->ops->timedout_job(job);
  }
-static void amd_sched_set_guilty(struct amd_sched_job *s_job)
-{
-       if (atomic_inc_return(&s_job->karma) > s_job->sched->hang_limit)
-               if (s_job->s_entity->guilty)
-                       atomic_set(s_job->s_entity->guilty, 1);
-}
-
  void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct 
amd_sched_job *bad)
  {
        struct amd_sched_job *s_job;
@@ -466,7 +479,7 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler 
*sched, struct amd_sched_jo
        }
        spin_unlock(&sched->job_list_lock);
- if (bad) {
+       if (bad && atomic_inc_return(&bad->karma) > bad->sched->hang_limit) {
                bool found = false;
for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_MAX; i++ ) {
@@ -476,7 +489,8 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler 
*sched, struct amd_sched_jo
                        list_for_each_entry_safe(entity, tmp, &rq->entities, 
list) {
                                if (bad->s_fence->scheduled.context == 
entity->fence_context) {
                                        found = true;
-                                       amd_sched_set_guilty(bad);
+                                       if (entity->guilty)
+                                               atomic_set(entity->guilty, 1);
                                        break;
                                }
                        }
@@ -510,9 +524,17 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler 
*sched)
        list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
                struct amd_sched_fence *s_fence = s_job->s_fence;
                struct dma_fence *fence;
+               bool found_guilty = false, skip;
+               uint64_t guilty_context;

Well if you want the variables to have an effect you should probably move them outside the loop. Otherwise the code doesn't make much sense to me.

Regards,
Christian.

+
+               if (!found_guilty && atomic_read(&s_job->karma) > 
sched->hang_limit) {
+                       found_guilty = true;
+                       guilty_context = s_job->s_fence->scheduled.context;
+               }
+ skip = (found_guilty && s_job->s_fence->scheduled.context == guilty_context);
                spin_unlock(&sched->job_list_lock);
-               fence = sched->ops->run_job(s_job);
+               fence = sched->ops->run_job(s_job, skip);
                atomic_inc(&sched->hw_rq_count);
                if (fence) {
                        s_fence->parent = dma_fence_get(fence);
@@ -525,7 +547,6 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
                                          r);
                        dma_fence_put(fence);
                } else {
-                       DRM_ERROR("Failed to run job!\n");
                        amd_sched_process_job(NULL, &s_fence->cb);
                }
                spin_lock(&sched->job_list_lock);
@@ -650,7 +671,7 @@ static int amd_sched_main(void *param)
                atomic_inc(&sched->hw_rq_count);
                amd_sched_job_begin(sched_job);
- fence = sched->ops->run_job(sched_job);
+               fence = sched->ops->run_job(sched_job, false);
                amd_sched_fence_scheduled(s_fence);
if (fence) {
@@ -664,7 +685,6 @@ static int amd_sched_main(void *param)
                                          r);
                        dma_fence_put(fence);
                } else {
-                       DRM_ERROR("Failed to run job!\n");
                        amd_sched_process_job(NULL, &s_fence->cb);
                }
diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
index f9e3a83..a8e5a7d 100644
--- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
+++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
@@ -126,7 +126,7 @@ static inline bool amd_sched_invalidate_job(struct 
amd_sched_job *s_job, int thr
  */
  struct amd_sched_backend_ops {
        struct dma_fence *(*dependency)(struct amd_sched_job *sched_job);
-       struct dma_fence *(*run_job)(struct amd_sched_job *sched_job);
+       struct dma_fence *(*run_job)(struct amd_sched_job *sched_job, bool 
skip);
        void (*timedout_job)(struct amd_sched_job *sched_job);
        void (*free_job)(struct amd_sched_job *sched_job);
  };


_______________________________________________
amd-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to