Thanks Christian, I will refine the code as you suggested.

Best Wishes,
Emily Deng

> -----Original Message-----
> From: Christian König [mailto:ckoenig.leichtzumer...@gmail.com]
> Sent: Wednesday, April 04, 2018 6:02 PM
> To: Deng, Emily <emily.d...@amd.com>; amd-gfx@lists.freedesktop.org
> Cc: Liu, Monk <monk....@amd.com>
> Subject: Re: [PATCH] drm/gpu-sched: fix force APP kill hang(v2)
> 
> Am 04.04.2018 um 11:37 schrieb Emily Deng:
> > issue:
> > there are VMC page faults occurring if an APP is force-killed during a
> > 3dmark test; the cause is that in entity_fini we manually signal all
> > those jobs in the entity's queue, which confuses the sync/dep
> > mechanism:
> >
> > 1)a page fault occurred in sdma's clear job, which operates on the
> > shadow buffer; the shadow buffer's GART table is cleaned by
> > ttm_bo_release since the fence in its reservation was fake-signaled by
> > entity_fini() in the case where SIGKILL was received.
> >
> > 2)a page fault occurred in a gfx job because, during the lifetime of
> > the gfx job, we manually fake-signal all jobs from its entity in
> > entity_fini(); thus the unmapping/clear-PTE jobs depending on those
> > result fences are satisfied, and sdma starts clearing the PTEs,
> > leading to a GFX page fault.
> >
> > fix:
> > 1)we should at least wait for all already-scheduled jobs to complete
> > in entity_fini() if SIGKILL is the case.
> >
> > 2)if a fence is signaled and tries to clear some entity's dependency,
> > we should set this entity guilty to prevent its job from really
> > running, since the dependency was fake-signaled.
> >
> > v2:
> > splitting drm_sched_entity_fini() into two functions:
> > 1)The first one does the waiting, removes the entity from the
> > runqueue and returns an error when the process was killed.
> > 2)The second one then goes over the entity, install it as completion
> > signal for the remaining jobs and signals all jobs with an error code.
> 
> First of all this patch won't work like this. You need to call the first part
> before the VM teardown in amdgpu_driver_postclose_kms() and the second
> part after the VM teardown.
> 
> Then please come up with a better name than fini1 and fini2, something like
> drm_sched_entity_cleanup() or something like this.
> 
> Then I think it is harmless to call the _cleanup() function from the
> _fini() function and only change the places where it makes sense to
> explicitly call _cleanup().
> 
> E.g. add a new function amdgpu_ctx_mgr_cleanup() which calls the
> cleanup() for each still open ctx and then modify the order in
> amdgpu_driver_postclose_kms().
> 
> Regards,
> Christian.
> 
> >
> > Signed-off-by: Monk Liu <monk....@amd.com>
> > Signed-off-by: Emily Deng <emily.d...@amd.com>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c   | 13 +++++-
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c   |  4 +-
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c   |  3 +-
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c   |  3 +-
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c   |  8 +++-
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c    |  6 ++-
> >   drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c     |  3 +-
> >   drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c     |  3 +-
> >   drivers/gpu/drm/scheduler/gpu_scheduler.c | 67
> +++++++++++++++++++++++++------
> >   include/drm/gpu_scheduler.h               |  7 +++-
> >   10 files changed, 93 insertions(+), 24 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> > index 09d35051..69dadf9 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> > @@ -104,8 +104,13 @@ static int amdgpu_ctx_init(struct amdgpu_device
> > *adev,
> >
> >   failed:
> >     for (j = 0; j < i; j++)
> > -           drm_sched_entity_fini(&adev->rings[j]->sched,
> > +           drm_sched_entity_fini1(&adev->rings[j]->sched,
> >                                   &ctx->rings[j].entity);
> > +
> > +   for (j = 0; j < i; j++)
> > +           drm_sched_entity_fini2(&adev->rings[j]->sched,
> > +                                 &ctx->rings[j].entity);
> > +
> >     kfree(ctx->fences);
> >     ctx->fences = NULL;
> >     return r;
> > @@ -126,7 +131,11 @@ static void amdgpu_ctx_fini(struct amdgpu_ctx
> *ctx)
> >     ctx->fences = NULL;
> >
> >     for (i = 0; i < adev->num_rings; i++)
> > -           drm_sched_entity_fini(&adev->rings[i]->sched,
> > +           drm_sched_entity_fini1(&adev->rings[i]->sched,
> > +                                 &ctx->rings[i].entity);
> > +
> > +   for (i = 0; i < adev->num_rings; i++)
> > +           drm_sched_entity_fini2(&adev->rings[i]->sched,
> >                                   &ctx->rings[i].entity);
> >
> >     amdgpu_queue_mgr_fini(adev, &ctx->queue_mgr); diff --git
> > a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> > index d2ab404..83d46bb 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> > @@ -131,8 +131,10 @@ static int amdgpu_ttm_global_init(struct
> amdgpu_device *adev)
> >   static void amdgpu_ttm_global_fini(struct amdgpu_device *adev)
> >   {
> >     if (adev->mman.mem_global_referenced) {
> > -           drm_sched_entity_fini(adev->mman.entity.sched,
> > +           drm_sched_entity_fini1(adev->mman.entity.sched,
> >                                   &adev->mman.entity);
> > +           drm_sched_entity_fini2(adev->mman.entity.sched,
> > +                      &adev->mman.entity);
> >             mutex_destroy(&adev->mman.gtt_window_lock);
> >             drm_global_item_unref(&adev->mman.bo_global_ref.ref);
> >             drm_global_item_unref(&adev->mman.mem_global_ref);
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
> > index 627542b..74a6953 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
> > @@ -277,7 +277,8 @@ int amdgpu_uvd_sw_fini(struct amdgpu_device
> *adev)
> >     int i;
> >     kfree(adev->uvd.saved_bo);
> >
> > -   drm_sched_entity_fini(&adev->uvd.ring.sched, &adev->uvd.entity);
> > +   drm_sched_entity_fini1(&adev->uvd.ring.sched, &adev->uvd.entity);
> > +   drm_sched_entity_fini2(&adev->uvd.ring.sched, &adev->uvd.entity);
> >
> >     amdgpu_bo_free_kernel(&adev->uvd.vcpu_bo,
> >                           &adev->uvd.gpu_addr,
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
> > index a33804b..4166d26 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
> > @@ -212,7 +212,8 @@ int amdgpu_vce_sw_fini(struct amdgpu_device
> *adev)
> >     if (adev->vce.vcpu_bo == NULL)
> >             return 0;
> >
> > -   drm_sched_entity_fini(&adev->vce.ring[0].sched, &adev->vce.entity);
> > +   drm_sched_entity_fini1(&adev->vce.ring[0].sched, &adev-
> >vce.entity);
> > +   drm_sched_entity_fini2(&adev->vce.ring[0].sched, &adev-
> >vce.entity);
> >
> >     amdgpu_bo_free_kernel(&adev->vce.vcpu_bo, &adev-
> >vce.gpu_addr,
> >             (void **)&adev->vce.cpu_addr);
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> > index 58e4953..268c0c3 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> > @@ -129,9 +129,13 @@ int amdgpu_vcn_sw_fini(struct amdgpu_device
> > *adev)
> >
> >     kfree(adev->vcn.saved_bo);
> >
> > -   drm_sched_entity_fini(&adev->vcn.ring_dec.sched, &adev-
> >vcn.entity_dec);
> > +   drm_sched_entity_fini1(&adev->vcn.ring_dec.sched,
> > +&adev->vcn.entity_dec);
> >
> > -   drm_sched_entity_fini(&adev->vcn.ring_enc[0].sched, &adev-
> >vcn.entity_enc);
> > +   drm_sched_entity_fini1(&adev->vcn.ring_enc[0].sched,
> > +&adev->vcn.entity_enc);
> > +
> > +   drm_sched_entity_fini2(&adev->vcn.ring_dec.sched,
> > +&adev->vcn.entity_dec);
> > +
> > +   drm_sched_entity_fini2(&adev->vcn.ring_enc[0].sched,
> > +&adev->vcn.entity_enc);
> >
> >     amdgpu_bo_free_kernel(&adev->vcn.vcpu_bo,
> >                           &adev->vcn.gpu_addr,
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > index 2447429..084a39d 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > @@ -2456,7 +2456,8 @@ int amdgpu_vm_init(struct amdgpu_device
> *adev, struct amdgpu_vm *vm,
> >     vm->root.base.bo = NULL;
> >
> >   error_free_sched_entity:
> > -   drm_sched_entity_fini(&ring->sched, &vm->entity);
> > +   drm_sched_entity_fini1(&ring->sched, &vm->entity);
> > +   drm_sched_entity_fini2(&ring->sched, &vm->entity);
> >
> >     return r;
> >   }
> > @@ -2520,7 +2521,8 @@ void amdgpu_vm_fini(struct amdgpu_device
> *adev, struct amdgpu_vm *vm)
> >             spin_unlock_irqrestore(&adev->vm_manager.pasid_lock,
> flags);
> >     }
> >
> > -   drm_sched_entity_fini(vm->entity.sched, &vm->entity);
> > +   drm_sched_entity_fini1(vm->entity.sched, &vm->entity);
> > +   drm_sched_entity_fini2(vm->entity.sched, &vm->entity);
> >
> >     if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {
> >             dev_err(adev->dev, "still active bo inside vm\n"); diff --git
> > a/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c
> > b/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c
> > index f26f515..3c57a6e9 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c
> > @@ -469,7 +469,8 @@ static int uvd_v6_0_sw_fini(void *handle)
> >             return r;
> >
> >     if (uvd_v6_0_enc_support(adev)) {
> > -           drm_sched_entity_fini(&adev->uvd.ring_enc[0].sched,
> &adev->uvd.entity_enc);
> > +           drm_sched_entity_fini1(&adev->uvd.ring_enc[0].sched,
> &adev->uvd.entity_enc);
> > +           drm_sched_entity_fini2(&adev->uvd.ring_enc[0].sched,
> > +&adev->uvd.entity_enc);
> >
> >             for (i = 0; i < adev->uvd.num_enc_rings; ++i)
> >                     amdgpu_ring_fini(&adev->uvd.ring_enc[i]);
> > diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
> > b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
> > index 280c082..a422acf 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
> > @@ -472,7 +472,8 @@ static int uvd_v7_0_sw_fini(void *handle)
> >     if (r)
> >             return r;
> >
> > -   drm_sched_entity_fini(&adev->uvd.ring_enc[0].sched, &adev-
> >uvd.entity_enc);
> > +   drm_sched_entity_fini1(&adev->uvd.ring_enc[0].sched, &adev-
> >uvd.entity_enc);
> > +   drm_sched_entity_fini2(&adev->uvd.ring_enc[0].sched,
> > +&adev->uvd.entity_enc);
> >
> >     for (i = 0; i < adev->uvd.num_enc_rings; ++i)
> >             amdgpu_ring_fini(&adev->uvd.ring_enc[i]);
> > diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c
> > b/drivers/gpu/drm/scheduler/gpu_scheduler.c
> > index 310275e..abeb3ff 100644
> > --- a/drivers/gpu/drm/scheduler/gpu_scheduler.c
> > +++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c
> > @@ -136,6 +136,8 @@ int drm_sched_entity_init(struct
> drm_gpu_scheduler *sched,
> >     entity->rq = rq;
> >     entity->sched = sched;
> >     entity->guilty = guilty;
> > +   entity->fini_status = 0;
> > +   entity->finished = NULL;
> >
> >     spin_lock_init(&entity->rq_lock);
> >     spin_lock_init(&entity->queue_lock);
> > @@ -197,19 +199,29 @@ static bool drm_sched_entity_is_ready(struct
> drm_sched_entity *entity)
> >     return true;
> >   }
> >
> > +static void drm_sched_entity_fini_job_cb(struct dma_fence *f,
> > +                               struct dma_fence_cb *cb)
> > +{
> > +   struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
> > +                                            finish_cb);
> > +   drm_sched_fence_finished(job->s_fence);
> > +   WARN_ON(job->s_fence->parent);
> > +   dma_fence_put(&job->s_fence->finished);
> > +   job->sched->ops->free_job(job);
> > +}
> > +
> >   /**
> >    * Destroy a context entity
> >    *
> >    * @sched       Pointer to scheduler instance
> >    * @entity        The pointer to a valid scheduler entity
> >    *
> > - * Cleanup and free the allocated resources.
> > + * Splitting drm_sched_entity_fini() into two functions, The first
> > + one is does the waiting,
> > + * removes the entity from the runqueue and returns an error when the
> process was killed.
> >    */
> > -void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
> > +void drm_sched_entity_fini1(struct drm_gpu_scheduler *sched,
> >                        struct drm_sched_entity *entity)
> >   {
> > -   int r;
> > -
> >     if (!drm_sched_entity_is_initialized(sched, entity))
> >             return;
> >     /**
> > @@ -217,13 +229,27 @@ void drm_sched_entity_fini(struct
> drm_gpu_scheduler *sched,
> >      * queued IBs or discard them on SIGKILL
> >     */
> >     if ((current->flags & PF_SIGNALED) && current->exit_code == SIGKILL)
> > -           r = -ERESTARTSYS;
> > +           entity->fini_status = -ERESTARTSYS;
> >     else
> > -           r = wait_event_killable(sched->job_scheduled,
> > +           entity->fini_status = wait_event_killable(sched-
> >job_scheduled,
> >                                     drm_sched_entity_is_idle(entity));
> >     drm_sched_entity_set_rq(entity, NULL);
> > -   if (r) {
> > +}
> > +
> > +/**
> > + * Destroy a context entity
> > + *
> > + * @sched       Pointer to scheduler instance
> > + * @entity The pointer to a valid scheduler entity
> > + *
> > + * The second one then goes over the entity and signals all jobs with an
> error code.
> > + */
> > +void drm_sched_entity_fini2(struct drm_gpu_scheduler *sched,
> > +                      struct drm_sched_entity *entity) {
> > +   if (entity->fini_status) {
> >             struct drm_sched_job *job;
> > +           int r;
> >
> >             /* Park the kernel for a moment to make sure it isn't
> processing
> >              * our enity.
> > @@ -241,14 +267,21 @@ void drm_sched_entity_fini(struct
> drm_gpu_scheduler *sched,
> >                     struct drm_sched_fence *s_fence = job->s_fence;
> >                     drm_sched_fence_scheduled(s_fence);
> >                     dma_fence_set_error(&s_fence->finished, -ESRCH);
> > -                   drm_sched_fence_finished(s_fence);
> > -                   WARN_ON(s_fence->parent);
> > -                   dma_fence_put(&s_fence->finished);
> > -                   sched->ops->free_job(job);
> > +            r = dma_fence_add_callback(entity->finished, &job->finish_cb,
> > +                               drm_sched_entity_fini_job_cb);
> > +            if (r == -ENOENT)
> > +                drm_sched_entity_fini_job_cb(entity->finished, 
> > &job->finish_cb);
> > +            else if (r)
> > +                DRM_ERROR("fence add callback failed (%d)\n", r);
> > +        }
> > +        if(entity->finished){
> > +            dma_fence_put(entity->finished);
> > +               entity->finished = NULL;
> >             }
> >     }
> >   }
> > -EXPORT_SYMBOL(drm_sched_entity_fini);
> > +EXPORT_SYMBOL(drm_sched_entity_fini1);
> > +EXPORT_SYMBOL(drm_sched_entity_fini2);
> >
> >   static void drm_sched_entity_wakeup(struct dma_fence *f, struct
> dma_fence_cb *cb)
> >   {
> > @@ -530,6 +563,11 @@ void drm_sched_job_recovery(struct
> drm_gpu_scheduler *sched)
> >             spin_unlock(&sched->job_list_lock);
> >             fence = sched->ops->run_job(s_job);
> >             atomic_inc(&sched->hw_rq_count);
> > +
> > +        if(s_job->entity->finished)
> > +                   dma_fence_put(s_job->entity->finished);
> > +           s_job->entity->finished = dma_fence_get(&s_fence-
> >finished);
> > +
> >             if (fence) {
> >                     s_fence->parent = dma_fence_get(fence);
> >                     r = dma_fence_add_callback(fence, &s_fence->cb,
> @@ -556,6 +594,7
> > @@ int drm_sched_job_init(struct drm_sched_job *job,
> >                    void *owner)
> >   {
> >     job->sched = sched;
> > +   job->entity = entity;
> >     job->s_priority = entity->rq - sched->sched_rq;
> >     job->s_fence = drm_sched_fence_create(entity, owner);
> >     if (!job->s_fence)
> > @@ -669,6 +708,10 @@ static int drm_sched_main(void *param)
> >             fence = sched->ops->run_job(sched_job);
> >             drm_sched_fence_scheduled(s_fence);
> >
> > +           if(entity->finished)
> > +                   dma_fence_put(entity->finished);
> > +           entity->finished = dma_fence_get(&s_fence->finished);
> > +
> >             if (fence) {
> >                     s_fence->parent = dma_fence_get(fence);
> >                     r = dma_fence_add_callback(fence, &s_fence->cb,
> diff --git
> > a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index
> > dfd54fb..e0a5a53 100644
> > --- a/include/drm/gpu_scheduler.h
> > +++ b/include/drm/gpu_scheduler.h
> > @@ -63,6 +63,8 @@ struct drm_sched_entity {
> >     struct dma_fence                *dependency;
> >     struct dma_fence_cb             cb;
> >     atomic_t                        *guilty; /* points to ctx's guilty */
> > +   uint32_t            fini_status;
> > +   struct dma_fence    *finished;
> >   };
> >
> >   /**
> > @@ -99,6 +101,7 @@ struct drm_sched_job {
> >     uint64_t                        id;
> >     atomic_t                        karma;
> >     enum drm_sched_priority         s_priority;
> > +   struct drm_sched_entity  *entity;
> >   };
> >
> >   static inline bool drm_sched_invalidate_job(struct drm_sched_job
> > *s_job, @@ -148,7 +151,9 @@ int drm_sched_entity_init(struct
> drm_gpu_scheduler *sched,
> >                       struct drm_sched_entity *entity,
> >                       struct drm_sched_rq *rq,
> >                       uint32_t jobs, atomic_t *guilty); -void
> > drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
> > +void drm_sched_entity_fini1(struct drm_gpu_scheduler *sched,
> > +                      struct drm_sched_entity *entity); void
> > +drm_sched_entity_fini2(struct drm_gpu_scheduler *sched,
> >                        struct drm_sched_entity *entity);
> >   void drm_sched_entity_push_job(struct drm_sched_job *sched_job,
> >                            struct drm_sched_entity *entity);

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to