Ping....

Best Wishes,
Emily Deng




> -----Original Message-----
> From: Emily Deng [mailto:emily.d...@amd.com]
> Sent: Thursday, April 12, 2018 6:22 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Deng, Emily <emily.d...@amd.com>; Liu, Monk <monk....@amd.com>
> Subject: [PATCH] drm/gpu-sched: fix force APP kill hang(v3)
> 
> issue:
> a VMC page fault occurs if an app is force-killed during a 3dmark test; the
> cause is that in entity_fini we manually signal all jobs in the entity's
> queue, which confuses the sync/dep
> mechanism:
> 
> 1)a page fault occurred in sdma's clear job, which operates on the shadow
> buffer; the shadow buffer's GART table was cleaned by ttm_bo_release since
> the fence in its reservation was fake signaled by entity_fini() in the case
> where SIGKILL was
> received.
> 
> 2)a page fault occurred in a gfx job because, during the lifetime of the gfx
> job, we manually fake signal all jobs from its entity in entity_fini(); thus
> the unmapping/clear-PTE job that depends on those result fences is
> satisfied, and sdma starts clearing the PTE, leading to a GFX page fault.
> 
> fix:
> 1)we should at least wait for all already-scheduled jobs to complete in
> entity_fini() if SIGKILL is the case.
> 
> 2)if a fence is signaled and tries to clear some entity's dependency, we
> should mark that entity guilty to prevent its job from really running, since
> the dependency was fake signaled.
> 
> v2:
> split drm_sched_entity_fini() into two functions:
> 1)The first one does the waiting, removes the entity from the runqueue
> and returns an error when the process was killed.
> 2)The second one then goes over the entity, installs it as the completion
> signal for
> the remaining jobs and signals all jobs with an error code.
> 
> v3:
> 1)Replace fini1 and fini2 with better names. 2)Call the first part before
> the
> VM teardown in
> amdgpu_driver_postclose_kms() and the second part after the VM teardown.
> 3)Keep the original function drm_sched_entity_fini to refine the code.
> 
> Signed-off-by: Monk Liu <monk....@amd.com>
> Signed-off-by: Emily Deng <emily.d...@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h       |  2 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c   | 64
> ++++++++++++++++++++++----
>  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c   |  5 ++-
>  drivers/gpu/drm/scheduler/gpu_scheduler.c | 74
> ++++++++++++++++++++++++++-----
>  include/drm/gpu_scheduler.h               |  7 +++
>  5 files changed, 131 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 2babfad..200db73 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -681,6 +681,8 @@ int amdgpu_ctx_ioctl(struct drm_device *dev, void
> *data,  int amdgpu_ctx_wait_prev_fence(struct amdgpu_ctx *ctx, unsigned
> ring_id);
> 
>  void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr *mgr);
> +void amdgpu_ctx_mgr_entity_cleanup(struct amdgpu_ctx_mgr *mgr); void
> +amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr);
>  void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr);
> 
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> index 09d35051..659add4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> @@ -111,8 +111,9 @@ static int amdgpu_ctx_init(struct amdgpu_device
> *adev,
>       return r;
>  }
> 
> -static void amdgpu_ctx_fini(struct amdgpu_ctx *ctx)
> +static void amdgpu_ctx_fini(struct kref *ref)
>  {
> +     struct amdgpu_ctx *ctx = container_of(ref, struct amdgpu_ctx,
> +refcount);
>       struct amdgpu_device *adev = ctx->adev;
>       unsigned i, j;
> 
> @@ -125,13 +126,11 @@ static void amdgpu_ctx_fini(struct amdgpu_ctx
> *ctx)
>       kfree(ctx->fences);
>       ctx->fences = NULL;
> 
> -     for (i = 0; i < adev->num_rings; i++)
> -             drm_sched_entity_fini(&adev->rings[i]->sched,
> -                                   &ctx->rings[i].entity);
> -
>       amdgpu_queue_mgr_fini(adev, &ctx->queue_mgr);
> 
>       mutex_destroy(&ctx->lock);
> +
> +     kfree(ctx);
>  }
> 
>  static int amdgpu_ctx_alloc(struct amdgpu_device *adev, @@ -170,12
> +169,15 @@ static int amdgpu_ctx_alloc(struct amdgpu_device *adev,
> static void amdgpu_ctx_do_release(struct kref *ref)  {
>       struct amdgpu_ctx *ctx;
> +     u32 i;
> 
>       ctx = container_of(ref, struct amdgpu_ctx, refcount);
> 
> -     amdgpu_ctx_fini(ctx);
> +     for (i = 0; i < ctx->adev->num_rings; i++)
> +             drm_sched_entity_fini(&ctx->adev->rings[i]->sched,
> +                                                     &ctx->rings[i].entity);
> 
> -     kfree(ctx);
> +     amdgpu_ctx_fini(ref);
>  }
> 
>  static int amdgpu_ctx_free(struct amdgpu_fpriv *fpriv, uint32_t id) @@ -
> 435,16 +437,62 @@ void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr
> *mgr)
>       idr_init(&mgr->ctx_handles);
>  }
> 
> +void amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr) {
> +     struct amdgpu_ctx *ctx;
> +     struct idr *idp;
> +     uint32_t id, i;
> +
> +     idp = &mgr->ctx_handles;
> +
> +     idr_for_each_entry(idp, ctx, id) {
> +
> +             if (!ctx->adev)
> +                     return;
> +
> +             for (i = 0; i < ctx->adev->num_rings; i++)
> +                     if (kref_read(&ctx->refcount) == 1)
> +                             drm_sched_entity_do_release(&ctx->adev-
> >rings[i]->sched,
> +                                               &ctx->rings[i].entity);
> +                     else
> +                             DRM_ERROR("ctx %p is still alive\n", ctx);
> +     }
> +}
> +
> +void amdgpu_ctx_mgr_entity_cleanup(struct amdgpu_ctx_mgr *mgr) {
> +     struct amdgpu_ctx *ctx;
> +     struct idr *idp;
> +     uint32_t id, i;
> +
> +     idp = &mgr->ctx_handles;
> +
> +     idr_for_each_entry(idp, ctx, id) {
> +
> +             if (!ctx->adev)
> +                     return;
> +
> +             for (i = 0; i < ctx->adev->num_rings; i++)
> +                     if (kref_read(&ctx->refcount) == 1)
> +                             drm_sched_entity_cleanup(&ctx->adev-
> >rings[i]->sched,
> +                                     &ctx->rings[i].entity);
> +                     else
> +                             DRM_ERROR("ctx %p is still alive\n", ctx);
> +     }
> +}
> +
>  void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr)  {
>       struct amdgpu_ctx *ctx;
>       struct idr *idp;
>       uint32_t id;
> 
> +     amdgpu_ctx_mgr_entity_cleanup(mgr);
> +
>       idp = &mgr->ctx_handles;
> 
>       idr_for_each_entry(idp, ctx, id) {
> -             if (kref_put(&ctx->refcount, amdgpu_ctx_do_release) != 1)
> +             if (kref_put(&ctx->refcount, amdgpu_ctx_fini) != 1)
>                       DRM_ERROR("ctx %p is still alive\n", ctx);
>       }
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> index 487d39e..6cbb427 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> @@ -913,8 +913,7 @@ void amdgpu_driver_postclose_kms(struct
> drm_device *dev,
>               return;
> 
>       pm_runtime_get_sync(dev->dev);
> -
> -     amdgpu_ctx_mgr_fini(&fpriv->ctx_mgr);
> +     amdgpu_ctx_mgr_entity_fini(&fpriv->ctx_mgr);
> 
>       if (adev->asic_type != CHIP_RAVEN) {
>               amdgpu_uvd_free_handles(adev, file_priv); @@ -935,6
> +934,8 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev,
>       pd = amdgpu_bo_ref(fpriv->vm.root.base.bo);
> 
>       amdgpu_vm_fini(adev, &fpriv->vm);
> +     amdgpu_ctx_mgr_fini(&fpriv->ctx_mgr);
> +
>       if (pasid)
>               amdgpu_pasid_free_delayed(pd->tbo.resv, pasid);
>       amdgpu_bo_unref(&pd);
> diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c
> b/drivers/gpu/drm/scheduler/gpu_scheduler.c
> index 310275e..9062d44 100644
> --- a/drivers/gpu/drm/scheduler/gpu_scheduler.c
> +++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c
> @@ -136,6 +136,8 @@ int drm_sched_entity_init(struct drm_gpu_scheduler
> *sched,
>       entity->rq = rq;
>       entity->sched = sched;
>       entity->guilty = guilty;
> +     entity->fini_status = 0;
> +     entity->finished = NULL;
> 
>       spin_lock_init(&entity->rq_lock);
>       spin_lock_init(&entity->queue_lock);
> @@ -197,19 +199,30 @@ static bool drm_sched_entity_is_ready(struct
> drm_sched_entity *entity)
>       return true;
>  }
> 
> +static void drm_sched_entity_fini_job_cb(struct dma_fence *f,
> +                                 struct dma_fence_cb *cb)
> +{
> +     struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
> +                                              finish_cb);
> +     drm_sched_fence_finished(job->s_fence);
> +     WARN_ON(job->s_fence->parent);
> +     dma_fence_put(&job->s_fence->finished);
> +     job->sched->ops->free_job(job);
> +}
> +
> +
>  /**
>   * Destroy a context entity
>   *
>   * @sched       Pointer to scheduler instance
>   * @entity   The pointer to a valid scheduler entity
>   *
> - * Cleanup and free the allocated resources.
> + * Splitting drm_sched_entity_fini() into two functions, The first one
> + is does the waiting,
> + * removes the entity from the runqueue and returns an error when the
> process was killed.
>   */
> -void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
> +void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
>                          struct drm_sched_entity *entity)
>  {
> -     int r;
> -
>       if (!drm_sched_entity_is_initialized(sched, entity))
>               return;
>       /**
> @@ -217,13 +230,28 @@ void drm_sched_entity_fini(struct
> drm_gpu_scheduler *sched,
>        * queued IBs or discard them on SIGKILL
>       */
>       if ((current->flags & PF_SIGNALED) && current->exit_code == SIGKILL)
> -             r = -ERESTARTSYS;
> +             entity->fini_status = -ERESTARTSYS;
>       else
> -             r = wait_event_killable(sched->job_scheduled,
> +             entity->fini_status = wait_event_killable(sched-
> >job_scheduled,
>                                       drm_sched_entity_is_idle(entity));
>       drm_sched_entity_set_rq(entity, NULL);
> -     if (r) {
> +}
> +EXPORT_SYMBOL(drm_sched_entity_do_release);
> +
> +/**
> + * Destroy a context entity
> + *
> + * @sched       Pointer to scheduler instance
> + * @entity   The pointer to a valid scheduler entity
> + *
> + * The second one then goes over the entity and signals all jobs with an
> error code.
> + */
> +void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched,
> +                        struct drm_sched_entity *entity)
> +{
> +     if (entity->fini_status) {
>               struct drm_sched_job *job;
> +             int r;
> 
>               /* Park the kernel for a moment to make sure it isn't
> processing
>                * our enity.
> @@ -241,13 +269,28 @@ void drm_sched_entity_fini(struct
> drm_gpu_scheduler *sched,
>                       struct drm_sched_fence *s_fence = job->s_fence;
>                       drm_sched_fence_scheduled(s_fence);
>                       dma_fence_set_error(&s_fence->finished, -ESRCH);
> -                     drm_sched_fence_finished(s_fence);
> -                     WARN_ON(s_fence->parent);
> -                     dma_fence_put(&s_fence->finished);
> -                     sched->ops->free_job(job);
> +                     r = dma_fence_add_callback(entity->finished, &job-
> >finish_cb,
> +
>       drm_sched_entity_fini_job_cb);
> +                     if (r == -ENOENT)
> +                             drm_sched_entity_fini_job_cb(entity-
> >finished, &job->finish_cb);
> +                     else if (r)
> +                             DRM_ERROR("fence add callback failed
> (%d)\n", r);
> +             }
> +
> +             if (entity->finished) {
> +                     dma_fence_put(entity->finished);
> +                     entity->finished = NULL;
>               }
>       }
>  }
> +EXPORT_SYMBOL(drm_sched_entity_cleanup);
> +
> +void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
> +                             struct drm_sched_entity *entity)
> +{
> +     drm_sched_entity_do_release(sched, entity);
> +     drm_sched_entity_cleanup(sched, entity); }
>  EXPORT_SYMBOL(drm_sched_entity_fini);
> 
>  static void drm_sched_entity_wakeup(struct dma_fence *f, struct
> dma_fence_cb *cb) @@ -530,6 +573,11 @@ void
> drm_sched_job_recovery(struct drm_gpu_scheduler *sched)
>               spin_unlock(&sched->job_list_lock);
>               fence = sched->ops->run_job(s_job);
>               atomic_inc(&sched->hw_rq_count);
> +
> +             if (s_job->entity->finished)
> +                     dma_fence_put(s_job->entity->finished);
> +             s_job->entity->finished = dma_fence_get(&s_fence-
> >finished);
> +
>               if (fence) {
>                       s_fence->parent = dma_fence_get(fence);
>                       r = dma_fence_add_callback(fence, &s_fence->cb,
> @@ -556,6 +604,7 @@ int drm_sched_job_init(struct drm_sched_job *job,
>                      void *owner)
>  {
>       job->sched = sched;
> +     job->entity = entity;
>       job->s_priority = entity->rq - sched->sched_rq;
>       job->s_fence = drm_sched_fence_create(entity, owner);
>       if (!job->s_fence)
> @@ -668,6 +717,9 @@ static int drm_sched_main(void *param)
> 
>               fence = sched->ops->run_job(sched_job);
>               drm_sched_fence_scheduled(s_fence);
> +             if (entity->finished)
> +                     dma_fence_put(entity->finished);
> +             entity->finished = dma_fence_get(&s_fence->finished);
> 
>               if (fence) {
>                       s_fence->parent = dma_fence_get(fence); diff --git
> a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index
> c053a32..a0dd947 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -65,6 +65,8 @@ struct drm_sched_entity {
>       struct dma_fence                *dependency;
>       struct dma_fence_cb             cb;
>       atomic_t                        *guilty; /* points to ctx's guilty */
> +     uint32_t            fini_status;
> +     struct dma_fence    *finished;
>  };
> 
>  /**
> @@ -119,6 +121,7 @@ struct drm_sched_job {
>       uint64_t                        id;
>       atomic_t                        karma;
>       enum drm_sched_priority         s_priority;
> +     struct drm_sched_entity  *entity;
>  };
> 
>  static inline bool drm_sched_invalidate_job(struct drm_sched_job *s_job,
> @@ -186,6 +189,10 @@ int drm_sched_entity_init(struct
> drm_gpu_scheduler *sched,
>                         struct drm_sched_entity *entity,
>                         struct drm_sched_rq *rq,
>                         uint32_t jobs, atomic_t *guilty);
> +void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
> +                        struct drm_sched_entity *entity); void
> +drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched,
> +                        struct drm_sched_entity *entity);
>  void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
>                          struct drm_sched_entity *entity);  void
> drm_sched_entity_push_job(struct drm_sched_job *sched_job,
> --
> 2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to