Am 10.10.2017 um 22:50 schrieb Andrey Grodzovsky:
Helps avoid a deadlock during GPU reset.
Added mutex to amdgpu_ctx to preserve order of fences on a ring.

v2:
Put the waiting logic in a separate function in amdgpu_ctx.c

Signed-off-by: Andrey Grodzovsky <[email protected]>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  4 ++++
  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  |  8 ++++++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 30 ++++++++++++++++++++++++------
  3 files changed, 34 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index da48f97..235eca5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -741,6 +741,7 @@ struct amdgpu_ctx {
        bool                    preamble_presented;
        enum amd_sched_priority init_priority;
        enum amd_sched_priority override_priority;
+       struct mutex            lock;
  };
struct amdgpu_ctx_mgr {
@@ -763,9 +764,12 @@ void amdgpu_ctx_priority_override(struct amdgpu_ctx *ctx,
  int amdgpu_ctx_ioctl(struct drm_device *dev, void *data,
                     struct drm_file *filp);
+int amdgpu_ctx_wait_prev_fence(struct amdgpu_ctx *ctx, unsigned ring_id);
+
  void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr *mgr);
  void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr);
+
  /*
   * file private structure
   */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 1a54e53..c36297c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -90,6 +90,8 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, 
void *data)
                goto free_chunk;
        }
+ mutex_lock(&p->ctx->lock);
+

There is a bug in amdgpu_cs_parser_init(), take a look at the put_ctx label. It calls amdgpu_ctx_put() without setting p->ctx to NULL after that.

This way amdgpu_cs_parser_fini() will call amdgpu_ctx_put() again and mess up the reference count.

That is not a bug introduced by this patch, so this one is Reviewed-by: Christian König <[email protected]>.

But please provide a follow up patch just removing the extra amdgpu_ctx_put() from amdgpu_cs_parser_init().

Thanks for the help,
Christian.

        /* get chunks */
        chunk_array_user = u64_to_user_ptr(cs->in.chunks);
        if (copy_from_user(chunk_array, chunk_array_user,
@@ -737,8 +739,10 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser 
*parser, int error,
dma_fence_put(parser->fence); - if (parser->ctx)
+       if (parser->ctx) {
+               mutex_unlock(&parser->ctx->lock);
                amdgpu_ctx_put(parser->ctx);
+       }
if (parser->bo_list)
                amdgpu_bo_list_put(parser->bo_list);
@@ -992,7 +996,7 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
            parser->job->ring->funcs->type == AMDGPU_RING_TYPE_VCE))
                return -EINVAL;
- return 0;
+       return amdgpu_ctx_wait_prev_fence(parser->ctx, parser->job->ring->idx);
  }
static int amdgpu_cs_process_fence_dep(struct amdgpu_cs_parser *p,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
index a78b03f6..4309820 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -67,6 +67,8 @@ static int amdgpu_ctx_init(struct amdgpu_device *adev,
        if (!ctx->fences)
                return -ENOMEM;
+ mutex_init(&ctx->lock);
+
        for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
                ctx->rings[i].sequence = 1;
                ctx->rings[i].fences = &ctx->fences[amdgpu_sched_jobs * i];
@@ -126,6 +128,8 @@ static void amdgpu_ctx_fini(struct amdgpu_ctx *ctx)
                                      &ctx->rings[i].entity);
amdgpu_queue_mgr_fini(adev, &ctx->queue_mgr);
+
+       mutex_destroy(&ctx->lock);
  }
static int amdgpu_ctx_alloc(struct amdgpu_device *adev,
@@ -296,12 +300,8 @@ int amdgpu_ctx_add_fence(struct amdgpu_ctx *ctx, struct 
amdgpu_ring *ring,
idx = seq & (amdgpu_sched_jobs - 1);
        other = cring->fences[idx];
-       if (other) {
-               signed long r;
-               r = dma_fence_wait_timeout(other, true, MAX_SCHEDULE_TIMEOUT);
-               if (r < 0)
-                       return r;
-       }
+       if (other)
+               BUG_ON(!dma_fence_is_signaled(other));
dma_fence_get(fence); @@ -372,6 +372,24 @@ void amdgpu_ctx_priority_override(struct amdgpu_ctx *ctx,
        }
  }
+int amdgpu_ctx_wait_prev_fence(struct amdgpu_ctx *ctx, unsigned ring_id)
+{
+       struct amdgpu_ctx_ring *cring = &ctx->rings[ring_id];
+       unsigned idx = cring->sequence & (amdgpu_sched_jobs - 1);
+       struct dma_fence *other = cring->fences[idx];
+
+       if (other) {
+               signed long r;
+               r = dma_fence_wait_timeout(other, false, MAX_SCHEDULE_TIMEOUT);
+               if (r < 0) {
+                       DRM_ERROR("Error (%ld) waiting for fence!\n", r);
+                       return r;
+               }
+       }
+
+       return 0;
+}
+
  void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr *mgr)
  {
        mutex_init(&mgr->lock);


_______________________________________________
amd-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to