amdgpu:resolv deadlock between reset and cs_ioctl v4.

Christian König Tue, 10 Oct 2017 00:05:46 -0700

Am 10.10.2017 um 05:31 schrieb andrey:



On 2017-10-09 04:34 AM, Christian König wrote:

Am 06.10.2017 um 20:20 schrieb Andrey Grodzovsky:

From: Monk Liu <[email protected]>

Need to unreserve the ttm bo before "cs_add_fence" and "entity_push_job",
otherwise there will be a deadlock between "recover_vram_from_shadow"
and the previous two routines on the ttm bo's resv lock.

v2:
Add per ctx mutex.

v3:
Relocate mutex acquisition into amdgpu_cs_parser_init and mutex release
into amdgpu_cs_parser_fini to avoid nested locking lockup.
Add rollback code for amdgpu_ctx_add_fence in case of error or signal
interruption.

v4:
Refactor amdgpu_cs_ib_vm_chunk and amdgpu_cs_ib_fill to enable
old fence waiting before the reservation lock is acquired.

Change-Id: Ia209beab5036bfc2c38cbf18324fa3efd4bab1cf
Signed-off-by: Monk Liu <[email protected]>
Signed-off-by: Andrey Grodzovsky <[email protected]>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h     |   1 +

  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 164 ++++++++++++++++++--------------

  drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c |   4 +
  3 files changed, 100 insertions(+), 69 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

index 53d8df3..baa2953 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -724,6 +724,7 @@ struct amdgpu_ctx {
      struct dma_fence    **fences;
      struct amdgpu_ctx_ring    rings[AMDGPU_MAX_RINGS];
      bool preamble_presented;
+    struct mutex        lock;
  };
    struct amdgpu_ctx_mgr {

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c

index 9f1202a..0fa1bc7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c

@@ -89,6 +89,9 @@ static int amdgpu_cs_parser_init(structamdgpu_cs_parser *p, void *data)

          goto free_chunk;
      }
  +
+    mutex_lock(&p->ctx->lock);
+
      /* get chunks */
      chunk_array_user = u64_to_user_ptr(cs->in.chunks);
      if (copy_from_user(chunk_array, chunk_array_user,

@@ -715,28 +718,21 @@ static int amdgpu_cs_sync_rings(structamdgpu_cs_parser *p)

  /**
   * cs_parser_fini() - clean parser states
   * @parser:    parser structure holding parsing context.
- * @error:    error number
- *
- * If error is set than unvalidate buffer, otherwise just free memory
- * used by parsing context.
   **/

-static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser,int error,

-                  bool backoff)
+static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser)

You can now drop this change and keep the logic as it is, since we moved the wait before taking the reservation locks.

By "this change" here and in the comment later below, I guess you mean the original change by Monk about moving ttm_eu_fence_buffer_objects to before amd_sched_entity_push_job?


Yes, correct.

In that case I am not sure how we can drop it, since the wait we moved in the second patch was the one for the old fence, while Monk's change was about the wait happening when the kfifo is full. Won't reverting his change just put us back in the original deadlock of blocking on a full kfifo while holding the reservation lock during GPU reset with VRAM recovery?

Yeah, but we need to fix this kfifo issue in another way anyway, because we push to the kfifo during BO migration and page table updates as well, and that can only happen while the BOs are reserved.

In general, please concentrate on one change at a time. In other words, this first patch should just move filling the IBs before taking the BO reservation lock.

To do this you just need to move the calls to amdgpu_cs_find_mapping(), amdgpu_bo_kmap() and amdgpu_bo_kunmap() out of amdgpu_cs_ib_fill(), but please try to keep the amdgpu_ib_get() at the original place.


Regards,
Christian.


Thanks,
Andrey

  {
      unsigned i;
  -    if (error && backoff)
-        ttm_eu_backoff_reservation(&parser->ticket,
-                       &parser->validated);
-
      for (i = 0; i < parser->num_post_dep_syncobjs; i++)
          drm_syncobj_put(parser->post_dep_syncobjs[i]);
      kfree(parser->post_dep_syncobjs);
        dma_fence_put(parser->fence);
  -    if (parser->ctx)
+    if (parser->ctx) {
+        mutex_unlock(&parser->ctx->lock);
          amdgpu_ctx_put(parser->ctx);
+    }
      if (parser->bo_list)
          amdgpu_bo_list_put(parser->bo_list);

@@ -843,7 +839,72 @@ static int amdgpu_cs_ib_vm_chunk(structamdgpu_device *adev,

      struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
      struct amdgpu_vm *vm = &fpriv->vm;
      struct amdgpu_ring *ring = p->job->ring;
-    int i, r;
+    int i, j, r;
+
+    for (i = 0, j = 0; i < p->nchunks && j < p->job->num_ibs; i++) {
+
+        struct amdgpu_cs_chunk *chunk;
+        struct amdgpu_ib *ib;
+        struct drm_amdgpu_cs_chunk_ib *chunk_ib;
+
+        chunk = &p->chunks[i];
+        ib = &p->job->ibs[j];
+        chunk_ib = (struct drm_amdgpu_cs_chunk_ib *)chunk->kdata;
+
+        if (chunk->chunk_id != AMDGPU_CHUNK_ID_IB)
+                    continue;
+
+        if (p->job->ring->funcs->parse_cs) {
+            struct amdgpu_bo_va_mapping *m;
+            struct amdgpu_bo *aobj = NULL;
+            uint64_t offset;
+            uint8_t *kptr;
+
+            r = amdgpu_cs_find_mapping(p, chunk_ib->va_start,
+                           &aobj, &m);
+            if (r) {
+                DRM_ERROR("IB va_start is invalid\n");
+                return r;
+            }
+
+            if ((chunk_ib->va_start + chunk_ib->ib_bytes) >
+                (m->last + 1) * AMDGPU_GPU_PAGE_SIZE) {
+                DRM_ERROR("IB va_start+ib_bytes is invalid\n");
+                return -EINVAL;
+            }
+
+            /* the IB should be reserved at this point */
+            r = amdgpu_bo_kmap(aobj, (void **)&kptr);
+            if (r) {
+                return r;
+            }
+
+            offset = m->start * AMDGPU_GPU_PAGE_SIZE;
+            kptr += chunk_ib->va_start - offset;
+
+            r =  amdgpu_ib_get(adev, vm, chunk_ib->ib_bytes, ib);
+            if (r) {
+                DRM_ERROR("Failed to get ib !\n");
+                return r;
+            }
+
+            memcpy(ib->ptr, kptr, chunk_ib->ib_bytes);
+            amdgpu_bo_kunmap(aobj);
+        } else {
+            r =  amdgpu_ib_get(adev, vm, 0, ib);
+            if (r) {
+                DRM_ERROR("Failed to get ib !\n");
+                return r;
+            }
+
+        }
+
+        ib->gpu_addr = chunk_ib->va_start;
+        ib->length_dw = chunk_ib->ib_bytes / 4;
+        ib->flags = chunk_ib->flags;


Please keep the calls to amdgpu_ib_get() inside amdgpu_cs_ib_fill().

+        j++;
+
+    }
        /* Only for UVD/VCE VM emulation */
      if (ring->funcs->parse_cs) {

@@ -868,19 +929,15 @@ static int amdgpu_cs_ib_vm_chunk(structamdgpu_device *adev,

  static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
                   struct amdgpu_cs_parser *parser)
  {
-    struct amdgpu_fpriv *fpriv = parser->filp->driver_priv;
-    struct amdgpu_vm *vm = &fpriv->vm;
      int i, j;
      int r, ce_preempt = 0, de_preempt = 0;

for (i = 0, j = 0; i < parser->nchunks && j <parser->job->num_ibs; i++) {

          struct amdgpu_cs_chunk *chunk;
-        struct amdgpu_ib *ib;
          struct drm_amdgpu_cs_chunk_ib *chunk_ib;
          struct amdgpu_ring *ring;
            chunk = &parser->chunks[i];
-        ib = &parser->job->ibs[j];
          chunk_ib = (struct drm_amdgpu_cs_chunk_ib *)chunk->kdata;
            if (chunk->chunk_id != AMDGPU_CHUNK_ID_IB)

@@ -917,54 +974,6 @@ static int amdgpu_cs_ib_fill(structamdgpu_device *adev,

            parser->job->ring = ring;
  -        if (ring->funcs->parse_cs) {
-            struct amdgpu_bo_va_mapping *m;
-            struct amdgpu_bo *aobj = NULL;
-            uint64_t offset;
-            uint8_t *kptr;
-
-            r = amdgpu_cs_find_mapping(parser, chunk_ib->va_start,
-                           &aobj, &m);
-            if (r) {
-                DRM_ERROR("IB va_start is invalid\n");
-                return r;
-            }
-
-            if ((chunk_ib->va_start + chunk_ib->ib_bytes) >
-                (m->last + 1) * AMDGPU_GPU_PAGE_SIZE) {
-                DRM_ERROR("IB va_start+ib_bytes is invalid\n");
-                return -EINVAL;
-            }
-
-            /* the IB should be reserved at this point */
-            r = amdgpu_bo_kmap(aobj, (void **)&kptr);
-            if (r) {
-                return r;
-            }
-
-            offset = m->start * AMDGPU_GPU_PAGE_SIZE;
-            kptr += chunk_ib->va_start - offset;
-
-            r =  amdgpu_ib_get(adev, vm, chunk_ib->ib_bytes, ib);
-            if (r) {
-                DRM_ERROR("Failed to get ib !\n");
-                return r;
-            }
-
-            memcpy(ib->ptr, kptr, chunk_ib->ib_bytes);
-            amdgpu_bo_kunmap(aobj);
-        } else {
-            r =  amdgpu_ib_get(adev, vm, 0, ib);
-            if (r) {
-                DRM_ERROR("Failed to get ib !\n");
-                return r;
-            }
-
-        }
-
-        ib->gpu_addr = chunk_ib->va_start;
-        ib->length_dw = chunk_ib->ib_bytes / 4;
-        ib->flags = chunk_ib->flags;
          j++;
      }

@@ -1160,14 +1169,26 @@ static int amdgpu_cs_submit(structamdgpu_cs_parser *p,

        amdgpu_cs_post_dependencies(p);
  +
+    /* hook sched fence to all BOs' reservation in validated list
+     * and unreserve them.
+     *
+     * we unreserve at here is because otherwise

+     * there'll be deadlock between ctx_add_fence/sched_entity_push_job
+     * and gpu_reset routine's recover_bo_from_shadow on PD/PTEs' ttm bo lock

+     */
+    ttm_eu_fence_buffer_objects(&p->ticket, &p->validated, p->fence);
+
+
      cs->out.handle = amdgpu_ctx_add_fence(p->ctx, ring, p->fence);
+
+

Please drop that change. It isn't necessary any more after you moved the waiting outside of the BO lock.


Ditto for most of the other changes as well.

Regards,
Christian.

      job->uf_sequence = cs->out.handle;
      amdgpu_job_free_resources(job);
        trace_amdgpu_cs_ioctl(job);
      amd_sched_entity_push_job(&job->base);

- ttm_eu_fence_buffer_objects(&p->ticket, &p->validated,p->fence);

      amdgpu_mn_unlock(p->mn);
        return 0;

@@ -1189,6 +1210,7 @@ int amdgpu_cs_ioctl(struct drm_device *dev,void *data, struct drm_file *filp)

        parser.adev = adev;
      parser.filp = filp;
+    fpriv = filp->driver_priv;
        r = amdgpu_cs_parser_init(&parser, data);
      if (r) {

@@ -1196,6 +1218,10 @@ int amdgpu_cs_ioctl(struct drm_device *dev,void *data, struct drm_file *filp)

          goto out;
      }
  +    r = amdgpu_cs_ib_fill(adev, &parser);
+    if (r)
+        goto out;
+
      r = amdgpu_cs_parser_bos(&parser, data);
      if (r) {
          if (r == -ENOMEM)

@@ -1206,9 +1232,6 @@ int amdgpu_cs_ioctl(struct drm_device *dev,void *data, struct drm_file *filp)

      }
        reserved_buffers = true;
-    r = amdgpu_cs_ib_fill(adev, &parser);
-    if (r)
-        goto out;
        r = amdgpu_cs_dependencies(adev, &parser);
      if (r) {

@@ -1226,7 +1249,10 @@ int amdgpu_cs_ioctl(struct drm_device *dev,void *data, struct drm_file *filp)

      r = amdgpu_cs_submit(&parser, cs);
    out:
-    amdgpu_cs_parser_fini(&parser, r, reserved_buffers);
+    if (r && reserved_buffers)
+        ttm_eu_backoff_reservation(&parser.ticket, &parser.validated);
+
+    amdgpu_cs_parser_fini(&parser);
      return r;
  }

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c

index a11e443..c073a68 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c

@@ -39,6 +39,8 @@ static int amdgpu_ctx_init(struct amdgpu_device*adev, struct amdgpu_ctx *ctx)

      if (!ctx->fences)
          return -ENOMEM;
  +    mutex_init(&ctx->lock);
+
      for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
          ctx->rings[i].sequence = 1;
          ctx->rings[i].fences = &ctx->fences[amdgpu_sched_jobs * i];
@@ -96,6 +98,8 @@ static void amdgpu_ctx_fini(struct amdgpu_ctx *ctx)
                        &ctx->rings[i].entity);
        amdgpu_queue_mgr_fini(adev, &ctx->queue_mgr);
+
+    mutex_destroy(&ctx->lock);
  }
    static int amdgpu_ctx_alloc(struct amdgpu_device *adev,


_______________________________________________
amd-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Re: [PATCH 1/2] drm/amdgpu:resolv deadlock between reset and cs_ioctl v4.

Reply via email to