On 06.11.2017 19:21, Marek Olšák wrote:
On Mon, Nov 6, 2017 at 11:23 AM, Nicolai Hähnle <nhaeh...@gmail.com> wrote:
From: Nicolai Hähnle <nicolai.haeh...@amd.com>

The idea is to fix the following interleaving of operations
that can arise from deferred fences:

  Thread 1 / Context 1          Thread 2 / Context 2
  --------------------          --------------------
  f = deferred flush
  <------- application-side synchronization ------->
                                fence_server_sync(f)
                                ...
                                flush()
  flush()

We will now stall in fence_server_sync until the flush of context 1
has completed.
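
At the Gallium interface level, the same sequence looks roughly like the
following (a sketch only: ctx1/ctx2 stand in for the two pipe_contexts,
and the application-side synchronization is elided):

  /* Thread 1 / Context 1: obtain a deferred fence without submitting. */
  struct pipe_fence_handle *f = NULL;
  ctx1->flush(ctx1, &f, PIPE_FLUSH_DEFERRED);

  /* Thread 2 / Context 2: queue a server-side wait on f, then flush.
   * fence_server_sync is where we now stall until thread 1 has flushed. */
  ctx2->fence_server_sync(ctx2, f);
  ctx2->flush(ctx2, NULL, 0);

  /* Thread 1 / Context 1: the flush that actually submits the work. */
  ctx1->flush(ctx1, NULL, 0);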

This scenario was unlikely to occur previously, because applications
seem to be doing

  Thread 1 / Context 1          Thread 2 / Context 2
  --------------------          --------------------
  f = glFenceSync()
  glFlush()
  <------- application-side synchronization ------->
                                glWaitSync(f)

... and indeed they probably *have* to use this ordering to avoid
deadlocks in the GLX model, where all GL operations conceptually
go through a single connection to the X server. However, it's less
clear whether applications have to do this with other WSIs (e.g. EGL).
Besides, even this sequence of GL commands can be translated into
the Gallium-level sequence outlined above when Gallium threading
and asynchronous flushes are used. So it makes sense to be more
robust.
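
For reference, the GL-level pattern from the table above, written out (a
minimal sketch, without sync-object error checking):

  /* Thread 1 / Context 1 */
  GLsync s = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
  glFlush();

  /* <------- application-side synchronization -------> */

  /* Thread 2 / Context 2: server-side wait, does not block the CPU. */
  glWaitSync(s, 0, GL_TIMEOUT_IGNORED);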

As a side effect, we no longer busy-wait on submission_in_progress.

We won't enable asynchronous flushes on radeon, but add a
cs_add_fence_dependency stub anyway to document the potential
issue.
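
For context, this is the util_queue_fence lifecycle that replaces the
busy-wait (a condensed sketch of the existing u_queue API as used below;
note that util_queue_fence_init creates the fence in the signalled state,
which is why deferred fences are reset immediately after init):

  struct util_queue_fence submitted;

  util_queue_fence_init(&submitted);   /* starts out signalled */
  util_queue_fence_reset(&submitted);  /* unsignal: submission still pending */

  /* Consumer side: sleeps instead of spinning until signalled. */
  util_queue_fence_wait(&submitted);

  /* Producer side, called from the submit thread once the IB has been
   * handed to the kernel and the sequence number is known. */
  util_queue_fence_signal(&submitted);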
---
  src/gallium/drivers/radeon/radeon_winsys.h    |  4 +++-
  src/gallium/winsys/amdgpu/drm/amdgpu_cs.c     | 21 +++++++++++++--------
  src/gallium/winsys/amdgpu/drm/amdgpu_cs.h     |  9 ++++++---
  src/gallium/winsys/radeon/drm/radeon_drm_cs.c | 19 +++++++++++++++++++
  4 files changed, 41 insertions(+), 12 deletions(-)

diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
index 2d3f646dc65..e8c486cb7f4 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -536,21 +536,23 @@ struct radeon_winsys {
       * \return Negative POSIX error code or 0 for success.
       *         Asynchronous submissions never return an error.
       */
      int (*cs_flush)(struct radeon_winsys_cs *cs,
                      unsigned flags,
                      struct pipe_fence_handle **fence);

      /**
       * Create a fence before the CS is flushed.
     * The user must flush manually to complete the initialization of the fence.
-     * The fence must not be used before the flush.
+     *
+     * The fence must not be used for anything except \ref cs_add_fence_dependency
+     * before the flush.
       */
      struct pipe_fence_handle *(*cs_get_next_fence)(struct radeon_winsys_cs *cs);

      /**
       * Return true if a buffer is referenced by a command stream.
       *
       * \param cs        A command stream.
       * \param buf       A winsys buffer.
       */
      bool (*cs_is_buffer_referenced)(struct radeon_winsys_cs *cs,
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index 0450ccc3596..0628e547351 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -43,21 +43,22 @@ amdgpu_fence_create(struct amdgpu_ctx *ctx, unsigned ip_type,
  {
     struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);

     fence->reference.count = 1;
     fence->ws = ctx->ws;
     fence->ctx = ctx;
     fence->fence.context = ctx->ctx;
     fence->fence.ip_type = ip_type;
     fence->fence.ip_instance = ip_instance;
     fence->fence.ring = ring;
-   fence->submission_in_progress = true;
+   util_queue_fence_init(&fence->submitted);
+   util_queue_fence_reset(&fence->submitted);
     p_atomic_inc(&ctx->refcount);
     return (struct pipe_fence_handle *)fence;
  }

  static struct pipe_fence_handle *
  amdgpu_fence_import_sync_file(struct radeon_winsys *rws, int fd)
  {
     struct amdgpu_winsys *ws = amdgpu_winsys(rws);
     struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);

@@ -74,66 +75,69 @@ amdgpu_fence_import_sync_file(struct radeon_winsys *rws, 
int fd)
        FREE(fence);
        return NULL;
     }

     r = amdgpu_cs_syncobj_import_sync_file(ws->dev, fence->syncobj, fd);
     if (r) {
        amdgpu_cs_destroy_syncobj(ws->dev, fence->syncobj);
        FREE(fence);
        return NULL;
     }
+
+   util_queue_fence_init(&fence->submitted);
+
     return (struct pipe_fence_handle*)fence;
  }

  static int amdgpu_fence_export_sync_file(struct radeon_winsys *rws,
                                          struct pipe_fence_handle *pfence)
  {
     struct amdgpu_winsys *ws = amdgpu_winsys(rws);
     struct amdgpu_fence *fence = (struct amdgpu_fence*)pfence;

     if (amdgpu_fence_is_syncobj(fence)) {
        int fd, r;

        /* Convert syncobj into sync_file. */
        r = amdgpu_cs_syncobj_export_sync_file(ws->dev, fence->syncobj, &fd);
        return r ? -1 : fd;
     }

-   os_wait_until_zero(&fence->submission_in_progress, PIPE_TIMEOUT_INFINITE);
+   util_queue_fence_wait(&fence->submitted);

     /* Convert the amdgpu fence into a fence FD. */
     int fd;
     if (amdgpu_cs_fence_to_handle(ws->dev, &fence->fence,
                                   AMDGPU_FENCE_TO_HANDLE_GET_SYNC_FILE_FD,
                                   (uint32_t*)&fd))
        return -1;

     return fd;
  }

  static void amdgpu_fence_submitted(struct pipe_fence_handle *fence,
                                     uint64_t seq_no,
                                     uint64_t *user_fence_cpu_address)
  {
     struct amdgpu_fence *rfence = (struct amdgpu_fence*)fence;

     rfence->fence.fence = seq_no;
     rfence->user_fence_cpu_address = user_fence_cpu_address;
-   rfence->submission_in_progress = false;
+   util_queue_fence_signal(&rfence->submitted);
  }

  static void amdgpu_fence_signalled(struct pipe_fence_handle *fence)
  {
     struct amdgpu_fence *rfence = (struct amdgpu_fence*)fence;

     rfence->signalled = true;
-   rfence->submission_in_progress = false;
+   util_queue_fence_signal(&rfence->submitted);
  }

  bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout,
                         bool absolute)
  {
     struct amdgpu_fence *rfence = (struct amdgpu_fence*)fence;
     uint32_t expired;
     int64_t abs_timeout;
     uint64_t *user_fence_cpu;
     int r;
@@ -157,22 +161,21 @@ bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout,
     }

     if (absolute)
        abs_timeout = timeout;
     else
        abs_timeout = os_time_get_absolute_timeout(timeout);

     /* The fence might not have a number assigned if its IB is being
      * submitted in the other thread right now. Wait until the submission
      * is done. */
-   if (!os_wait_until_zero_abs_timeout(&rfence->submission_in_progress,
-                                       abs_timeout))
+   if (!util_queue_fence_wait_timeout(&rfence->submitted, abs_timeout))
        return false;

     user_fence_cpu = rfence->user_fence_cpu_address;
     if (user_fence_cpu) {
        if (*user_fence_cpu >= rfence->fence.fence) {
           rfence->signalled = true;
           return true;
        }

        /* No timeout, just query: no need for the ioctl. */
@@ -1022,20 +1025,22 @@ static bool is_noop_fence_dependency(struct amdgpu_cs *acs,
     return amdgpu_fence_wait((void *)fence, 0, false);
  }

  static void amdgpu_cs_add_fence_dependency(struct radeon_winsys_cs *rws,
                                             struct pipe_fence_handle *pfence)
  {
     struct amdgpu_cs *acs = amdgpu_cs(rws);
     struct amdgpu_cs_context *cs = acs->csc;
     struct amdgpu_fence *fence = (struct amdgpu_fence*)pfence;

+   util_queue_fence_wait(&fence->submitted);
+
     if (is_noop_fence_dependency(acs, fence))
        return;

     unsigned idx = add_fence_dependency_entry(cs);
     amdgpu_fence_reference(&cs->fence_dependencies[idx],
                            (struct pipe_fence_handle*)fence);
  }

  static void amdgpu_add_bo_fence_dependencies(struct amdgpu_cs *acs,
                                               struct amdgpu_cs_buffer *buffer)
@@ -1297,21 +1302,21 @@ bo_list_error:

           for (unsigned i = 0; i < num_dependencies; i++) {
              struct amdgpu_fence *fence =
                 (struct amdgpu_fence*)cs->fence_dependencies[i];

              if (amdgpu_fence_is_syncobj(fence)) {
                 num_syncobj_dependencies++;
                 continue;
              }

-            assert(!fence->submission_in_progress);
+            assert(util_queue_fence_is_signalled(&fence->submitted));
              amdgpu_cs_chunk_fence_to_dep(&fence->fence, &dep_chunk[num++]);
           }

           chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_DEPENDENCIES;
           chunks[num_chunks].length_dw = sizeof(dep_chunk[0]) / 4 * num;
           chunks[num_chunks].chunk_data = (uintptr_t)dep_chunk;
           num_chunks++;
        }

        /* Syncobj dependencies. */
@@ -1320,21 +1325,21 @@ bo_list_error:
              alloca(num_syncobj_dependencies * sizeof(sem_chunk[0]));
           unsigned num = 0;

           for (unsigned i = 0; i < num_dependencies; i++) {
              struct amdgpu_fence *fence =
                 (struct amdgpu_fence*)cs->fence_dependencies[i];

              if (!amdgpu_fence_is_syncobj(fence))
                 continue;

-            assert(!fence->submission_in_progress);
+            assert(util_queue_fence_is_signalled(&fence->submitted));
              sem_chunk[num++].handle = fence->syncobj;
           }

           chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SYNCOBJ_IN;
           chunks[num_chunks].length_dw = sizeof(sem_chunk[0]) / 4 * num;
           chunks[num_chunks].chunk_data = (uintptr_t)sem_chunk;
           num_chunks++;
        }

        assert(num_chunks <= ARRAY_SIZE(chunks));
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
index b7bc9a20bbf..fbf44b36610 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
@@ -137,23 +137,25 @@ struct amdgpu_cs {
  struct amdgpu_fence {
     struct pipe_reference reference;
     /* If ctx == NULL, this fence is syncobj-based. */
     uint32_t syncobj;

     struct amdgpu_winsys *ws;
     struct amdgpu_ctx *ctx;  /* submission context */
     struct amdgpu_cs_fence fence;
     uint64_t *user_fence_cpu_address;

-   /* If the fence is unknown due to an IB still being submitted
-    * in the other thread. */
-   volatile int submission_in_progress; /* bool (int for atomicity) */
+   /* If the fence has been submitted. This is unsignalled for deferred fences
+    * (cs->next_fence) and while an IB is still being submitted in the submit
+    * thread. */
+   struct util_queue_fence submitted;
+
     volatile int signalled;              /* bool (int for atomicity) */
  };

  static inline bool amdgpu_fence_is_syncobj(struct amdgpu_fence *fence)
  {
     return fence->ctx == NULL;
  }

  static inline void amdgpu_ctx_unref(struct amdgpu_ctx *ctx)
  {
@@ -171,20 +173,21 @@ static inline void amdgpu_fence_reference(struct pipe_fence_handle **dst,
     struct amdgpu_fence *rsrc = (struct amdgpu_fence *)src;

     if (pipe_reference(&(*rdst)->reference, &rsrc->reference)) {
        struct amdgpu_fence *fence = *rdst;

        if (amdgpu_fence_is_syncobj(fence))
           amdgpu_cs_destroy_syncobj(fence->ws->dev, fence->syncobj);
        else
           amdgpu_ctx_unref(fence->ctx);

+      util_queue_fence_destroy(&fence->submitted);
        FREE(fence);
     }
     *rdst = rsrc;
  }

  int amdgpu_lookup_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo);

  static inline struct amdgpu_ib *
  amdgpu_ib(struct radeon_winsys_cs *base)
  {
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index 7220f3a0240..add88f80aae 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -779,28 +779,47 @@ radeon_drm_cs_get_next_fence(struct radeon_winsys_cs *rcs)
     }

     fence = radeon_cs_create_fence(rcs);
     if (!fence)
        return NULL;

     radeon_fence_reference(&cs->next_fence, fence);
     return fence;
  }

+static void
+radeon_drm_cs_add_fence_dependency(struct radeon_winsys_cs *cs,
+                                   struct pipe_fence_handle *fence)
+{
+   /* TODO: Handle the following unlikely multi-threaded scenario:
+    *
+    *  Thread 1 / Context 1                   Thread 2 / Context 2
+    *  --------------------                   --------------------
+    *  f = cs_get_next_fence()
+    *                                         cs_add_fence_dependency(f)
+    *                                         cs_flush()
+    *  cs_flush()
+    *
+    * We currently assume that this does not happen because we don't support
+    * asynchronous flushes on Radeon.
+    */

We do support asynchronous flushes and deferred fences on Radeon. Not
that it matters.

Well, they're not as asynchronous as on amdgpu, because u_threaded_context will sync unconditionally on radeon. You're right about deferred fences though.


Reviewed-by: Marek Olšák <marek.ol...@amd.com>

Thanks!


Marek



--
Learn what the world really is like,
but never forget how it ought to be.