u_threaded: implement asynchronous flushes

Nicolai Hähnle Sun, 22 Oct 2017 12:15:15 -0700

From: Nicolai Hähnle <nicolai.haeh...@amd.com>

This requires out-of-band creation of fences, and will be signaled to
the pipe_context::flush implementation by a special TC_FLUSH_ASYNC flag.
---
 src/gallium/auxiliary/util/u_threaded_context.c    | 96 +++++++++++++++++++++-
 src/gallium/auxiliary/util/u_threaded_context.h    | 56 +++++++++++++
 .../auxiliary/util/u_threaded_context_calls.h      |  1 +
 src/gallium/drivers/radeonsi/si_fence.c            | 90 +++++++++++++++++---
 src/gallium/drivers/radeonsi/si_pipe.c             |  1 +
 src/gallium/drivers/radeonsi/si_pipe.h             |  2 +
 6 files changed, 233 insertions(+), 13 deletions(-)


diff --git a/src/gallium/auxiliary/util/u_threaded_context.c b/src/gallium/auxiliary/util/u_threaded_context.c
index 24fab7f5cb6..485d912ca28 100644
--- a/src/gallium/auxiliary/util/u_threaded_context.c
+++ b/src/gallium/auxiliary/util/u_threaded_context.c
@@ -81,40 +81,47 @@ tc_debug_check(struct threaded_context *tc)
 
 static void
 tc_batch_execute(void *job, int thread_index)
 {
    struct tc_batch *batch = job;
    struct pipe_context *pipe = batch->pipe;
    struct tc_call *last = &batch->call[batch->num_total_call_slots];
 
    tc_batch_check(batch);
 
+   assert(!batch->token);
+
    for (struct tc_call *iter = batch->call; iter != last;
         iter += iter->num_call_slots) {
       tc_assert(iter->sentinel == TC_SENTINEL);
       execute_func[iter->call_id](pipe, &iter->payload);
    }
 
    tc_batch_check(batch);
    batch->num_total_call_slots = 0;
 }
 
 static void
 tc_batch_flush(struct threaded_context *tc)
 {
    struct tc_batch *next = &tc->batch_slots[tc->next];
 
    tc_assert(next->num_total_call_slots != 0);
    tc_batch_check(next);
    tc_debug_check(tc);
    p_atomic_add(&tc->num_offloaded_slots, next->num_total_call_slots);
 
+   if (next->token) {
+      next->token->tc = NULL;
+      tc_unflushed_batch_token_reference(&next->token, NULL);
+   }
+
    util_queue_add_job(&tc->queue, next, &next->fence, tc_batch_execute,
                       NULL);
    tc->last = tc->next;
    tc->next = (tc->next + 1) % TC_MAX_BATCHES;
 }
 
 /* This is the function that adds variable-sized calls into the current
  * batch. It also flushes the batch if there is not enough space there.
  * All other higher-level "add" functions use it.
  */
@@ -172,40 +179,63 @@ _tc_sync(struct threaded_context *tc, const char *info, const char *func)
    tc_debug_check(tc);
 
    /* Only wait for queued calls... */
    if (!util_queue_fence_is_signalled(&last->fence)) {
       util_queue_fence_wait(&last->fence);
       synced = true;
    }
 
    tc_debug_check(tc);
 
+   if (next->token) {
+      next->token->tc = NULL;
+      tc_unflushed_batch_token_reference(&next->token, NULL);
+   }
+
    /* .. and execute unflushed calls directly. */
    if (next->num_total_call_slots) {
       p_atomic_add(&tc->num_direct_slots, next->num_total_call_slots);
       tc_batch_execute(next, 0);
       synced = true;
    }
 
    if (synced) {
       p_atomic_inc(&tc->num_syncs);
 
       if (tc_strcmp(func, "tc_destroy") != 0)
          tc_printf("sync %s %s\n", func, info);
    }
 
    tc_debug_check(tc);
 }
 
 #define tc_sync(tc) _tc_sync(tc, "", __func__)
 #define tc_sync_msg(tc, info) _tc_sync(tc, info, __func__)
 
+/**
+ * Sync the threaded context if \p token still belongs to one of its unflushed
+ * batches. Meant for fence_finish when a deferred fence is waited on from the
+ * context that created it before that fence's batch was flushed.
+ *
+ * \p _pipe must be the wrapped context as given to pipe_screen::fence_finish.
+ */
+void
+threaded_context_flush(struct pipe_context *_pipe,
+                       struct tc_unflushed_batch_token *token)
+{
+   struct threaded_context *owner = token->tc;
+
+   /* Runs in the state-tracker / application thread, not the driver thread. */
+   if (owner && owner == threaded_context(_pipe))
+      tc_sync(owner);
+}
+
 static void
 tc_set_resource_reference(struct pipe_resource **dst, struct pipe_resource *src)
 {
    *dst = NULL;
    pipe_resource_reference(dst, src);
 }
 
 void
 threaded_resource_init(struct pipe_resource *res)
 {
@@ -1775,36 +1805,94 @@ tc_create_video_buffer(struct pipe_context *_pipe,
 {
    unreachable("Threaded context should not be enabled for video APIs");
    return NULL;
 }
 
 
 /********************************************************************
  * draw, launch, clear, blit, copy, flush
  */
 
+struct tc_flush_payload {
+   struct pipe_fence_handle *fence;
+   unsigned flags;
+};
+
+static void
+tc_call_flush(struct pipe_context *pipe, union tc_payload *payload)
+{
+   struct tc_flush_payload *args = (struct tc_flush_payload *)payload;
+   struct pipe_screen *screen = pipe->screen;
+   /* Run the queued flush, then release the payload's fence reference. */
+   pipe->flush(pipe, args->fence ? &args->fence : NULL, args->flags);
+   screen->fence_reference(screen, &args->fence, NULL);
+}
+
 static void
 tc_flush(struct pipe_context *_pipe, struct pipe_fence_handle **fence,
          unsigned flags)
 {
    struct threaded_context *tc = threaded_context(_pipe);
    struct pipe_context *pipe = tc->pipe;
+   struct pipe_screen *screen = pipe->screen;
    struct threaded_query *tq, *tmp;
+   bool async = flags & PIPE_FLUSH_DEFERRED;
+
+   if (flags & PIPE_FLUSH_ASYNC) {
+      struct tc_batch *last = &tc->batch_slots[tc->last];
+
+      /* Prefer to do the flush in the driver thread, but avoid the inter-thread
+       * communication overhead if the driver thread is currently idle and the
+       * caller is going to wait for the fence immediately anyway.
+       */
+      if (!(util_queue_fence_is_signalled(&last->fence) &&
+            (flags & PIPE_FLUSH_HINT_FINISH)))
+         async = true;
+   }
+
+   if (async && tc->create_fence) {
+      if (fence) {
+         /* Fences of a not-yet-flushed batch all share that batch's token. */
+         struct tc_batch *next = &tc->batch_slots[tc->next];
+
+         if (!next->token) {
+            next->token = malloc(sizeof(*next->token));
+            if (!next->token)
+               goto out_of_memory;
 
+            pipe_reference_init(&next->token->ref, 1);
+            next->token->tc = tc;
+         }
+
+         screen->fence_reference(screen, fence, tc->create_fence(pipe, next->token));
+         if (!*fence)
+            goto out_of_memory;
+      }
+
+      struct tc_flush_payload *p =
+         tc_add_struct_typed_call(tc, TC_CALL_flush, tc_flush_payload);
+      p->fence = fence ? *fence : NULL;
+      p->flags = flags | TC_FLUSH_ASYNC;
+
+      if (!(flags & PIPE_FLUSH_DEFERRED))
+         tc_batch_flush(tc);
+      return;
+   }
+
+out_of_memory:
    if (!(flags & PIPE_FLUSH_DEFERRED)) {
       LIST_FOR_EACH_ENTRY_SAFE(tq, tmp, &tc->unflushed_queries, head_unflushed) {
          tq->flushed = true;
          LIST_DEL(&tq->head_unflushed);
       }
    }
 
-   /* TODO: deferred flushes? */
    tc_sync_msg(tc, flags & PIPE_FLUSH_END_OF_FRAME ? "end of frame" :
                    flags & PIPE_FLUSH_DEFERRED ? "deferred fence" : "normal");
    pipe->flush(pipe, fence, flags);
 }
 
 /* This is actually variable-sized, because indirect isn't allocated if it's
  * not needed. */
 struct tc_full_draw_info {
    struct pipe_draw_info draw;
    struct pipe_draw_indirect_info indirect;
@@ -2240,22 +2328,24 @@ tc_destroy(struct pipe_context *_pipe)
       u_upload_destroy(tc->base.const_uploader);
 
    if (tc->base.stream_uploader)
       u_upload_destroy(tc->base.stream_uploader);
 
    tc_sync(tc);
 
    if (util_queue_is_initialized(&tc->queue)) {
       util_queue_destroy(&tc->queue);
 
-      for (unsigned i = 0; i < TC_MAX_BATCHES; i++)
+      for (unsigned i = 0; i < TC_MAX_BATCHES; i++) {
          util_queue_fence_destroy(&tc->batch_slots[i].fence);
+         assert(!tc->batch_slots[i].token);
+      }
    }
 
    slab_destroy_child(&tc->pool_transfers);
    assert(tc->batch_slots[tc->next].num_total_call_slots == 0);
    pipe->destroy(pipe);
    os_free_aligned(tc);
 }
 
 static const tc_execute execute_func[TC_NUM_CALLS] = {
 #define CALL(name) tc_call_##name,
@@ -2272,20 +2362,21 @@ static const tc_execute execute_func[TC_NUM_CALLS] = {
  *                             in pipe_screen.
  * \param replace_buffer  callback for replacing a pipe_resource's storage
  *                        with another pipe_resource's storage.
  * \param out  if successful, the threaded_context will be returned here in
  *             addition to the return value if "out" != NULL
  */
 struct pipe_context *
 threaded_context_create(struct pipe_context *pipe,
                         struct slab_parent_pool *parent_transfer_pool,
                         tc_replace_buffer_storage_func replace_buffer,
+                        tc_create_fence_func create_fence,
                         struct threaded_context **out)
 {
    struct threaded_context *tc;
 
    STATIC_ASSERT(sizeof(union tc_payload) <= 8);
    STATIC_ASSERT(sizeof(struct tc_call) <= 16);
 
    if (!pipe)
       return NULL;
 
@@ -2306,20 +2397,21 @@ threaded_context_create(struct pipe_context *pipe,
    assert(offsetof(struct threaded_context, batch_slots) % 16 == 0);
    assert(offsetof(struct threaded_context, batch_slots[0].call) % 16 == 0);
    assert(offsetof(struct threaded_context, batch_slots[0].call[1]) % 16 == 0);
    assert(offsetof(struct threaded_context, batch_slots[1].call) % 16 == 0);
 
    /* The driver context isn't wrapped, so set its "priv" to NULL. */
    pipe->priv = NULL;
 
    tc->pipe = pipe;
    tc->replace_buffer_storage = replace_buffer;
+   tc->create_fence = create_fence;
    tc->map_buffer_alignment =
       pipe->screen->get_param(pipe->screen, PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT);
    tc->base.priv = pipe; /* priv points to the wrapped driver context */
    tc->base.screen = pipe->screen;
    tc->base.destroy = tc_destroy;
 
    tc->base.stream_uploader = u_upload_clone(&tc->base, pipe->stream_uploader);
    if (pipe->stream_uploader == pipe->const_uploader)
       tc->base.const_uploader = tc->base.stream_uploader;
    else
diff --git a/src/gallium/auxiliary/util/u_threaded_context.h b/src/gallium/auxiliary/util/u_threaded_context.h
index 57805ee4a1e..f92f734e57f 100644
--- a/src/gallium/auxiliary/util/u_threaded_context.h
+++ b/src/gallium/auxiliary/util/u_threaded_context.h
@@ -101,20 +101,40 @@
  * 3) The driver isn't allowed to do buffer invalidations by itself under any
  *    circumstances. This is necessary for unsychronized maps to map the latest
  *    version of the buffer. (because invalidations can be queued, while
  *    unsychronized maps are not queued and they should return the latest
  *    storage after invalidation). The threaded context always sends
  *    TC_TRANSFER_MAP_NO_INVALIDATE into transfer_map and buffer_subdata to
  *    indicate this. Ignoring the flag will lead to failures.
  *    The threaded context uses its own buffer invalidation mechanism.
  *
  *
+ * Rules for fences
+ * ----------------
+ *
+ * When the threaded context wants to perform an asynchronous flush, it will
+ * use the create_fence callback to pre-create the fence from the calling
+ * thread. This pre-created fence will be passed to pipe_context::flush
+ * together with the TC_FLUSH_ASYNC flag.
+ *
+ * The callback receives the unwrapped context as a parameter, but must use it
+ * in a thread-safe way because it is called from a non-driver thread.
+ *
+ * If the threaded_context does not immediately flush the current batch, the
+ * callback also receives a tc_unflushed_batch_token. If fence_finish is called
+ * on the returned fence in the context that created the fence,
+ * threaded_context_flush must be called.
+ *
+ * The driver must implement pipe_context::fence_server_sync properly, since
+ * the threaded context handles PIPE_FLUSH_ASYNC.
+ *
+ *
  * Additional requirements
  * -----------------------
  *
  * get_query_result:
  *    If threaded_query::flushed == true, get_query_result should assume that
  *    it's called from a non-driver thread, in which case the driver shouldn't
  *    use the context in an unsafe way.
  *
  * replace_buffer_storage:
  *    The driver has to implement this callback, which will be called when
@@ -153,32 +173,40 @@
  * The batches are ordered in a ring and reused once they are idle again.
  * The batching is necessary for low queue/mutex overhead.
  *
  */
 
 #ifndef U_THREADED_CONTEXT_H
 #define U_THREADED_CONTEXT_H
 
 #include "pipe/p_context.h"
 #include "pipe/p_state.h"
+#include "util/u_inlines.h"
 #include "util/u_queue.h"
 #include "util/u_range.h"
 #include "util/slab.h"
 
+struct threaded_context;
+struct tc_unflushed_batch_token;
+
 /* These are transfer flags sent to drivers. */
 /* Never infer whether it's safe to use unsychronized mappings: */
 #define TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED (1u << 29)
 /* Don't invalidate buffers: */
 #define TC_TRANSFER_MAP_NO_INVALIDATE        (1u << 30)
 /* transfer_map is called from a non-driver thread: */
 #define TC_TRANSFER_MAP_THREADED_UNSYNC      (1u << 31)
 
+/* Custom flush flags sent to drivers. */
+/* fence is pre-populated with a fence created by the create_fence callback */
+#define TC_FLUSH_ASYNC        (1u << 31)
+
 /* Size of the queue = number of batch slots in memory.
  * - 1 batch is always idle and records new commands
  * - 1 batch is being executed
  * so the queue size is TC_MAX_BATCHES - 2 = number of waiting batches.
  *
  * Use a size as small as possible for low CPU L2 cache usage but large enough
  * so that the queue isn't stalled too often for not having enough idle batch
  * slots.
  */
 #define TC_MAX_BATCHES        10
@@ -197,20 +225,22 @@
 /* Threshold for when to enqueue buffer/texture_subdata as-is.
  * If the upload size is greater than this, it will do instead:
  * - for buffers: DISCARD_RANGE is done by the threaded context
  * - for textures: sync and call the driver directly
  */
 #define TC_MAX_SUBDATA_BYTES        320
 
 typedef void (*tc_replace_buffer_storage_func)(struct pipe_context *ctx,
                                                struct pipe_resource *dst,
                                                struct pipe_resource *src);
+typedef struct pipe_fence_handle *(*tc_create_fence_func)(struct pipe_context *ctx,
+                                                          struct tc_unflushed_batch_token *token);
 
 struct threaded_resource {
    struct pipe_resource b;
    const struct u_resource_vtbl *vtbl;
 
    /* Since buffer invalidations are queued, we can't use the base resource
     * for unsychronized mappings. This points to the latest version of
     * the buffer after the latest invalidation. It's only used for unsychro-
     * nized mappings in the non-driver thread. Initially it's set to &b.
     */
@@ -280,33 +310,45 @@ union tc_payload {
 #endif
 
 /* Each call slot should be aligned to its own size for optimal cache usage. */
 struct ALIGN16 tc_call {
    unsigned sentinel;
    ushort num_call_slots;
    ushort call_id;
    union tc_payload payload;
 };
 
+/**
+ * Reference-counted token for a batch that has not been flushed yet. "tc" is
+ * reset to NULL once that batch is flushed or executed by a sync.
+ * See the general rules for fences for an explanation.
+ */
+struct tc_unflushed_batch_token {
+   struct pipe_reference ref;
+   struct threaded_context *tc;
+};
+
 struct tc_batch {
    struct pipe_context *pipe;
    unsigned sentinel;
    unsigned num_total_call_slots;
+   struct tc_unflushed_batch_token *token;
    struct util_queue_fence fence;
    struct tc_call call[TC_CALLS_PER_BATCH];
 };
 
 struct threaded_context {
    struct pipe_context base;
    struct pipe_context *pipe;
    struct slab_child_pool pool_transfers;
    tc_replace_buffer_storage_func replace_buffer_storage;
+   tc_create_fence_func create_fence;
    unsigned map_buffer_alignment;
 
    struct list_head unflushed_queries;
 
    /* Counters for the HUD. */
    unsigned num_offloaded_slots;
    unsigned num_direct_slots;
    unsigned num_syncs;
 
    struct util_queue queue;
@@ -317,22 +359,27 @@ struct threaded_context {
 };
 
 void threaded_resource_init(struct pipe_resource *res);
 void threaded_resource_deinit(struct pipe_resource *res);
 struct pipe_context *threaded_context_unwrap_sync(struct pipe_context *pipe);
 
 struct pipe_context *
 threaded_context_create(struct pipe_context *pipe,
                         struct slab_parent_pool *parent_transfer_pool,
                         tc_replace_buffer_storage_func replace_buffer,
+                        tc_create_fence_func create_fence,
                         struct threaded_context **out);
 
+void
+threaded_context_flush(struct pipe_context *_pipe,
+                       struct tc_unflushed_batch_token *token);
+
 static inline struct threaded_context *
 threaded_context(struct pipe_context *pipe)
 {
    return (struct threaded_context*)pipe;
 }
 
 static inline struct threaded_resource *
 threaded_resource(struct pipe_resource *res)
 {
    return (struct threaded_resource*)res;
@@ -343,11 +390,20 @@ threaded_query(struct pipe_query *q)
 {
    return (struct threaded_query*)q;
 }
 
 static inline struct threaded_transfer *
 threaded_transfer(struct pipe_transfer *transfer)
 {
    return (struct threaded_transfer*)transfer;
 }
 
+static inline void
+tc_unflushed_batch_token_reference(struct tc_unflushed_batch_token **dst,
+                                   struct tc_unflushed_batch_token *src)
+{
+   if (pipe_reference((struct pipe_reference *)*dst, (struct pipe_reference *)src))
+      free(*dst);
+   *dst = src;
+}
+
 #endif
diff --git a/src/gallium/auxiliary/util/u_threaded_context_calls.h b/src/gallium/auxiliary/util/u_threaded_context_calls.h
index 546819a2580..1356c54baf2 100644
--- a/src/gallium/auxiliary/util/u_threaded_context_calls.h
+++ b/src/gallium/auxiliary/util/u_threaded_context_calls.h
@@ -1,10 +1,11 @@
+CALL(flush)
 CALL(destroy_query)
 CALL(begin_query)
 CALL(end_query)
 CALL(get_query_result_resource)
 CALL(render_condition)
 CALL(bind_sampler_states)
 CALL(set_framebuffer_state)
 CALL(set_tess_state)
 CALL(set_constant_buffer)
 CALL(set_scissor_states)
diff --git a/src/gallium/drivers/radeonsi/si_fence.c b/src/gallium/drivers/radeonsi/si_fence.c
index b416c47aa30..c51efad7106 100644
--- a/src/gallium/drivers/radeonsi/si_fence.c
+++ b/src/gallium/drivers/radeonsi/si_fence.c
@@ -19,27 +19,30 @@
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  *
  */
 
 #include <libsync.h>
 
 #include "util/os_time.h"
 #include "util/u_memory.h"
+#include "util/u_queue.h"
 
 #include "si_pipe.h"
 
 struct si_multi_fence {
        struct pipe_reference reference;
        struct pipe_fence_handle *gfx;
        struct pipe_fence_handle *sdma;
+       struct tc_unflushed_batch_token *tc_token;
+       struct util_queue_fence ready;
 
        /* If the context wasn't flushed at fence creation, this is non-NULL. */
        struct {
                struct r600_common_context *ctx;
                unsigned ib_index;
        } gfx_unflushed;
 };
 
 static void si_add_fence_dependency(struct r600_common_context *rctx,
                                    struct pipe_fence_handle *fence)
@@ -55,38 +58,66 @@ static void si_fence_reference(struct pipe_screen *screen,
                               struct pipe_fence_handle **dst,
                               struct pipe_fence_handle *src)
 {
        struct radeon_winsys *ws = ((struct r600_common_screen*)screen)->ws;
        struct si_multi_fence **rdst = (struct si_multi_fence **)dst;
        struct si_multi_fence *rsrc = (struct si_multi_fence *)src;
 
        if (pipe_reference(&(*rdst)->reference, &rsrc->reference)) {
                ws->fence_reference(&(*rdst)->gfx, NULL);
                ws->fence_reference(&(*rdst)->sdma, NULL);
+               tc_unflushed_batch_token_reference(&(*rdst)->tc_token, NULL);
                FREE(*rdst);
        }
         *rdst = rsrc;
 }
 
+static struct si_multi_fence *si_create_multi_fence(void)
+{
+       struct si_multi_fence *fence = CALLOC_STRUCT(si_multi_fence);
+       if (!fence)
+               return NULL;
+       /* The caller owns the single initial reference. */
+       pipe_reference_init(&fence->reference, 1);
+       util_queue_fence_init(&fence->ready);
+
+       return fence;
+}
+
+struct pipe_fence_handle *si_create_fence(struct pipe_context *ctx,
+                                         struct tc_unflushed_batch_token *tc_token)
+{
+       struct si_multi_fence *mf = si_create_multi_fence();
+
+       if (mf) {
+               /* Not ready until si_flush_from_st signals it. */
+               util_queue_fence_reset(&mf->ready);
+               tc_unflushed_batch_token_reference(&mf->tc_token, tc_token);
+       }
+       return (struct pipe_fence_handle *)mf;
+}
+
 static void si_fence_server_sync(struct pipe_context *ctx,
                                 struct pipe_fence_handle *fence)
 {
        struct r600_common_context *rctx = (struct r600_common_context *)ctx;
        struct si_multi_fence *rfence = (struct si_multi_fence *)fence;
 
        /* Only amdgpu needs to handle fence dependencies (for fence imports).
         * radeon synchronizes all rings by default and will not implement
         * fence imports.
         */
        if (rctx->screen->info.drm_major == 2)
                return;
 
+       util_queue_fence_wait(&rfence->ready);
+
        /* Only imported fences need to be handled by fence_server_sync,
         * because the winsys handles synchronizations automatically for BOs
         * within the process.
         *
         * Simply skip unflushed fences here, and the winsys will drop no-op
         * dependencies (i.e. dependencies within the same ring).
         */
        if (rfence->gfx_unflushed.ctx)
                return;
 
@@ -107,20 +138,46 @@ static boolean si_fence_finish(struct pipe_screen *screen,
                               uint64_t timeout)
 {
        struct radeon_winsys *rws = ((struct r600_common_screen*)screen)->ws;
        struct si_multi_fence *rfence = (struct si_multi_fence *)fence;
        struct r600_common_context *rctx;
        int64_t abs_timeout = os_time_get_absolute_timeout(timeout);
 
        ctx = threaded_context_unwrap_sync(ctx);
        rctx = ctx ? (struct r600_common_context*)ctx : NULL;
 
+       if (!util_queue_fence_is_signalled(&rfence->ready)) {
+               if (!timeout)
+                       return false;
+
+               if (rfence->tc_token) {
+                       /* Ensure that si_flush_from_st will be called for
+                        * this fence, but only if we're in the API thread
+                        * where the context is current.
+                        *
+                        * Note that the batch containing the flush may already
+                        * be in flight in the driver thread, so the fence
+                        * may not be ready yet when this call returns.
+                        */
+                       threaded_context_flush(ctx, rfence->tc_token);
+               }
+
+               if (timeout == PIPE_TIMEOUT_INFINITE) {
+                       util_queue_fence_wait(&rfence->ready);
+               } else {
+                       if (!util_queue_fence_wait_timeout(&rfence->ready, abs_timeout))
+                               return false;
+               }
+
+               assert(!rfence->tc_token);
+       }
+
        if (rfence->sdma) {
                if (!rws->fence_wait(rws, rfence->sdma, timeout))
                        return false;
 
                /* Recompute the timeout after waiting. */
                if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
                        int64_t time = os_time_get_nano();
                        timeout = abs_timeout > time ? abs_timeout - time : 0;
                }
        }
@@ -153,45 +210,46 @@ static void si_create_fence_fd(struct pipe_context *ctx,
 {
        struct r600_common_screen *rscreen = (struct r600_common_screen*)ctx->screen;
        struct radeon_winsys *ws = rscreen->ws;
        struct si_multi_fence *rfence;
 
        *pfence = NULL;
 
        if (!rscreen->info.has_sync_file)
                return;
 
-       rfence = CALLOC_STRUCT(si_multi_fence);
+       rfence = si_create_multi_fence();
        if (!rfence)
                return;
 
-       pipe_reference_init(&rfence->reference, 1);
        rfence->gfx = ws->fence_import_sync_file(ws, fd);
        if (!rfence->gfx) {
                FREE(rfence);
                return;
        }
 
        *pfence = (struct pipe_fence_handle*)rfence;
 }
 
 static int si_fence_get_fd(struct pipe_screen *screen,
                           struct pipe_fence_handle *fence)
 {
        struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
        struct radeon_winsys *ws = rscreen->ws;
        struct si_multi_fence *rfence = (struct si_multi_fence *)fence;
        int gfx_fd = -1, sdma_fd = -1;
 
        if (!rscreen->info.has_sync_file)
                return -1;
 
+       util_queue_fence_wait(&rfence->ready);
+
        /* Deferred fences aren't supported. */
        assert(!rfence->gfx_unflushed.ctx);
        if (rfence->gfx_unflushed.ctx)
                return -1;
 
        if (rfence->sdma) {
                sdma_fd = ws->fence_export_sync_file(ws, rfence->sdma);
                if (sdma_fd == -1)
                        return -1;
        }
@@ -253,40 +311,50 @@ static void si_flush_from_st(struct pipe_context *ctx,
                    fence) {
                        gfx_fence = rctx->ws->cs_get_next_fence(rctx->gfx.cs);
                        deferred_fence = true;
                } else {
                        rctx->gfx.flush(rctx, rflags, fence ? &gfx_fence : NULL);
                }
        }
 
        /* Both engines can signal out of order, so we need to keep both fences. */
        if (fence) {
-               struct si_multi_fence *multi_fence =
-                               CALLOC_STRUCT(si_multi_fence);
-               if (!multi_fence) {
-                       ws->fence_reference(&sdma_fence, NULL);
-                       ws->fence_reference(&gfx_fence, NULL);
-                       goto finish;
+               struct si_multi_fence *multi_fence;
+
+               if (flags & TC_FLUSH_ASYNC) {
+                       multi_fence = (struct si_multi_fence *)*fence;
+                       assert(multi_fence);
+               } else {
+                       multi_fence = si_create_multi_fence();
+                       if (!multi_fence) {
+                               ws->fence_reference(&sdma_fence, NULL);
+                               ws->fence_reference(&gfx_fence, NULL);
+                               goto finish;
+                       }
+
+                       screen->fence_reference(screen, fence, NULL);
+                       *fence = (struct pipe_fence_handle*)multi_fence;
                }
 
-               multi_fence->reference.count = 1;
                /* If both fences are NULL, fence_finish will always return true. */
                multi_fence->gfx = gfx_fence;
                multi_fence->sdma = sdma_fence;
 
                if (deferred_fence) {
                        multi_fence->gfx_unflushed.ctx = rctx;
                        multi_fence->gfx_unflushed.ib_index = rctx->num_gfx_cs_flushes;
                }
 
-               screen->fence_reference(screen, fence, NULL);
-               *fence = (struct pipe_fence_handle*)multi_fence;
+               if (flags & TC_FLUSH_ASYNC) {
+                       util_queue_fence_signal(&multi_fence->ready);
+                       tc_unflushed_batch_token_reference(&multi_fence->tc_token, NULL);
+               }
        }
 finish:
        if (!(flags & PIPE_FLUSH_DEFERRED)) {
                if (rctx->dma.cs)
                        ws->cs_sync_flush(rctx->dma.cs);
                ws->cs_sync_flush(rctx->gfx.cs);
        }
 }
 
 void si_init_fence_functions(struct si_context *ctx)
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index f5a7c96cc34..19428d8b4e7 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -400,20 +400,21 @@ static struct pipe_context *si_pipe_create_context(struct pipe_screen *screen,
        if (flags & (PIPE_CONTEXT_COMPUTE_ONLY | PIPE_CONTEXT_DEBUG))
                return ctx;
 
        /* When shaders are logged to stderr, asynchronous compilation is
         * disabled too. */
        if (sscreen->b.debug_flags & DBG_ALL_SHADERS)
                return ctx;
 
        return threaded_context_create(ctx, &sscreen->b.pool_transfers,
                                       si_replace_buffer_storage,
+                                      si_create_fence,
                                       &((struct si_context*)ctx)->b.tc);
 }
 
 /*
  * pipe_screen
  */
 static bool si_have_tgsi_compute(struct si_screen *sscreen)
 {
        /* Old kernels disallowed some register writes for SI
         * that are used for indirect dispatches. */
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index dad23d997e4..701b051e60f 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -599,20 +599,22 @@ void si_init_debug_functions(struct si_context *sctx);
 void si_check_vm_faults(struct r600_common_context *ctx,
                        struct radeon_saved_cs *saved, enum ring_type ring);
 bool si_replace_shader(unsigned num, struct ac_shader_binary *binary);
 
 /* si_dma.c */
 void si_init_dma_functions(struct si_context *sctx);
 
 /* si_fence.c */
 void si_init_fence_functions(struct si_context *ctx);
 void si_init_screen_fence_functions(struct si_screen *screen);
+struct pipe_fence_handle *si_create_fence(struct pipe_context *ctx,
+                                         struct tc_unflushed_batch_token *tc_token);
 
 /* si_hw_context.c */
 void si_destroy_saved_cs(struct si_saved_cs *scs);
 void si_context_gfx_flush(void *context, unsigned flags,
                          struct pipe_fence_handle **fence);
 void si_begin_new_cs(struct si_context *ctx);
 void si_need_cs_space(struct si_context *ctx);
 
 /* si_compute.c */
 void si_init_compute_functions(struct si_context *sctx);
-- 
2.11.0

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 09/25] gallium/u_threaded: implement asynchronous flushes

Reply via email to