From: Nicolai Hähnle <nicolai.haeh...@amd.com>

This requires out-of-band creation of fences; the asynchronous flush is
signaled to the pipe_context::flush implementation by a special
TC_FLUSH_ASYNC flag.
---
 src/gallium/auxiliary/util/u_threaded_context.c | 96 +++++++++++++++++++++-
 src/gallium/auxiliary/util/u_threaded_context.h | 56 +++++++++++++
 .../auxiliary/util/u_threaded_context_calls.h   |  1 +
 src/gallium/drivers/radeonsi/si_fence.c         | 90 +++++++++++++++++---
 src/gallium/drivers/radeonsi/si_pipe.c          |  1 +
 src/gallium/drivers/radeonsi/si_pipe.h          |  2 +
 6 files changed, 233 insertions(+), 13 deletions(-)
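The driver-side half of the new contract is small. The following sketch
(illustrative only, not part of the patch; the "my_" names are hypothetical)
shows the expected shape of a tc_create_fence_func implementation. The real
radeonsi implementation is si_create_fence in the patch below, and a
companion sketch of the wait side follows the patch.

#include "util/u_inlines.h"   /* pipe_reference_init */
#include "util/u_memory.h"    /* CALLOC_STRUCT */
#include "util/u_queue.h"
#include "util/u_threaded_context.h"

/* Hypothetical driver fence object, mirroring si_multi_fence below. */
struct my_fence {
   struct pipe_reference reference;
   struct util_queue_fence ready;              /* signaled once the flush ran */
   struct tc_unflushed_batch_token *tc_token;  /* non-NULL while unflushed */
};

static struct pipe_fence_handle *
my_create_fence(struct pipe_context *ctx,
                struct tc_unflushed_batch_token *tc_token)
{
   /* Called from the state-tracker thread; "ctx" is the unwrapped driver
    * context and may only be used in a thread-safe way here.
    */
   struct my_fence *fence = CALLOC_STRUCT(my_fence);
   if (!fence)
      return NULL;

   pipe_reference_init(&fence->reference, 1);
   /* util_queue_fence_init creates the fence in the signaled state;
    * reset it so that waiters block until the deferred flush executes.
    */
   util_queue_fence_init(&fence->ready);
   util_queue_fence_reset(&fence->ready);
   tc_unflushed_batch_token_reference(&fence->tc_token, tc_token);
   return (struct pipe_fence_handle *)fence;
}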
diff --git a/src/gallium/auxiliary/util/u_threaded_context.c b/src/gallium/auxiliary/util/u_threaded_context.c
index 24fab7f5cb6..485d912ca28 100644
--- a/src/gallium/auxiliary/util/u_threaded_context.c
+++ b/src/gallium/auxiliary/util/u_threaded_context.c
@@ -81,40 +81,47 @@ tc_debug_check(struct threaded_context *tc)
 static void
 tc_batch_execute(void *job, int thread_index)
 {
    struct tc_batch *batch = job;
    struct pipe_context *pipe = batch->pipe;
    struct tc_call *last = &batch->call[batch->num_total_call_slots];
 
    tc_batch_check(batch);
+
+   assert(!batch->token);
+
    for (struct tc_call *iter = batch->call; iter != last;
         iter += iter->num_call_slots) {
       tc_assert(iter->sentinel == TC_SENTINEL);
       execute_func[iter->call_id](pipe, &iter->payload);
    }
 
    tc_batch_check(batch);
    batch->num_total_call_slots = 0;
 }
 
 static void
 tc_batch_flush(struct threaded_context *tc)
 {
    struct tc_batch *next = &tc->batch_slots[tc->next];
 
    tc_assert(next->num_total_call_slots != 0);
    tc_batch_check(next);
    tc_debug_check(tc);
    p_atomic_add(&tc->num_offloaded_slots, next->num_total_call_slots);
 
+   if (next->token) {
+      next->token->tc = NULL;
+      tc_unflushed_batch_token_reference(&next->token, NULL);
+   }
+
    util_queue_add_job(&tc->queue, next, &next->fence, tc_batch_execute,
                       NULL);
    tc->last = tc->next;
    tc->next = (tc->next + 1) % TC_MAX_BATCHES;
 }
 
 /* This is the function that adds variable-sized calls into the current
  * batch. It also flushes the batch if there is not enough space there.
  * All other higher-level "add" functions use it.
  */
@@ -172,40 +179,63 @@ _tc_sync(struct threaded_context *tc, const char *info, const char *func)
    tc_debug_check(tc);
 
    /* Only wait for queued calls... */
    if (!util_queue_fence_is_signalled(&last->fence)) {
       util_queue_fence_wait(&last->fence);
       synced = true;
    }
 
    tc_debug_check(tc);
 
+   if (next->token) {
+      next->token->tc = NULL;
+      tc_unflushed_batch_token_reference(&next->token, NULL);
+   }
+
    /* .. and execute unflushed calls directly. */
    if (next->num_total_call_slots) {
       p_atomic_add(&tc->num_direct_slots, next->num_total_call_slots);
       tc_batch_execute(next, 0);
       synced = true;
    }
 
    if (synced) {
       p_atomic_inc(&tc->num_syncs);
 
       if (tc_strcmp(func, "tc_destroy") != 0)
          tc_printf("sync %s %s\n", func, info);
    }
 
    tc_debug_check(tc);
 }
 
 #define tc_sync(tc) _tc_sync(tc, "", __func__)
 #define tc_sync_msg(tc, info) _tc_sync(tc, info, __func__)
 
+/**
+ * Call this from fence_finish for same-context fence waits of deferred fences
+ * that haven't been flushed yet.
+ *
+ * The passed pipe_context must be the one passed to pipe_screen::fence_finish,
+ * i.e., the wrapped one.
+ */
+void
+threaded_context_flush(struct pipe_context *_pipe,
+                       struct tc_unflushed_batch_token *token)
+{
+   struct threaded_context *tc = threaded_context(_pipe);
+
+   /* This is called from the state-tracker / application thread. */
+   if (token->tc && token->tc == tc)
+      tc_sync(token->tc);
+}
+
 static void
 tc_set_resource_reference(struct pipe_resource **dst, struct pipe_resource *src)
 {
    *dst = NULL;
    pipe_resource_reference(dst, src);
 }
 
 void
 threaded_resource_init(struct pipe_resource *res)
 {
@@ -1775,36 +1805,94 @@ tc_create_video_buffer(struct pipe_context *_pipe,
 {
    unreachable("Threaded context should not be enabled for video APIs");
    return NULL;
 }
 
 
 /********************************************************************
  * draw, launch, clear, blit, copy, flush
  */
 
+struct tc_flush_payload {
+   struct pipe_fence_handle *fence;
+   unsigned flags;
+};
+
+static void
+tc_call_flush(struct pipe_context *pipe, union tc_payload *payload)
+{
+   struct tc_flush_payload *p = (struct tc_flush_payload *)payload;
+   struct pipe_screen *screen = pipe->screen;
+
+   pipe->flush(pipe, p->fence ? &p->fence : NULL, p->flags);
+   screen->fence_reference(screen, &p->fence, NULL);
+}
+
 static void
 tc_flush(struct pipe_context *_pipe, struct pipe_fence_handle **fence,
          unsigned flags)
 {
    struct threaded_context *tc = threaded_context(_pipe);
    struct pipe_context *pipe = tc->pipe;
+   struct pipe_screen *screen = pipe->screen;
    struct threaded_query *tq, *tmp;
+   bool async = flags & PIPE_FLUSH_DEFERRED;
+
+   if (flags & PIPE_FLUSH_ASYNC) {
+      struct tc_batch *last = &tc->batch_slots[tc->last];
+
+      /* Prefer to do the flush in the driver thread, but avoid the
+       * inter-thread communication overhead if the driver thread is
+       * currently idle and the caller is going to wait for the fence
+       * immediately anyway.
+       */
+      if (!(util_queue_fence_is_signalled(&last->fence) &&
+            (flags & PIPE_FLUSH_HINT_FINISH)))
+         async = true;
+   }
+
+   if (async) {
+      if (fence) {
+         struct tc_batch *next = &tc->batch_slots[tc->next];
+
+         if (!next->token) {
+            next->token = malloc(sizeof(*next->token));
+            if (!next->token)
+               goto out_of_memory;
+
+            pipe_reference_init(&next->token->ref, 1);
+            next->token->tc = tc;
+         }
+
+         screen->fence_reference(screen, fence,
+                                 tc->create_fence(pipe, next->token));
+         if (!*fence)
+            goto out_of_memory;
+      }
+
+      struct tc_flush_payload *p =
+         tc_add_struct_typed_call(tc, TC_CALL_flush, tc_flush_payload);
+      p->fence = fence ? *fence : NULL;
+      p->flags = flags | TC_FLUSH_ASYNC;
+
+      if (!(flags & PIPE_FLUSH_DEFERRED))
+         tc_batch_flush(tc);
+      return;
+   }
+
+out_of_memory:
    if (!(flags & PIPE_FLUSH_DEFERRED)) {
       LIST_FOR_EACH_ENTRY_SAFE(tq, tmp, &tc->unflushed_queries,
                                head_unflushed) {
          tq->flushed = true;
          LIST_DEL(&tq->head_unflushed);
       }
    }
 
-   /* TODO: deferred flushes? */
    tc_sync_msg(tc, flags & PIPE_FLUSH_END_OF_FRAME ? "end of frame" :
                    flags & PIPE_FLUSH_DEFERRED ? "deferred fence" : "normal");
    pipe->flush(pipe, fence, flags);
 }
 
 /* This is actually variable-sized, because indirect isn't allocated if it's
  * not needed.
  */
 struct tc_full_draw_info {
    struct pipe_draw_info draw;
    struct pipe_draw_indirect_info indirect;
@@ -2240,22 +2328,24 @@ tc_destroy(struct pipe_context *_pipe)
       u_upload_destroy(tc->base.const_uploader);
 
    if (tc->base.stream_uploader)
       u_upload_destroy(tc->base.stream_uploader);
 
    tc_sync(tc);
 
    if (util_queue_is_initialized(&tc->queue)) {
       util_queue_destroy(&tc->queue);
 
-      for (unsigned i = 0; i < TC_MAX_BATCHES; i++)
+      for (unsigned i = 0; i < TC_MAX_BATCHES; i++) {
          util_queue_fence_destroy(&tc->batch_slots[i].fence);
+         assert(!tc->batch_slots[i].token);
+      }
    }
 
    slab_destroy_child(&tc->pool_transfers);
    assert(tc->batch_slots[tc->next].num_total_call_slots == 0);
    pipe->destroy(pipe);
    os_free_aligned(tc);
 }
 
 static const tc_execute execute_func[TC_NUM_CALLS] = {
 #define CALL(name) tc_call_##name,
@@ -2272,20 +2362,21 @@ static const tc_execute execute_func[TC_NUM_CALLS] = {
  * in pipe_screen.
  * \param replace_buffer  callback for replacing a pipe_resource's storage
  *                        with another pipe_resource's storage.
  * \param out  if successful, the threaded_context will be returned here in
  *             addition to the return value if "out" != NULL
  */
 struct pipe_context *
 threaded_context_create(struct pipe_context *pipe,
                         struct slab_parent_pool *parent_transfer_pool,
                         tc_replace_buffer_storage_func replace_buffer,
+                        tc_create_fence_func create_fence,
                         struct threaded_context **out)
 {
    struct threaded_context *tc;
 
    STATIC_ASSERT(sizeof(union tc_payload) <= 8);
    STATIC_ASSERT(sizeof(struct tc_call) <= 16);
 
    if (!pipe)
       return NULL;
@@ -2306,20 +2397,21 @@ threaded_context_create(struct pipe_context *pipe,
    assert(offsetof(struct threaded_context, batch_slots) % 16 == 0);
    assert(offsetof(struct threaded_context, batch_slots[0].call) % 16 == 0);
    assert(offsetof(struct threaded_context, batch_slots[0].call[1]) % 16 == 0);
    assert(offsetof(struct threaded_context, batch_slots[1].call) % 16 == 0);
 
    /* The driver context isn't wrapped, so set its "priv" to NULL. */
    pipe->priv = NULL;
 
    tc->pipe = pipe;
    tc->replace_buffer_storage = replace_buffer;
+   tc->create_fence = create_fence;
    tc->map_buffer_alignment =
       pipe->screen->get_param(pipe->screen, PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT);
    tc->base.priv = pipe; /* priv points to the wrapped driver context */
    tc->base.screen = pipe->screen;
    tc->base.destroy = tc_destroy;
 
    tc->base.stream_uploader = u_upload_clone(&tc->base, pipe->stream_uploader);
    if (pipe->stream_uploader == pipe->const_uploader)
       tc->base.const_uploader = tc->base.stream_uploader;
    else
diff --git a/src/gallium/auxiliary/util/u_threaded_context.h b/src/gallium/auxiliary/util/u_threaded_context.h
index 57805ee4a1e..f92f734e57f 100644
--- a/src/gallium/auxiliary/util/u_threaded_context.h
+++ b/src/gallium/auxiliary/util/u_threaded_context.h
@@ -101,20 +101,40 @@
  * 3) The driver isn't allowed to do buffer invalidations by itself under any
  *    circumstances. This is necessary for unsychronized maps to map the latest
  *    version of the buffer. (because invalidations can be queued, while
  *    unsychronized maps are not queued and they should return the latest
  *    storage after invalidation). The threaded context always sends
  *    TC_TRANSFER_MAP_NO_INVALIDATE into transfer_map and buffer_subdata to
  *    indicate this. Ignoring the flag will lead to failures.
  *    The threaded context uses its own buffer invalidation mechanism.
  *
  *
+ * Rules for fences
+ * ----------------
+ *
+ * When the threaded context wants to perform an asynchronous flush, it will
+ * use the create_fence callback to pre-create the fence from the calling
+ * thread.
+ * This pre-created fence will be passed to pipe_context::flush
+ * together with the TC_FLUSH_ASYNC flag.
+ *
+ * The callback receives the unwrapped context as a parameter, but must use
+ * it in a thread-safe way because it is called from a non-driver thread.
+ *
+ * If the threaded_context does not immediately flush the current batch, the
+ * callback also receives a tc_unflushed_batch_token. If fence_finish is
+ * called on the returned fence in the context that created the fence,
+ * threaded_context_flush must be called.
+ *
+ * The driver must implement pipe_context::fence_server_sync properly, since
+ * the threaded context handles PIPE_FLUSH_ASYNC.
+ *
+ *
  * Additional requirements
  * -----------------------
  *
  * get_query_result:
  *    If threaded_query::flushed == true, get_query_result should assume that
  *    it's called from a non-driver thread, in which case the driver shouldn't
  *    use the context in an unsafe way.
  *
  * replace_buffer_storage:
  *    The driver has to implement this callback, which will be called when
@@ -153,32 +173,40 @@
  * The batches are ordered in a ring and reused once they are idle again.
  * The batching is necessary for low queue/mutex overhead.
  *
  */
 
 #ifndef U_THREADED_CONTEXT_H
 #define U_THREADED_CONTEXT_H
 
 #include "pipe/p_context.h"
 #include "pipe/p_state.h"
+#include "util/u_inlines.h"
 #include "util/u_queue.h"
 #include "util/u_range.h"
 #include "util/slab.h"
 
+struct threaded_context;
+struct tc_unflushed_batch_token;
+
 /* These are transfer flags sent to drivers. */
 /* Never infer whether it's safe to use unsychronized mappings: */
 #define TC_TRANSFER_MAP_NO_INFER_UNSYNCHRONIZED (1u << 29)
 /* Don't invalidate buffers: */
 #define TC_TRANSFER_MAP_NO_INVALIDATE   (1u << 30)
 /* transfer_map is called from a non-driver thread: */
 #define TC_TRANSFER_MAP_THREADED_UNSYNC (1u << 31)
 
+/* Custom flush flags sent to drivers. */
+/* fence is pre-populated with a fence created by the create_fence callback */
+#define TC_FLUSH_ASYNC (1u << 31)
+
 /* Size of the queue = number of batch slots in memory.
  * - 1 batch is always idle and records new commands
  * - 1 batch is being executed
  * so the queue size is TC_MAX_BATCHES - 2 = number of waiting batches.
  *
  * Use a size as small as possible for low CPU L2 cache usage but large enough
  * so that the queue isn't stalled too often for not having enough idle batch
  * slots.
  */
 #define TC_MAX_BATCHES 10
@@ -197,20 +225,22 @@
 /* Threshold for when to enqueue buffer/texture_subdata as-is.
  * If the upload size is greater than this, it will do instead:
  * - for buffers: DISCARD_RANGE is done by the threaded context
  * - for textures: sync and call the driver directly
  */
 #define TC_MAX_SUBDATA_BYTES 320
 
 typedef void (*tc_replace_buffer_storage_func)(struct pipe_context *ctx,
                                                struct pipe_resource *dst,
                                                struct pipe_resource *src);
+typedef struct pipe_fence_handle *(*tc_create_fence_func)(struct pipe_context *ctx,
+                                                          struct tc_unflushed_batch_token *token);
 
 struct threaded_resource {
    struct pipe_resource b;
    const struct u_resource_vtbl *vtbl;
 
    /* Since buffer invalidations are queued, we can't use the base resource
     * for unsychronized mappings. This points to the latest version of
     * the buffer after the latest invalidation. It's only used for unsychro-
    * nized mappings in the non-driver thread. Initially it's set to &b.
    */
@@ -280,33 +310,45 @@ union tc_payload {
 #endif
 
 /* Each call slot should be aligned to its own size for optimal cache
  * usage.
  */
 struct ALIGN16 tc_call {
    unsigned sentinel;
    ushort num_call_slots;
    ushort call_id;
    union tc_payload payload;
 };
 
+/**
+ * A token representing an unflushed batch.
+ *
+ * See the general rules for fences for an explanation.
+ */
+struct tc_unflushed_batch_token {
+   struct pipe_reference ref;
+   struct threaded_context *tc;
+};
+
 struct tc_batch {
    struct pipe_context *pipe;
    unsigned sentinel;
    unsigned num_total_call_slots;
+   struct tc_unflushed_batch_token *token;
    struct util_queue_fence fence;
    struct tc_call call[TC_CALLS_PER_BATCH];
 };
 
 struct threaded_context {
    struct pipe_context base;
    struct pipe_context *pipe;
    struct slab_child_pool pool_transfers;
    tc_replace_buffer_storage_func replace_buffer_storage;
+   tc_create_fence_func create_fence;
    unsigned map_buffer_alignment;
 
    struct list_head unflushed_queries;
 
    /* Counters for the HUD. */
    unsigned num_offloaded_slots;
    unsigned num_direct_slots;
    unsigned num_syncs;
 
    struct util_queue queue;
@@ -317,22 +359,27 @@ struct threaded_context {
 };
 
 void threaded_resource_init(struct pipe_resource *res);
 void threaded_resource_deinit(struct pipe_resource *res);
 struct pipe_context *threaded_context_unwrap_sync(struct pipe_context *pipe);
 
 struct pipe_context *
 threaded_context_create(struct pipe_context *pipe,
                         struct slab_parent_pool *parent_transfer_pool,
                         tc_replace_buffer_storage_func replace_buffer,
+                        tc_create_fence_func create_fence,
                         struct threaded_context **out);
 
+void
+threaded_context_flush(struct pipe_context *_pipe,
+                       struct tc_unflushed_batch_token *token);
+
 static inline struct threaded_context *
 threaded_context(struct pipe_context *pipe)
 {
    return (struct threaded_context*)pipe;
 }
 
 static inline struct threaded_resource *
 threaded_resource(struct pipe_resource *res)
 {
    return (struct threaded_resource*)res;
@@ -343,11 +390,20 @@ threaded_query(struct pipe_query *q)
 {
    return (struct threaded_query*)q;
 }
 
 static inline struct threaded_transfer *
 threaded_transfer(struct pipe_transfer *transfer)
 {
    return (struct threaded_transfer*)transfer;
 }
 
+static inline void
+tc_unflushed_batch_token_reference(struct tc_unflushed_batch_token **dst,
+                                   struct tc_unflushed_batch_token *src)
+{
+   if (pipe_reference((struct pipe_reference *)*dst, (struct pipe_reference *)src))
+      free(*dst);
+   *dst = src;
+}
+
 #endif
diff --git a/src/gallium/auxiliary/util/u_threaded_context_calls.h b/src/gallium/auxiliary/util/u_threaded_context_calls.h
index 546819a2580..1356c54baf2 100644
--- a/src/gallium/auxiliary/util/u_threaded_context_calls.h
+++ b/src/gallium/auxiliary/util/u_threaded_context_calls.h
@@ -1,10 +1,11 @@
+CALL(flush)
 CALL(destroy_query)
 CALL(begin_query)
 CALL(end_query)
 CALL(get_query_result_resource)
 CALL(render_condition)
 CALL(bind_sampler_states)
 CALL(set_framebuffer_state)
 CALL(set_tess_state)
 CALL(set_constant_buffer)
 CALL(set_scissor_states)
diff --git a/src/gallium/drivers/radeonsi/si_fence.c b/src/gallium/drivers/radeonsi/si_fence.c
index b416c47aa30..c51efad7106 100644
--- a/src/gallium/drivers/radeonsi/si_fence.c
+++ b/src/gallium/drivers/radeonsi/si_fence.c
@@ -19,27 +19,30 @@
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  *
  */
 
 #include <libsync.h>
 
 #include "util/os_time.h"
 #include "util/u_memory.h"
+#include "util/u_queue.h"
 
 #include "si_pipe.h"
 
 struct si_multi_fence {
    struct pipe_reference reference;
    struct pipe_fence_handle *gfx;
    struct pipe_fence_handle *sdma;
+   struct tc_unflushed_batch_token *tc_token;
+   struct util_queue_fence ready;
 
    /* If the context wasn't flushed at fence creation, this is non-NULL. */
    struct {
       struct r600_common_context *ctx;
       unsigned ib_index;
    } gfx_unflushed;
 };
 
 static void si_add_fence_dependency(struct r600_common_context *rctx,
                                     struct pipe_fence_handle *fence)
@@ -55,38 +58,66 @@ static void si_fence_reference(struct pipe_screen *screen,
                                struct pipe_fence_handle **dst,
                                struct pipe_fence_handle *src)
 {
    struct radeon_winsys *ws = ((struct r600_common_screen*)screen)->ws;
    struct si_multi_fence **rdst = (struct si_multi_fence **)dst;
    struct si_multi_fence *rsrc = (struct si_multi_fence *)src;
 
    if (pipe_reference(&(*rdst)->reference, &rsrc->reference)) {
       ws->fence_reference(&(*rdst)->gfx, NULL);
       ws->fence_reference(&(*rdst)->sdma, NULL);
+      tc_unflushed_batch_token_reference(&(*rdst)->tc_token, NULL);
       FREE(*rdst);
    }
    *rdst = rsrc;
 }
 
+static struct si_multi_fence *si_create_multi_fence(void)
+{
+   struct si_multi_fence *fence = CALLOC_STRUCT(si_multi_fence);
+   if (!fence)
+      return NULL;
+
+   pipe_reference_init(&fence->reference, 1);
+   util_queue_fence_init(&fence->ready);
+
+   return fence;
+}
+
+struct pipe_fence_handle *si_create_fence(struct pipe_context *ctx,
+                                          struct tc_unflushed_batch_token *tc_token)
+{
+   struct si_multi_fence *fence = si_create_multi_fence();
+   if (!fence)
+      return NULL;
+
+   util_queue_fence_reset(&fence->ready);
+   tc_unflushed_batch_token_reference(&fence->tc_token, tc_token);
+
+   return (struct pipe_fence_handle *)fence;
+}
+
 static void si_fence_server_sync(struct pipe_context *ctx,
                                  struct pipe_fence_handle *fence)
 {
    struct r600_common_context *rctx = (struct r600_common_context *)ctx;
    struct si_multi_fence *rfence = (struct si_multi_fence *)fence;
 
    /* Only amdgpu needs to handle fence dependencies (for fence imports).
     * radeon synchronizes all rings by default and will not implement
     * fence imports.
     */
    if (rctx->screen->info.drm_major == 2)
       return;
 
+   util_queue_fence_wait(&rfence->ready);
+
    /* Only imported fences need to be handled by fence_server_sync,
     * because the winsys handles synchronizations automatically for BOs
     * within the process.
     *
     * Simply skip unflushed fences here, and the winsys will drop no-op
     * dependencies (i.e. dependencies within the same ring).
     */
    if (rfence->gfx_unflushed.ctx)
       return;
@@ -107,20 +138,46 @@ static boolean si_fence_finish(struct pipe_screen *screen,
                                uint64_t timeout)
 {
    struct radeon_winsys *rws = ((struct r600_common_screen*)screen)->ws;
    struct si_multi_fence *rfence = (struct si_multi_fence *)fence;
    struct r600_common_context *rctx;
    int64_t abs_timeout = os_time_get_absolute_timeout(timeout);
 
    ctx = threaded_context_unwrap_sync(ctx);
    rctx = ctx ? (struct r600_common_context*)ctx : NULL;
 
+   if (!util_queue_fence_is_signalled(&rfence->ready)) {
+      if (!timeout)
+         return false;
+
+      if (rfence->tc_token) {
+         /* Ensure that si_flush_from_st will be called for
+          * this fence, but only if we're in the API thread
+          * where the context is current.
+          *
+          * Note that the batch containing the flush may already
+          * be in flight in the driver thread, so the fence
+          * may not be ready yet when this call returns.
+          */
+         threaded_context_flush(ctx, rfence->tc_token);
+      }
+
+      if (timeout == PIPE_TIMEOUT_INFINITE) {
+         util_queue_fence_wait(&rfence->ready);
+      } else {
+         if (!util_queue_fence_wait_timeout(&rfence->ready, abs_timeout))
+            return false;
+      }
+
+      assert(!rfence->tc_token);
+   }
+
    if (rfence->sdma) {
       if (!rws->fence_wait(rws, rfence->sdma, timeout))
          return false;
 
       /* Recompute the timeout after waiting. */
       if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
          int64_t time = os_time_get_nano();
          timeout = abs_timeout > time ? abs_timeout - time : 0;
       }
    }
@@ -153,45 +210,46 @@ static void si_create_fence_fd(struct pipe_context *ctx,
 {
    struct r600_common_screen *rscreen = (struct r600_common_screen*)ctx->screen;
    struct radeon_winsys *ws = rscreen->ws;
    struct si_multi_fence *rfence;
 
    *pfence = NULL;
 
    if (!rscreen->info.has_sync_file)
       return;
 
-   rfence = CALLOC_STRUCT(si_multi_fence);
+   rfence = si_create_multi_fence();
    if (!rfence)
       return;
 
-   pipe_reference_init(&rfence->reference, 1);
    rfence->gfx = ws->fence_import_sync_file(ws, fd);
    if (!rfence->gfx) {
       FREE(rfence);
       return;
    }
 
    *pfence = (struct pipe_fence_handle*)rfence;
 }
 
 static int si_fence_get_fd(struct pipe_screen *screen,
                            struct pipe_fence_handle *fence)
 {
    struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
    struct radeon_winsys *ws = rscreen->ws;
    struct si_multi_fence *rfence = (struct si_multi_fence *)fence;
    int gfx_fd = -1, sdma_fd = -1;
 
    if (!rscreen->info.has_sync_file)
       return -1;
 
+   util_queue_fence_wait(&rfence->ready);
+
    /* Deferred fences aren't supported. */
    assert(!rfence->gfx_unflushed.ctx);
    if (rfence->gfx_unflushed.ctx)
       return -1;
 
    if (rfence->sdma) {
       sdma_fd = ws->fence_export_sync_file(ws, rfence->sdma);
       if (sdma_fd == -1)
          return -1;
    }
@@ -253,40 +311,50 @@ static void si_flush_from_st(struct pipe_context *ctx,
        fence) {
       gfx_fence = rctx->ws->cs_get_next_fence(rctx->gfx.cs);
       deferred_fence = true;
    } else {
       rctx->gfx.flush(rctx, rflags, fence ? &gfx_fence : NULL);
    }
 }
 
 /* Both engines can signal out of order, so we need to keep both fences. */
 if (fence) {
-      struct si_multi_fence *multi_fence =
-         CALLOC_STRUCT(si_multi_fence);
-      if (!multi_fence) {
-         ws->fence_reference(&sdma_fence, NULL);
-         ws->fence_reference(&gfx_fence, NULL);
-         goto finish;
+      struct si_multi_fence *multi_fence;
+
+      if (flags & TC_FLUSH_ASYNC) {
+         multi_fence = (struct si_multi_fence *)*fence;
+         assert(multi_fence);
+      } else {
+         multi_fence = si_create_multi_fence();
+         if (!multi_fence) {
+            ws->fence_reference(&sdma_fence, NULL);
+            ws->fence_reference(&gfx_fence, NULL);
+            goto finish;
+         }
+
+         screen->fence_reference(screen, fence, NULL);
+         *fence = (struct pipe_fence_handle*)multi_fence;
       }
 
-      multi_fence->reference.count = 1;
       /* If both fences are NULL, fence_finish will always return true. */
       multi_fence->gfx = gfx_fence;
       multi_fence->sdma = sdma_fence;
 
       if (deferred_fence) {
          multi_fence->gfx_unflushed.ctx = rctx;
          multi_fence->gfx_unflushed.ib_index = rctx->num_gfx_cs_flushes;
       }
 
-      screen->fence_reference(screen, fence, NULL);
-      *fence = (struct pipe_fence_handle*)multi_fence;
+      if (flags & TC_FLUSH_ASYNC) {
+         util_queue_fence_signal(&multi_fence->ready);
+         tc_unflushed_batch_token_reference(&multi_fence->tc_token, NULL);
+      }
    }
 finish:
    if (!(flags & PIPE_FLUSH_DEFERRED)) {
       if (rctx->dma.cs)
          ws->cs_sync_flush(rctx->dma.cs);
       ws->cs_sync_flush(rctx->gfx.cs);
    }
 }
 
 void si_init_fence_functions(struct si_context *ctx)
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index f5a7c96cc34..19428d8b4e7 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -400,20 +400,21 @@ static struct pipe_context *si_pipe_create_context(struct pipe_screen *screen,
    if (flags & (PIPE_CONTEXT_COMPUTE_ONLY | PIPE_CONTEXT_DEBUG))
       return ctx;
 
    /* When shaders are logged to stderr, asynchronous compilation is
     * disabled too. */
    if (sscreen->b.debug_flags & DBG_ALL_SHADERS)
       return ctx;
 
    return threaded_context_create(ctx, &sscreen->b.pool_transfers,
                                   si_replace_buffer_storage,
+                                  si_create_fence,
                                   &((struct si_context*)ctx)->b.tc);
 }
 
 /*
  * pipe_screen
  */
 static bool si_have_tgsi_compute(struct si_screen *sscreen)
 {
    /* Old kernels disallowed some register writes for SI
     * that are used for indirect dispatches. */
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index dad23d997e4..701b051e60f 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -599,20 +599,22 @@ void si_init_debug_functions(struct si_context *sctx);
 void si_check_vm_faults(struct r600_common_context *ctx,
                         struct radeon_saved_cs *saved, enum ring_type ring);
 bool si_replace_shader(unsigned num, struct ac_shader_binary *binary);
 
 /* si_dma.c */
 void si_init_dma_functions(struct si_context *sctx);
 
 /* si_fence.c */
 void si_init_fence_functions(struct si_context *ctx);
 void si_init_screen_fence_functions(struct si_screen *screen);
+struct pipe_fence_handle *si_create_fence(struct pipe_context *ctx,
+                                          struct tc_unflushed_batch_token *tc_token);
 
 /* si_hw_context.c */
 void si_destroy_saved_cs(struct si_saved_cs *scs);
 void si_context_gfx_flush(void *context, unsigned flags,
                           struct pipe_fence_handle **fence);
 void si_begin_new_cs(struct si_context *ctx);
 void si_need_cs_space(struct si_context *ctx);
 
 /* si_compute.c */
 void si_init_compute_functions(struct si_context *sctx);
-- 
2.11.0

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
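As promised above, here is the matching wait-side sketch (again illustrative
only, not part of the patch; the "my_" names are hypothetical and the final
winsys wait is elided). It summarizes the protocol that si_fence_finish
implements: for a fence whose batch may still be unflushed, flush the batch
for same-context waits, then wait for the driver thread to signal readiness:

static bool
my_fence_finish(struct pipe_screen *screen, struct pipe_context *ctx,
                struct my_fence *fence, uint64_t timeout)
{
   int64_t abs_timeout = os_time_get_absolute_timeout(timeout);

   /* Unwrap: "ctx" may be the threaded_context wrapper. */
   ctx = threaded_context_unwrap_sync(ctx);

   if (!util_queue_fence_is_signalled(&fence->ready)) {
      if (!timeout)
         return false; /* the flush hasn't reached the driver thread yet */

      /* Same-context wait on a fence whose batch may still be unflushed:
       * hand the batch containing the flush call to the driver thread.
       * This is a no-op if that batch was already submitted.
       */
      if (fence->tc_token)
         threaded_context_flush(ctx, fence->tc_token);

      /* Wait for the driver thread to execute the flush and signal
       * the fence as ready.
       */
      if (timeout == PIPE_TIMEOUT_INFINITE)
         util_queue_fence_wait(&fence->ready);
      else if (!util_queue_fence_wait_timeout(&fence->ready, abs_timeout))
         return false;
   }

   /* ... then wait on the actual winsys fence(s) as before ... */
   return true;
}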