From: Nicolai Hähnle <nicolai.haeh...@amd.com>

The idea is to fix the following interleaving of operations that
can arise from deferred fences:

  Thread 1 / Context 1             Thread 2 / Context 2
  --------------------             --------------------
  f = deferred flush
  <------- application-side synchronization ------->
                                   fence_server_sync(f)
                                   ...
                                   flush()
  flush()

We will now stall in fence_server_sync until the flush of context 1
has completed.

This scenario was unlikely to occur previously, because applications
seem to use the following pattern:

  Thread 1 / Context 1             Thread 2 / Context 2
  --------------------             --------------------
  f = glFenceSync()
  glFlush()
  <------- application-side synchronization ------->
                                   glWaitSync(f)
                                   ...

and indeed they probably *have* to use this ordering to avoid deadlocks
in the GLX model, where all GL operations conceptually go through a
single connection to the X server. However, it's less clear whether
applications have to do this with other WSI (e.g. EGL). Besides, even
this sequence of GL commands can be translated into the Gallium-level
sequence outlined above when Gallium threading and asynchronous flushes
are used. So it makes sense to be more robust.

As a side effect, we no longer busy-wait on submission_in_progress.

We won't enable asynchronous flushes on radeon, but add a
cs_add_fence_dependency stub anyway to document the potential issue.
---
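
For reference, here is the util_queue_fence protocol that the amdgpu
changes below rely on, in condensed form. This is an illustrative
sketch only; it assumes the semantics of src/util/u_queue.h, where
util_queue_fence_init() leaves the fence in the signalled state. That
is why amdgpu_fence_create() resets the fence right after initializing
it, while amdgpu_fence_import_sync_file() leaves it signalled (an
imported fence counts as already submitted):

   #include "util/u_queue.h"

   struct util_queue_fence submitted;

   /* Fence creation (amdgpu_fence_create): the IB has not been
    * submitted yet, so start out unsignalled. */
   util_queue_fence_init(&submitted);   /* initialized signalled */
   util_queue_fence_reset(&submitted);  /* unsignal: submission pending */

   /* Consumers (cs_add_fence_dependency, fence export, fence wait):
    * sleep until the producing context's flush has actually submitted
    * the IB, instead of busy-waiting on submission_in_progress. */
   util_queue_fence_wait(&submitted);

   /* Submit thread (amdgpu_fence_submitted / amdgpu_fence_signalled):
    * wake all waiters once the sequence number has been assigned. */
   util_queue_fence_signal(&submitted);
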
 src/gallium/drivers/radeon/radeon_winsys.h    |  4 +++-
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.c     | 21 +++++++++++++--------
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.h     |  9 ++++++---
 src/gallium/winsys/radeon/drm/radeon_drm_cs.c | 19 +++++++++++++++++++
 4 files changed, 41 insertions(+), 12 deletions(-)

diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
index 2d3f646dc65..e8c486cb7f4 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -536,21 +536,23 @@ struct radeon_winsys {
     * \return Negative POSIX error code or 0 for success.
     *         Asynchronous submissions never return an error.
     */
    int (*cs_flush)(struct radeon_winsys_cs *cs,
                    unsigned flags,
                    struct pipe_fence_handle **fence);
 
    /**
     * Create a fence before the CS is flushed.
     * The user must flush manually to complete the initializaton of the fence.
-    * The fence must not be used before the flush.
+    *
+    * The fence must not be used for anything except \ref cs_add_fence_dependency
+    * before the flush.
     */
    struct pipe_fence_handle *(*cs_get_next_fence)(struct radeon_winsys_cs *cs);
 
    /**
     * Return true if a buffer is referenced by a command stream.
     *
     * \param cs  A command stream.
     * \param buf A winsys buffer.
     */
    bool (*cs_is_buffer_referenced)(struct radeon_winsys_cs *cs,

diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index 0450ccc3596..0628e547351 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -43,21 +43,22 @@ amdgpu_fence_create(struct amdgpu_ctx *ctx, unsigned ip_type,
 {
    struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
 
    fence->reference.count = 1;
    fence->ws = ctx->ws;
    fence->ctx = ctx;
    fence->fence.context = ctx->ctx;
    fence->fence.ip_type = ip_type;
    fence->fence.ip_instance = ip_instance;
    fence->fence.ring = ring;
-   fence->submission_in_progress = true;
+   util_queue_fence_init(&fence->submitted);
+   util_queue_fence_reset(&fence->submitted);
    p_atomic_inc(&ctx->refcount);
    return (struct pipe_fence_handle *)fence;
 }
 
 static struct pipe_fence_handle *
 amdgpu_fence_import_sync_file(struct radeon_winsys *rws, int fd)
 {
    struct amdgpu_winsys *ws = amdgpu_winsys(rws);
    struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
@@ -74,66 +75,69 @@ amdgpu_fence_import_sync_file(struct radeon_winsys *rws, int fd)
       FREE(fence);
       return NULL;
    }
 
    r = amdgpu_cs_syncobj_import_sync_file(ws->dev, fence->syncobj, fd);
    if (r) {
       amdgpu_cs_destroy_syncobj(ws->dev, fence->syncobj);
       FREE(fence);
       return NULL;
    }
+
+   util_queue_fence_init(&fence->submitted);
+
    return (struct pipe_fence_handle*)fence;
 }
 
 static int amdgpu_fence_export_sync_file(struct radeon_winsys *rws,
                                          struct pipe_fence_handle *pfence)
 {
    struct amdgpu_winsys *ws = amdgpu_winsys(rws);
    struct amdgpu_fence *fence = (struct amdgpu_fence*)pfence;
 
    if (amdgpu_fence_is_syncobj(fence)) {
       int fd, r;
 
       /* Convert syncobj into sync_file. */
       r = amdgpu_cs_syncobj_export_sync_file(ws->dev, fence->syncobj, &fd);
       return r ? -1 : fd;
    }
 
-   os_wait_until_zero(&fence->submission_in_progress, PIPE_TIMEOUT_INFINITE);
+   util_queue_fence_wait(&fence->submitted);
 
    /* Convert the amdgpu fence into a fence FD. */
    int fd;
    if (amdgpu_cs_fence_to_handle(ws->dev, &fence->fence,
                                  AMDGPU_FENCE_TO_HANDLE_GET_SYNC_FILE_FD,
                                  (uint32_t*)&fd))
       return -1;
 
    return fd;
 }
 
 static void amdgpu_fence_submitted(struct pipe_fence_handle *fence,
                                    uint64_t seq_no,
                                    uint64_t *user_fence_cpu_address)
 {
    struct amdgpu_fence *rfence = (struct amdgpu_fence*)fence;
 
    rfence->fence.fence = seq_no;
    rfence->user_fence_cpu_address = user_fence_cpu_address;
-   rfence->submission_in_progress = false;
+   util_queue_fence_signal(&rfence->submitted);
 }
 
 static void amdgpu_fence_signalled(struct pipe_fence_handle *fence)
 {
    struct amdgpu_fence *rfence = (struct amdgpu_fence*)fence;
 
    rfence->signalled = true;
-   rfence->submission_in_progress = false;
+   util_queue_fence_signal(&rfence->submitted);
 }
 
 bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout,
                        bool absolute)
 {
    struct amdgpu_fence *rfence = (struct amdgpu_fence*)fence;
    uint32_t expired;
    int64_t abs_timeout;
    uint64_t *user_fence_cpu;
    int r;
 
@@ -157,22 +161,21 @@ bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout,
    }
 
    if (absolute)
       abs_timeout = timeout;
    else
       abs_timeout = os_time_get_absolute_timeout(timeout);
 
    /* The fence might not have a number assigned if its IB is being
     * submitted in the other thread right now. Wait until the submission
     * is done. */
-   if (!os_wait_until_zero_abs_timeout(&rfence->submission_in_progress,
-                                       abs_timeout))
+   if (!util_queue_fence_wait_timeout(&rfence->submitted, abs_timeout))
       return false;
 
    user_fence_cpu = rfence->user_fence_cpu_address;
    if (user_fence_cpu) {
       if (*user_fence_cpu >= rfence->fence.fence) {
          rfence->signalled = true;
          return true;
       }
 
       /* No timeout, just query: no need for the ioctl. */
@@ -1022,20 +1025,22 @@ static bool is_noop_fence_dependency(struct amdgpu_cs *acs,
 
    return amdgpu_fence_wait((void *)fence, 0, false);
 }
 
 static void amdgpu_cs_add_fence_dependency(struct radeon_winsys_cs *rws,
                                            struct pipe_fence_handle *pfence)
 {
    struct amdgpu_cs *acs = amdgpu_cs(rws);
    struct amdgpu_cs_context *cs = acs->csc;
    struct amdgpu_fence *fence = (struct amdgpu_fence*)pfence;
 
+   util_queue_fence_wait(&fence->submitted);
+
    if (is_noop_fence_dependency(acs, fence))
       return;
 
    unsigned idx = add_fence_dependency_entry(cs);
    amdgpu_fence_reference(&cs->fence_dependencies[idx],
                           (struct pipe_fence_handle*)fence);
 }
 
 static void amdgpu_add_bo_fence_dependencies(struct amdgpu_cs *acs,
                                              struct amdgpu_cs_buffer *buffer)
@@ -1297,21 +1302,21 @@ bo_list_error:
 
       for (unsigned i = 0; i < num_dependencies; i++) {
          struct amdgpu_fence *fence =
            (struct amdgpu_fence*)cs->fence_dependencies[i];
 
         if (amdgpu_fence_is_syncobj(fence)) {
            num_syncobj_dependencies++;
            continue;
         }
 
-        assert(!fence->submission_in_progress);
+        assert(util_queue_fence_is_signalled(&fence->submitted));
         amdgpu_cs_chunk_fence_to_dep(&fence->fence, &dep_chunk[num++]);
      }
 
      chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_DEPENDENCIES;
      chunks[num_chunks].length_dw = sizeof(dep_chunk[0]) / 4 * num;
      chunks[num_chunks].chunk_data = (uintptr_t)dep_chunk;
      num_chunks++;
   }
 
   /* Syncobj dependencies. */
@@ -1320,21 +1325,21 @@ bo_list_error:
         alloca(num_syncobj_dependencies * sizeof(sem_chunk[0]));
      unsigned num = 0;
 
      for (unsigned i = 0; i < num_dependencies; i++) {
         struct amdgpu_fence *fence =
            (struct amdgpu_fence*)cs->fence_dependencies[i];
 
         if (!amdgpu_fence_is_syncobj(fence))
            continue;
 
-        assert(!fence->submission_in_progress);
+        assert(util_queue_fence_is_signalled(&fence->submitted));
         sem_chunk[num++].handle = fence->syncobj;
      }
 
      chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SYNCOBJ_IN;
      chunks[num_chunks].length_dw = sizeof(sem_chunk[0]) / 4 * num;
      chunks[num_chunks].chunk_data = (uintptr_t)sem_chunk;
      num_chunks++;
   }
 
   assert(num_chunks <= ARRAY_SIZE(chunks));

diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
index b7bc9a20bbf..fbf44b36610 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h
@@ -137,23 +137,25 @@ struct amdgpu_cs {
 
 struct amdgpu_fence {
    struct pipe_reference reference;
    /* If ctx == NULL, this fence is syncobj-based. */
    uint32_t syncobj;
 
    struct amdgpu_winsys *ws;
    struct amdgpu_ctx *ctx;  /* submission context */
    struct amdgpu_cs_fence fence;
    uint64_t *user_fence_cpu_address;
 
-   /* If the fence is unknown due to an IB still being submitted
-    * in the other thread. */
-   volatile int submission_in_progress; /* bool (int for atomicity) */
+   /* If the fence has been submitted. This is unsignalled for deferred fences
+    * (cs->next_fence) and while an IB is still being submitted in the submit
+    * thread. */
+   struct util_queue_fence submitted;
+
    volatile int signalled;  /* bool (int for atomicity) */
 };
 
 static inline bool amdgpu_fence_is_syncobj(struct amdgpu_fence *fence)
 {
    return fence->ctx == NULL;
 }
 
 static inline void amdgpu_ctx_unref(struct amdgpu_ctx *ctx)
 {
@@ -171,20 +173,21 @@ static inline void amdgpu_fence_reference(struct pipe_fence_handle **dst,
    struct amdgpu_fence *rsrc = (struct amdgpu_fence *)src;
 
    if (pipe_reference(&(*rdst)->reference, &rsrc->reference)) {
       struct amdgpu_fence *fence = *rdst;
 
       if (amdgpu_fence_is_syncobj(fence))
          amdgpu_cs_destroy_syncobj(fence->ws->dev, fence->syncobj);
       else
          amdgpu_ctx_unref(fence->ctx);
 
+      util_queue_fence_destroy(&fence->submitted);
       FREE(fence);
    }
    *rdst = rsrc;
 }
 
 int amdgpu_lookup_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo);
 
 static inline struct amdgpu_ib *
 amdgpu_ib(struct radeon_winsys_cs *base)
 {

diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index 7220f3a0240..add88f80aae 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -779,28 +779,47 @@ radeon_drm_cs_get_next_fence(struct radeon_winsys_cs *rcs)
    }
 
    fence = radeon_cs_create_fence(rcs);
    if (!fence)
       return NULL;
 
    radeon_fence_reference(&cs->next_fence, fence);
    return fence;
 }
 
+static void
+radeon_drm_cs_add_fence_dependency(struct radeon_winsys_cs *cs,
+                                   struct pipe_fence_handle *fence)
+{
+    /* TODO: Handle the following unlikely multi-threaded scenario:
+     *
+     *  Thread 1 / Context 1              Thread 2 / Context 2
+     *  --------------------              --------------------
+     *  f = cs_get_next_fence()
+     *                                    cs_add_fence_dependency(f)
+     *                                    cs_flush()
+     *  cs_flush()
+     *
+     * We currently assume that this does not happen because we don't support
+     * asynchronous flushes on Radeon.
+     */
+}
+
 void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
 {
     ws->base.ctx_create = radeon_drm_ctx_create;
     ws->base.ctx_destroy = radeon_drm_ctx_destroy;
     ws->base.cs_create = radeon_drm_cs_create;
     ws->base.cs_destroy = radeon_drm_cs_destroy;
     ws->base.cs_add_buffer = radeon_drm_cs_add_buffer;
     ws->base.cs_lookup_buffer = radeon_drm_cs_lookup_buffer;
     ws->base.cs_validate = radeon_drm_cs_validate;
     ws->base.cs_check_space = radeon_drm_cs_check_space;
     ws->base.cs_get_buffer_list = radeon_drm_cs_get_buffer_list;
     ws->base.cs_flush = radeon_drm_cs_flush;
     ws->base.cs_get_next_fence = radeon_drm_cs_get_next_fence;
     ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
     ws->base.cs_sync_flush = radeon_drm_cs_sync_flush;
+    ws->base.cs_add_fence_dependency = radeon_drm_cs_add_fence_dependency;
     ws->base.fence_wait = radeon_fence_wait;
     ws->base.fence_reference = radeon_fence_reference;
 }
-- 
2.11.0

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev