[Mesa-dev] [PATCH 6/7] radeonsi: use ready fences on all shaders, not just optimized ones

Nicolai Hähnle Sun, 22 Oct 2017 11:34:46 -0700

From: Nicolai Hähnle <nicolai.haeh...@amd.com>

There's a race condition between si_shader_select_with_key and
si_bind_XX_shader:


  Thread 1                         Thread 2
  --------                         --------
  si_shader_select_with_key
    begin compiling the first
    variant
    (guarded by sel->mutex)
                                   si_bind_XX_shader
                                     select first_variant by default
                                     as state->current
                                   si_shader_select_with_key
                                     match state->current and early-out

Since thread 2 never takes sel->mutex, it may go on rendering without a
PM4 for that shader, for example.

The solution taken by this patch is to broaden the scope of
shader->optimized_ready to a fence shader->ready that applies to
all shaders. This does not hurt the fast path (if anything it makes
it faster, because we don't explicitly check is_optimized).

It will also allow reducing the scope of sel->mutex locks, but this is
deferred to a later commit for better bisectability.

Fixes 
dEQP-EGL.functional.sharing.gles2.multithread.simple.buffers.bufferdata_render
---
 src/gallium/drivers/radeonsi/si_shader.c        |  3 +
 src/gallium/drivers/radeonsi/si_shader.h        |  2 +-
 src/gallium/drivers/radeonsi/si_state_shaders.c | 88 ++++++++++++++++++-------
 3 files changed, 67 insertions(+), 26 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c 
b/src/gallium/drivers/radeonsi/si_shader.c
index c3fe13deeaa..2eee8de54f2 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -5402,20 +5402,23 @@ si_generate_gs_copy_shader(struct si_screen *sscreen,
 
        if (!outputs)
                return NULL;
 
        shader = CALLOC_STRUCT(si_shader);
        if (!shader) {
                FREE(outputs);
                return NULL;
        }
 
+       /* We can leave the fence as permanently signaled because the GS copy
+        * shader only becomes visible globally after it has been compiled. */
+       util_queue_fence_init(&shader->ready);
 
        shader->selector = gs_selector;
        shader->is_gs_copy_shader = true;
 
        si_init_shader_ctx(&ctx, sscreen, tm);
        ctx.shader = shader;
        ctx.type = PIPE_SHADER_VERTEX;
 
        builder = ctx.ac.builder;
 
diff --git a/src/gallium/drivers/radeonsi/si_shader.h 
b/src/gallium/drivers/radeonsi/si_shader.h
index ebe956e709e..aec71eb07d3 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -590,21 +590,21 @@ struct si_shader {
 
        struct si_shader_part           *prolog;
        struct si_shader                *previous_stage; /* for GFX9 */
        struct si_shader_part           *prolog2;
        struct si_shader_part           *epilog;
 
        struct si_pm4_state             *pm4;
        struct r600_resource            *bo;
        struct r600_resource            *scratch_bo;
        struct si_shader_key            key;
-       struct util_queue_fence         optimized_ready;
+       struct util_queue_fence         ready;
        bool                            compilation_failed;
        bool                            is_monolithic;
        bool                            is_optimized;
        bool                            is_binary_shared;
        bool                            is_gs_copy_shader;
 
        /* The following data is all that's needed for binary shaders. */
        struct ac_shader_binary binary;
        struct si_shader_config         config;
        struct si_shader_info           info;
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c 
b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 9340328a72a..8c589928b8c 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -1549,20 +1549,25 @@ static bool si_check_missing_main_part(struct si_screen 
*sscreen,
                                       struct si_shader_key *key)
 {
        struct si_shader **mainp = si_get_main_shader_part(sel, key);
 
        if (!*mainp) {
                struct si_shader *main_part = CALLOC_STRUCT(si_shader);
 
                if (!main_part)
                        return false;
 
+               /* We can leave the fence as permanently signaled because the
+                * main part becomes visible globally only after it has been
+                * compiled. */
+               util_queue_fence_init(&main_part->ready);
+
                main_part->selector = sel;
                main_part->key.as_es = key->as_es;
                main_part->key.as_ls = key->as_ls;
 
                if (si_compile_tgsi_shader(sscreen, compiler_state->tm,
                                           main_part, false,
                                           &compiler_state->debug) != 0) {
                        FREE(main_part);
                        return false;
                }
@@ -1582,70 +1587,85 @@ static int si_shader_select_with_key(struct si_screen 
*sscreen,
        struct si_shader_selector *previous_stage_sel = NULL;
        struct si_shader *current = state->current;
        struct si_shader *iter, *shader = NULL;
 
 again:
        /* Check if we don't need to change anything.
         * This path is also used for most shaders that don't need multiple
         * variants, it will cost just a computation of the key and this
         * test. */
        if (likely(current &&
-                  memcmp(&current->key, key, sizeof(*key)) == 0 &&
-                  (!current->is_optimized ||
-                   util_queue_fence_is_signalled(&current->optimized_ready))))
+                  memcmp(&current->key, key, sizeof(*key)) == 0)) {
+               if (unlikely(!util_queue_fence_is_signalled(&current->ready))) {
+                       if (current->is_optimized) {
+                               memset(&key->opt, 0, sizeof(key->opt));
+                               goto current_not_ready;
+                       }
+
+                       util_queue_fence_wait(&current->ready);
+               }
+
                return current->compilation_failed ? -1 : 0;
+       }
+current_not_ready:
 
        /* This must be done before the mutex is locked, because async GS
         * compilation calls this function too, and therefore must enter
         * the mutex first.
         *
         * Only wait if we are in a draw call. Don't wait if we are
         * in a compiler thread.
         */
        if (thread_index < 0)
                util_queue_fence_wait(&sel->ready);
 
        mtx_lock(&sel->mutex);
 
        /* Find the shader variant. */
        for (iter = sel->first_variant; iter; iter = iter->next_variant) {
                /* Don't check the "current" shader. We checked it above. */
                if (current != iter &&
                    memcmp(&iter->key, key, sizeof(*key)) == 0) {
-                       /* If it's an optimized shader and its compilation has
-                        * been started but isn't done, use the unoptimized
-                        * shader so as not to cause a stall due to compilation.
-                        */
-                       if (iter->is_optimized &&
-                           
!util_queue_fence_is_signalled(&iter->optimized_ready)) {
-                               memset(&key->opt, 0, sizeof(key->opt));
-                               mtx_unlock(&sel->mutex);
-                               goto again;
+                       if 
(unlikely(!util_queue_fence_is_signalled(&iter->ready))) {
+                               /* If it's an optimized shader and its 
compilation has
+                                * been started but isn't done, use the 
unoptimized
+                                * shader so as not to cause a stall due to 
compilation.
+                                */
+                               if (iter->is_optimized) {
+                                       memset(&key->opt, 0, sizeof(key->opt));
+                                       mtx_unlock(&sel->mutex);
+                                       goto again;
+                               }
+
+                               util_queue_fence_wait(&iter->ready);
                        }
 
                        if (iter->compilation_failed) {
                                mtx_unlock(&sel->mutex);
                                return -1; /* skip the draw call */
                        }
 
                        state->current = iter;
                        mtx_unlock(&sel->mutex);
                        return 0;
                }
        }
 
        /* Build a new shader. */
        shader = CALLOC_STRUCT(si_shader);
        if (!shader) {
                mtx_unlock(&sel->mutex);
                return -ENOMEM;
        }
+
+       util_queue_fence_init(&shader->ready);
+
        shader->selector = sel;
        shader->key = *key;
        shader->compiler_ctx_state = *compiler_state;
 
        /* If this is a merged shader, get the first shader's selector. */
        if (sscreen->b.chip_class >= GFX9) {
                if (sel->type == PIPE_SHADER_TESS_CTRL)
                        previous_stage_sel = key->part.tcs.ls;
                else if (sel->type == PIPE_SHADER_GEOMETRY)
                        previous_stage_sel = key->part.gs.es;
@@ -1708,49 +1728,62 @@ again:
 
        /* Monolithic-only shaders don't make a distinction between optimized
         * and unoptimized. */
        shader->is_monolithic =
                is_pure_monolithic ||
                memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0;
 
        shader->is_optimized =
                !is_pure_monolithic &&
                memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0;
-       if (shader->is_optimized)
-               util_queue_fence_init(&shader->optimized_ready);
-
-       if (!sel->last_variant) {
-               sel->first_variant = shader;
-               sel->last_variant = shader;
-       } else {
-               sel->last_variant->next_variant = shader;
-               sel->last_variant = shader;
-       }
 
        /* If it's an optimized shader, compile it asynchronously. */
        if (shader->is_optimized &&
            !is_pure_monolithic &&
            thread_index < 0) {
                /* Compile it asynchronously. */
                util_queue_add_job(&sscreen->shader_compiler_queue_low_priority,
-                                  shader, &shader->optimized_ready,
+                                  shader, &shader->ready,
                                   si_build_shader_variant_low_priority, NULL);
 
+               /* Add only after the ready fence was reset, to guard against a
+                * race with si_bind_XX_shader. */
+               if (!sel->last_variant) {
+                       sel->first_variant = shader;
+                       sel->last_variant = shader;
+               } else {
+                       sel->last_variant->next_variant = shader;
+                       sel->last_variant = shader;
+               }
+
                /* Use the default (unoptimized) shader for now. */
                memset(&key->opt, 0, sizeof(key->opt));
                mtx_unlock(&sel->mutex);
                goto again;
        }
 
+       /* Reset the fence before adding to the variant list. */
+       util_queue_fence_reset(&shader->ready);
+
+       if (!sel->last_variant) {
+               sel->first_variant = shader;
+               sel->last_variant = shader;
+       } else {
+               sel->last_variant->next_variant = shader;
+               sel->last_variant = shader;
+       }
+
        assert(!shader->is_optimized);
        si_build_shader_variant(shader, thread_index, false);
 
+       util_queue_fence_signal(&shader->ready);
+
        if (!shader->compilation_failed)
                state->current = shader;
 
        mtx_unlock(&sel->mutex);
        return shader->compilation_failed ? -1 : 0;
 }
 
 static int si_shader_select(struct pipe_context *ctx,
                            struct si_shader_ctx_state *state,
                            struct si_compiler_ctx_state *compiler_state)
@@ -1826,20 +1859,24 @@ static void si_init_shader_selector_async(void *job, 
int thread_index)
         */
        if (!sscreen->use_monolithic_shaders) {
                struct si_shader *shader = CALLOC_STRUCT(si_shader);
                void *tgsi_binary = NULL;
 
                if (!shader) {
                        fprintf(stderr, "radeonsi: can't allocate a main shader 
part\n");
                        return;
                }
 
+               /* We can leave the fence signaled because use of the default
+                * main part is guarded by the selector's ready fence. */
+               util_queue_fence_init(&shader->ready);
+
                shader->selector = sel;
                si_parse_next_shader_property(&sel->info,
                                              sel->so.num_outputs != 0,
                                              &shader->key);
 
                if (sel->tokens)
                        tgsi_binary = si_get_tgsi_binary(sel);
 
                /* Try to load the shader from the shader cache. */
                mtx_lock(&sscreen->shader_cache_mutex);
@@ -2439,24 +2476,25 @@ static void si_bind_ps_shader(struct pipe_context *ctx, 
void *state)
                     
sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL]))
                        si_mark_atom_dirty(sctx, &sctx->msaa_config);
        }
        si_set_active_descriptors_for_shader(sctx, sel);
 }
 
 static void si_delete_shader(struct si_context *sctx, struct si_shader *shader)
 {
        if (shader->is_optimized) {
                
util_queue_drop_job(&sctx->screen->shader_compiler_queue_low_priority,
-                                   &shader->optimized_ready);
-               util_queue_fence_destroy(&shader->optimized_ready);
+                                   &shader->ready);
        }
 
+       util_queue_fence_destroy(&shader->ready);
+
        if (shader->pm4) {
                switch (shader->selector->type) {
                case PIPE_SHADER_VERTEX:
                        if (shader->key.as_ls) {
                                assert(sctx->b.chip_class <= VI);
                                si_pm4_delete_state(sctx, ls, shader->pm4);
                        } else if (shader->key.as_es) {
                                assert(sctx->b.chip_class <= VI);
                                si_pm4_delete_state(sctx, es, shader->pm4);
                        } else {
-- 
2.11.0

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 6/7] radeonsi: use ready fences on all shaders, not just optimized ones

Reply via email to