From: Marek Olšák <marek.ol...@amd.com> In addition to the non-monolithic variant. --- src/gallium/drivers/radeonsi/si_shader.h | 10 +++++++++- src/gallium/drivers/radeonsi/si_state_shaders.c | 19 +++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-)
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index e0227e4..4fb79e6 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -438,26 +438,34 @@ struct si_shader_key { unsigned as_ls:1; /* local shader, which precedes TCS */ /* Flags for monolithic compilation only. */ struct { /* One byte for every input: SI_FIX_FETCH_* enums. */ uint8_t vs_fix_fetch[SI_MAX_ATTRIBS]; uint64_t ff_tcs_inputs_to_copy; /* for fixed-func TCS */ } mono; /* Optimization flags for asynchronous compilation only. */ - union { + struct { struct { uint64_t kill_outputs; /* "get_unique_index" bits */ uint32_t kill_outputs2; /* "get_unique_index2" bits */ unsigned clip_disable:1; } hw_vs; /* HW VS (it can be VS, TES, GS) */ + + /* For shaders where monolithic variants have better code. + * + * This is a flag that has no effect on code generation, + * but forces monolithic shaders to be used as soon as + * possible, because it's in the "opt" group. + */ + unsigned prefer_mono:1; } opt; }; struct si_shader_config { unsigned num_sgprs; unsigned num_vgprs; unsigned spilled_sgprs; unsigned spilled_vgprs; unsigned private_mem_vgprs; unsigned lds_size; diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 6bb3f50..22bf3cf 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1271,20 +1271,39 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, break; case PIPE_SHADER_GEOMETRY: if (sctx->b.chip_class >= GFX9) { if (sctx->tes_shader.cso) { key->part.gs.es = sctx->tes_shader.cso; } else { si_shader_selector_key_vs(sctx, sctx->vs_shader.cso, key, &key->part.gs.vs_prolog); key->part.gs.es = sctx->vs_shader.cso; } + + /* Merged ES-GS can have unbalanced wave usage. + * + * ES threads are per-vertex, while GS threads are + * per-primitive. So without any amplification, there + * are fewer GS threads than ES threads, which can result + * in empty (no-op) GS waves. With too much amplification, + * there are more GS threads than ES threads, which + * can result in empty (no-op) ES waves. + * + * Non-monolithic shaders are implemented by setting EXEC + * at the beginning of shader parts, and don't jump to + * the end if EXEC is 0. + * + * Monolithic shaders use conditional blocks, so they can + * jump and skip empty waves of ES or GS. So set this to + * always use optimized variants, which are monolithic. + */ + key->opt.prefer_mono = 1; } key->part.gs.prolog.tri_strip_adj_fix = sctx->gs_tri_strip_adj_fix; break; case PIPE_SHADER_FRAGMENT: { struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; struct si_state_blend *blend = sctx->queued.named.blend; if (sel->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] && sel->info.colors_written == 0x1) key->part.ps.epilog.last_cbuf = MAX2(sctx->framebuffer.state.nr_cbufs, 1) - 1; -- 2.7.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev