From: Nicolai Hähnle <nicolai.haeh...@amd.com>

With merged ESGS shaders, the GS part of a wave may be empty, and the hardware gets confused if any GS messages are sent from that wave. Since S_SENDMSG is executed even when EXEC = 0, we have to wrap even non-monolithic GS shaders in an if-block, so that the entire shader, and hence every S_SENDMSG instruction, is skipped in empty waves.
This change is not required for TCS/HS, but applying it there as well simplifies the code a bit. Fixes GL45-CTS.geometry_shader.rendering.rendering.* Cc: mesa-sta...@lists.freedesktop.org --- src/gallium/drivers/radeonsi/si_shader.c | 74 +++++++++++++---------- src/gallium/drivers/radeonsi/si_shader_internal.h | 3 + 2 files changed, 45 insertions(+), 32 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 7a44e61..9aeda49 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -2713,20 +2713,23 @@ si_insert_input_ptr_as_2xi32(struct si_shader_context *ctx, LLVMValueRef ret, } /* This only writes the tessellation factor levels. */ static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base) { struct si_shader_context *ctx = si_shader_context(bld_base); LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset; si_copy_tcs_inputs(bld_base); + if (ctx->screen->b.chip_class >= GFX9) + lp_build_endif(&ctx->merged_wrap_if_state); + rel_patch_id = get_rel_patch_id(ctx); invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5); tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx); /* Return epilog parameters from this function. 
*/ LLVMBuilderRef builder = ctx->gallivm.builder; LLVMValueRef ret = ctx->return_value; unsigned vgpr; if (ctx->screen->b.chip_class >= GFX9) { @@ -2946,20 +2949,23 @@ static LLVMValueRef si_get_gs_wave_id(struct si_shader_context *ctx) else return LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id); } static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base) { struct si_shader_context *ctx = si_shader_context(bld_base); ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, si_get_gs_wave_id(ctx)); + + if (ctx->screen->b.chip_class >= GFX9) + lp_build_endif(&ctx->merged_wrap_if_state); } static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base) { struct si_shader_context *ctx = si_shader_context(bld_base); struct gallivm_state *gallivm = &ctx->gallivm; struct tgsi_shader_info *info = &ctx->shader->selector->info; struct si_shader_output_values *outputs = NULL; int i,j; @@ -5523,39 +5529,55 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx, break; default: assert(!"Unsupported shader type"); return false; } create_function(ctx); preload_ring_buffers(ctx); /* For GFX9 merged shaders: - * - Set EXEC. If the prolog is present, set EXEC there instead. + * - Set EXEC for the first shader. If the prolog is present, set + * EXEC there instead. * - Add a barrier before the second shader. + * - In the second shader, reset EXEC to ~0 and wrap the main part in + * an if-statement. This is required for correctness in geometry + * shaders, to ensure that empty GS waves do not send GS_EMIT and + * GS_CUT messages. * - * The same thing for monolithic shaders is done in - * si_build_wrapper_function. + * For monolithic merged shaders, the first shader is wrapped in an + * if-block together with its prolog in si_build_wrapper_function. 
*/ - if (ctx->screen->b.chip_class >= GFX9 && !is_monolithic) { - if (sel->info.num_instructions > 1 && /* not empty shader */ + if (ctx->screen->b.chip_class >= GFX9) { + if (!is_monolithic && + sel->info.num_instructions > 1 && /* not empty shader */ (shader->key.as_es || shader->key.as_ls) && (ctx->type == PIPE_SHADER_TESS_EVAL || (ctx->type == PIPE_SHADER_VERTEX && !sel->vs_needs_prolog))) { si_init_exec_from_input(ctx, ctx->param_merged_wave_info, 0); } else if (ctx->type == PIPE_SHADER_TESS_CTRL || ctx->type == PIPE_SHADER_GEOMETRY) { - si_init_exec_from_input(ctx, - ctx->param_merged_wave_info, 8); + if (!is_monolithic) + si_init_exec_full_mask(ctx); + + /* The barrier must execute for all shaders in a + * threadgroup. + */ si_llvm_emit_barrier(NULL, bld_base, NULL); + + LLVMValueRef num_threads = unpack_param(ctx, ctx->param_merged_wave_info, 8, 8); + LLVMValueRef ena = + LLVMBuildICmp(ctx->ac.builder, LLVMIntULT, + ac_get_thread_id(&ctx->ac), num_threads, ""); + lp_build_if(&ctx->merged_wrap_if_state, &ctx->gallivm, ena); } } if (ctx->type == PIPE_SHADER_GEOMETRY) { int i; for (i = 0; i < 4; i++) { ctx->gs_next_vertex[i] = lp_build_alloca(&ctx->gallivm, ctx->i32, ""); } @@ -6012,29 +6034,23 @@ static void si_build_wrapper_function(struct si_shader_context *ctx, LLVMValueRef in[48]; LLVMValueRef ret; LLVMTypeRef ret_type; unsigned out_idx = 0; num_params = LLVMCountParams(parts[part]); assert(num_params <= ARRAY_SIZE(param_types)); /* Merged shaders are executed conditionally depending * on the number of enabled threads passed in the input SGPRs. */ - if (is_merged_shader(ctx->shader) && - (part == 0 || part == next_shader_first_part)) { + if (is_merged_shader(ctx->shader) && part == 0) { LLVMValueRef ena, count = initial[3]; - /* The thread count for the 2nd shader is at bit-offset 8. 
*/ - if (part == next_shader_first_part) { - count = LLVMBuildLShr(builder, count, - LLVMConstInt(ctx->i32, 8, 0), ""); - } count = LLVMBuildAnd(builder, count, LLVMConstInt(ctx->i32, 0x7f, 0), ""); ena = LLVMBuildICmp(builder, LLVMIntULT, ac_get_thread_id(&ctx->ac), count, ""); lp_build_if(&if_state, &ctx->gallivm, ena); } /* Derive arguments for the next part from outputs of the * previous one. */ @@ -6077,40 +6093,34 @@ static void si_build_wrapper_function(struct si_shader_context *ctx, } } in[param_idx] = arg; out_idx += param_size; } ret = LLVMBuildCall(builder, parts[part], in, num_params, ""); if (is_merged_shader(ctx->shader) && - (part + 1 == next_shader_first_part || - part + 1 == num_parts)) { + part + 1 == next_shader_first_part) { lp_build_endif(&if_state); - if (part + 1 == next_shader_first_part) { - /* A barrier is required between 2 merged shaders. */ - si_llvm_emit_barrier(NULL, &ctx->bld_base, NULL); - - /* The second half of the merged shader should use - * the inputs from the toplevel (wrapper) function, - * not the return value from the last call. - * - * That's because the last call was executed condi- - * tionally, so we can't consume it in the main - * block. - */ - memcpy(out, initial, sizeof(initial)); - num_out = initial_num_out; - num_out_sgpr = initial_num_out_sgpr; - } + /* The second half of the merged shader should use + * the inputs from the toplevel (wrapper) function, + * not the return value from the last call. + * + * That's because the last call was executed condi- + * tionally, so we can't consume it in the main + * block. + */ + memcpy(out, initial, sizeof(initial)); + num_out = initial_num_out; + num_out_sgpr = initial_num_out_sgpr; continue; } /* Extract the returned GPRs. 
*/ ret_type = LLVMTypeOf(ret); num_out = 0; num_out_sgpr = 0; if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) { assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind); diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h index 3556e69..3f127cf 100644 --- a/src/gallium/drivers/radeonsi/si_shader_internal.h +++ b/src/gallium/drivers/radeonsi/si_shader_internal.h @@ -18,20 +18,21 @@ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE * USE OR OTHER DEALINGS IN THE SOFTWARE. */ #ifndef SI_SHADER_PRIVATE_H #define SI_SHADER_PRIVATE_H #include "si_shader.h" +#include "gallivm/lp_bld_flow.h" #include "gallivm/lp_bld_init.h" #include "gallivm/lp_bld_tgsi.h" #include "tgsi/tgsi_parse.h" #include "ac_llvm_util.h" #include "ac_llvm_build.h" #include <llvm-c/Core.h> #include <llvm-c/TargetMachine.h> struct pipe_debug_callback; @@ -98,20 +99,22 @@ struct si_shader_context { unsigned temps_count; LLVMValueRef system_values[RADEON_LLVM_MAX_SYSTEM_VALUES]; LLVMValueRef *imms; unsigned imms_num; struct si_llvm_flow *flow; unsigned flow_depth; unsigned flow_depth_max; + struct lp_build_if_state merged_wrap_if_state; + struct tgsi_array_info *temp_arrays; LLVMValueRef *temp_array_allocas; LLVMValueRef undef_alloca; LLVMValueRef main_fn; LLVMTypeRef return_type; /* Parameter indices for LLVMGetParam. */ int param_rw_buffers; -- 2.9.3 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev