From: Nicolai Hähnle <nicolai.haeh...@amd.com>

With merged ESGS shaders, the GS part of a wave may be empty, and the
hardware gets confused if any GS messages are sent from that wave. Since
S_SENDMSG is executed even when EXEC = 0, we have to wrap even
non-monolithic GS shaders in an if-block, so that the entire shader and
hence the S_SENDMSG instructions are skipped in empty waves.

This change is not required for TCS/HS, but applying it there as well
simplifies the code a bit.

Fixes GL45-CTS.geometry_shader.rendering.rendering.*

Cc: mesa-sta...@lists.freedesktop.org
---
 src/gallium/drivers/radeonsi/si_shader.c          | 74 +++++++++++++----------
 src/gallium/drivers/radeonsi/si_shader_internal.h |  3 +
 2 files changed, 45 insertions(+), 32 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c 
b/src/gallium/drivers/radeonsi/si_shader.c
index 7a44e61..9aeda49 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2713,20 +2713,23 @@ si_insert_input_ptr_as_2xi32(struct si_shader_context 
*ctx, LLVMValueRef ret,
 }
 
 /* This only writes the tessellation factor levels. */
 static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base)
 {
        struct si_shader_context *ctx = si_shader_context(bld_base);
        LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;
 
        si_copy_tcs_inputs(bld_base);
 
+       if (ctx->screen->b.chip_class >= GFX9)
+               lp_build_endif(&ctx->merged_wrap_if_state);
+
        rel_patch_id = get_rel_patch_id(ctx);
        invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids, 8, 5);
        tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);
 
        /* Return epilog parameters from this function. */
        LLVMBuilderRef builder = ctx->gallivm.builder;
        LLVMValueRef ret = ctx->return_value;
        unsigned vgpr;
 
        if (ctx->screen->b.chip_class >= GFX9) {
@@ -2946,20 +2949,23 @@ static LLVMValueRef si_get_gs_wave_id(struct 
si_shader_context *ctx)
        else
                return LLVMGetParam(ctx->main_fn, ctx->param_gs_wave_id);
 }
 
 static void si_llvm_emit_gs_epilogue(struct lp_build_tgsi_context *bld_base)
 {
        struct si_shader_context *ctx = si_shader_context(bld_base);
 
        ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE,
                         si_get_gs_wave_id(ctx));
+
+       if (ctx->screen->b.chip_class >= GFX9)
+               lp_build_endif(&ctx->merged_wrap_if_state);
 }
 
 static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context *bld_base)
 {
        struct si_shader_context *ctx = si_shader_context(bld_base);
        struct gallivm_state *gallivm = &ctx->gallivm;
        struct tgsi_shader_info *info = &ctx->shader->selector->info;
        struct si_shader_output_values *outputs = NULL;
        int i,j;
 
@@ -5523,39 +5529,55 @@ static bool si_compile_tgsi_main(struct 
si_shader_context *ctx,
                break;
        default:
                assert(!"Unsupported shader type");
                return false;
        }
 
        create_function(ctx);
        preload_ring_buffers(ctx);
 
        /* For GFX9 merged shaders:
-        * - Set EXEC. If the prolog is present, set EXEC there instead.
+        * - Set EXEC for the first shader. If the prolog is present, set
+        *   EXEC there instead.
         * - Add a barrier before the second shader.
+        * - In the second shader, reset EXEC to ~0 and wrap the main part in
+        *   an if-statement. This is required for correctness in geometry
+        *   shaders, to ensure that empty GS waves do not send GS_EMIT and
+        *   GS_CUT messages.
         *
-        * The same thing for monolithic shaders is done in
-        * si_build_wrapper_function.
+        * For monolithic merged shaders, the first shader is wrapped in an
+        * if-block together with its prolog in si_build_wrapper_function.
         */
-       if (ctx->screen->b.chip_class >= GFX9 && !is_monolithic) {
-               if (sel->info.num_instructions > 1 && /* not empty shader */
+       if (ctx->screen->b.chip_class >= GFX9) {
+               if (!is_monolithic &&
+                   sel->info.num_instructions > 1 && /* not empty shader */
                    (shader->key.as_es || shader->key.as_ls) &&
                    (ctx->type == PIPE_SHADER_TESS_EVAL ||
                     (ctx->type == PIPE_SHADER_VERTEX &&
                      !sel->vs_needs_prolog))) {
                        si_init_exec_from_input(ctx,
                                                ctx->param_merged_wave_info, 0);
                } else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
                           ctx->type == PIPE_SHADER_GEOMETRY) {
-                       si_init_exec_from_input(ctx,
-                                               ctx->param_merged_wave_info, 8);
+                       if (!is_monolithic)
+                               si_init_exec_full_mask(ctx);
+
+                       /* The barrier must execute for all shaders in a
+                        * threadgroup.
+                        */
                        si_llvm_emit_barrier(NULL, bld_base, NULL);
+
+                       LLVMValueRef num_threads = unpack_param(ctx, 
ctx->param_merged_wave_info, 8, 8);
+                       LLVMValueRef ena =
+                               LLVMBuildICmp(ctx->ac.builder, LLVMIntULT,
+                                           ac_get_thread_id(&ctx->ac), 
num_threads, "");
+                       lp_build_if(&ctx->merged_wrap_if_state, &ctx->gallivm, 
ena);
                }
        }
 
        if (ctx->type == PIPE_SHADER_GEOMETRY) {
                int i;
                for (i = 0; i < 4; i++) {
                        ctx->gs_next_vertex[i] =
                                lp_build_alloca(&ctx->gallivm,
                                                ctx->i32, "");
                }
@@ -6012,29 +6034,23 @@ static void si_build_wrapper_function(struct 
si_shader_context *ctx,
                LLVMValueRef in[48];
                LLVMValueRef ret;
                LLVMTypeRef ret_type;
                unsigned out_idx = 0;
 
                num_params = LLVMCountParams(parts[part]);
                assert(num_params <= ARRAY_SIZE(param_types));
 
                /* Merged shaders are executed conditionally depending
                 * on the number of enabled threads passed in the input SGPRs. 
*/
-               if (is_merged_shader(ctx->shader) &&
-                   (part == 0 || part == next_shader_first_part)) {
+               if (is_merged_shader(ctx->shader) && part == 0) {
                        LLVMValueRef ena, count = initial[3];
 
-                       /* The thread count for the 2nd shader is at bit-offset 
8. */
-                       if (part == next_shader_first_part) {
-                               count = LLVMBuildLShr(builder, count,
-                                                     LLVMConstInt(ctx->i32, 8, 
0), "");
-                       }
                        count = LLVMBuildAnd(builder, count,
                                             LLVMConstInt(ctx->i32, 0x7f, 0), 
"");
                        ena = LLVMBuildICmp(builder, LLVMIntULT,
                                            ac_get_thread_id(&ctx->ac), count, 
"");
                        lp_build_if(&if_state, &ctx->gallivm, ena);
                }
 
                /* Derive arguments for the next part from outputs of the
                 * previous one.
                 */
@@ -6077,40 +6093,34 @@ static void si_build_wrapper_function(struct 
si_shader_context *ctx,
                                }
                        }
 
                        in[param_idx] = arg;
                        out_idx += param_size;
                }
 
                ret = LLVMBuildCall(builder, parts[part], in, num_params, "");
 
                if (is_merged_shader(ctx->shader) &&
-                   (part + 1 == next_shader_first_part ||
-                    part + 1 == num_parts)) {
+                   part + 1 == next_shader_first_part) {
                        lp_build_endif(&if_state);
 
-                       if (part + 1 == next_shader_first_part) {
-                               /* A barrier is required between 2 merged 
shaders. */
-                               si_llvm_emit_barrier(NULL, &ctx->bld_base, 
NULL);
-
-                               /* The second half of the merged shader should 
use
-                                * the inputs from the toplevel (wrapper) 
function,
-                                * not the return value from the last call.
-                                *
-                                * That's because the last call was executed 
condi-
-                                * tionally, so we can't consume it in the main
-                                * block.
-                                */
-                               memcpy(out, initial, sizeof(initial));
-                               num_out = initial_num_out;
-                               num_out_sgpr = initial_num_out_sgpr;
-                       }
+                       /* The second half of the merged shader should use
+                        * the inputs from the toplevel (wrapper) function,
+                        * not the return value from the last call.
+                        *
+                        * That's because the last call was executed condi-
+                        * tionally, so we can't consume it in the main
+                        * block.
+                        */
+                       memcpy(out, initial, sizeof(initial));
+                       num_out = initial_num_out;
+                       num_out_sgpr = initial_num_out_sgpr;
                        continue;
                }
 
                /* Extract the returned GPRs. */
                ret_type = LLVMTypeOf(ret);
                num_out = 0;
                num_out_sgpr = 0;
 
                if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
                        assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);
diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h 
b/src/gallium/drivers/radeonsi/si_shader_internal.h
index 3556e69..3f127cf 100644
--- a/src/gallium/drivers/radeonsi/si_shader_internal.h
+++ b/src/gallium/drivers/radeonsi/si_shader_internal.h
@@ -18,20 +18,21 @@
  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
 #ifndef SI_SHADER_PRIVATE_H
 #define SI_SHADER_PRIVATE_H
 
 #include "si_shader.h"
+#include "gallivm/lp_bld_flow.h"
 #include "gallivm/lp_bld_init.h"
 #include "gallivm/lp_bld_tgsi.h"
 #include "tgsi/tgsi_parse.h"
 #include "ac_llvm_util.h"
 #include "ac_llvm_build.h"
 
 #include <llvm-c/Core.h>
 #include <llvm-c/TargetMachine.h>
 
 struct pipe_debug_callback;
@@ -98,20 +99,22 @@ struct si_shader_context {
        unsigned temps_count;
        LLVMValueRef system_values[RADEON_LLVM_MAX_SYSTEM_VALUES];
 
        LLVMValueRef *imms;
        unsigned imms_num;
 
        struct si_llvm_flow *flow;
        unsigned flow_depth;
        unsigned flow_depth_max;
 
+       struct lp_build_if_state merged_wrap_if_state;
+
        struct tgsi_array_info *temp_arrays;
        LLVMValueRef *temp_array_allocas;
 
        LLVMValueRef undef_alloca;
 
        LLVMValueRef main_fn;
        LLVMTypeRef return_type;
 
        /* Parameter indices for LLVMGetParam. */
        int param_rw_buffers;
-- 
2.9.3

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to