gfx9: always wrap GS and TCS in an if-block (v2)

Nicolai Hähnle Sat, 29 Jul 2017 00:23:20 -0700

Thanks. I think I know what the issue is, testing a fix now.


Cheers,
Nicolai


On 28.07.2017 20:16, Marek Olšák wrote:

Hi Nicolai,

FYI, this patch breaks 30 piglit tests on GFX9. Non-monolithic andmonolithic variants fail in different ways:


1) Non-monolithic:

R600_DEBUG=nooptvariant ../piglit/bin/shader_runner/home/marek/dev/piglit/generated_tests/spec/arb_tessellation_shader/execution/built-in-functions/tcs-op-selection-bool-mat2-mat2.shader_test-auto

ATTENTION: default value of option vblank_mode overridden by environment.

LLVM triggered Diagnostic Handler: <unknown>:0:0: in function main <{i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float,float, float, float }> ([12 x <4 x i32>] addrspace(2)*, i32, i32, i32,i32, i32, i32, i32, i32, [32 x <4 x i32>] addrspace(2)*, [80 x <8 xi32>] addrspace(2)*, [16 x <4 x i32>] addrspace(2)*, i32, i32, i32, i32,i32, i32, i32, i32, i32, i32, [32 x <4 x i32>] addrspace(2)*, [80 x <8 xi32>] addrspace(2)*, i32, i32): unsupported dynamic alloca


LLVM failed to compile shader
radeonsi: can't compile a main shader part

2) Monolithic:

R600_DEBUG=mono ../piglit/bin/shader_runner/home/marek/dev/piglit/generated_tests/spec/arb_tessellation_shader/execution/built-in-functions/tcs-op-selection-bool-mat2-mat2.shader_test-auto

ATTENTION: default value of option vblank_mode overridden by environment.
LLVM ERROR: Cannot select: t5: i32,ch = stacksave t4
In function: wrapper
Segmentation fault


Marek

On Wed, Jul 26, 2017 at 1:56 PM, Nicolai Hähnle <nhaeh...@gmail.com<mailto:nhaeh...@gmail.com>> wrote:


    From: Nicolai Hähnle <nicolai.haeh...@amd.com
    <mailto:nicolai.haeh...@amd.com>>

    With merged ESGS shaders, the GS part of a wave may be empty, and the
    hardware gets confused if any GS messages are sent from that wave. Since
    S_SENDMSG is executed even when EXEC = 0, we have to wrap even
    non-monolithic GS shaders in an if-block, so that the entire shader and
    hence the S_SENDMSG instructions are skipped in empty waves.

    This change is not required for TCS/HS, but applying it there as well
    simplifies the logic a bit.

    Fixes GL45-CTS.geometry_shader.rendering.rendering.*

    v2: ensure that the TCS epilog doesn't run for non-existing patches

    Cc: mesa-sta...@lists.freedesktop.org
    <mailto:mesa-sta...@lists.freedesktop.org>
    ---
      src/gallium/drivers/radeonsi/si_shader.c          | 109
    +++++++++++++++-------
      src/gallium/drivers/radeonsi/si_shader_internal.h |   3 +
      2 files changed, 79 insertions(+), 33 deletions(-)

    diff --git a/src/gallium/drivers/radeonsi/si_shader.c
    b/src/gallium/drivers/radeonsi/si_shader.c
    index a153cb7..9376d90 100644
    --- a/src/gallium/drivers/radeonsi/si_shader.c
    +++ b/src/gallium/drivers/radeonsi/si_shader.c
    @@ -175,6 +175,20 @@ unsigned si_shader_io_get_unique_index(unsigned
    semantic_name, unsigned index)
      }

      /**
    + * Helper function that builds an LLVM IR PHI node and immediately adds
    + * incoming edges.
    + */
    +static LLVMValueRef
    +build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type,
    +         unsigned count_incoming, LLVMValueRef *values,
    +         LLVMBasicBlockRef *blocks)
    +{
    +       LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, "");
    +       LLVMAddIncoming(phi, values, blocks, count_incoming);
    +       return phi;
    +}
    +
    +/**
       * Get the value of a shader input parameter and extract a bitfield.
       */
      static LLVMValueRef unpack_param(struct si_shader_context *ctx,
    @@ -2698,6 +2712,7 @@ si_insert_input_ptr_as_2xi32(struct
    si_shader_context *ctx, LLVMValueRef ret,
      static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context
    *bld_base)
      {
             struct si_shader_context *ctx = si_shader_context(bld_base);
    +       LLVMBuilderRef builder = ctx->gallivm.builder;
             LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset;

             si_copy_tcs_inputs(bld_base);
    @@ -2706,8 +2721,29 @@ static void si_llvm_emit_tcs_epilogue(struct
    lp_build_tgsi_context *bld_base)
             invocation_id = unpack_param(ctx, ctx->param_tcs_rel_ids,
    8, 5);
             tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx);

    +       if (ctx->screen->b.chip_class >= GFX9) {
    +               LLVMBasicBlockRef blocks[2] = {
    +                       LLVMGetInsertBlock(builder),
    +                       ctx->merged_wrap_if_state.entry_block
    +               };
    +               LLVMValueRef values[2];
    +
    +               lp_build_endif(&ctx->merged_wrap_if_state);
    +
    +               values[0] = rel_patch_id;
    +               values[1] = LLVMGetUndef(ctx->i32);
    +               rel_patch_id = build_phi(&ctx->ac, ctx->i32, 2,
    values, blocks);
    +
    +               values[0] = tf_lds_offset;
    +               values[1] = LLVMGetUndef(ctx->i32);
    +               tf_lds_offset = build_phi(&ctx->ac, ctx->i32, 2,
    values, blocks);
    +
    +               values[0] = invocation_id;
    +               values[1] = ctx->i32_1; /* cause the epilog to skip
    threads */
    +               invocation_id = build_phi(&ctx->ac, ctx->i32, 2,
    values, blocks);
    +       }
    +
             /* Return epilog parameters from this function. */
    -       LLVMBuilderRef builder = ctx->gallivm.builder;
             LLVMValueRef ret = ctx->return_value;
             unsigned vgpr;

    @@ -2935,6 +2971,9 @@ static void si_llvm_emit_gs_epilogue(struct
    lp_build_tgsi_context *bld_base)

             ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP |
    AC_SENDMSG_GS_DONE,
                              si_get_gs_wave_id(ctx));
    +
    +       if (ctx->screen->b.chip_class >= GFX9)
    +               lp_build_endif(&ctx->merged_wrap_if_state);
      }

      static void si_llvm_emit_vs_epilogue(struct lp_build_tgsi_context
    *bld_base)
    @@ -5502,14 +5541,20 @@ static bool si_compile_tgsi_main(struct
    si_shader_context *ctx,
             preload_ring_buffers(ctx);

             /* For GFX9 merged shaders:
    -        * - Set EXEC. If the prolog is present, set EXEC there instead.
    +        * - Set EXEC for the first shader. If the prolog is
    present, set
    +        *   EXEC there instead.
              * - Add a barrier before the second shader.
    +        * - In the second shader, reset EXEC to ~0 and wrap the
    main part in
    +        *   an if-statement. This is required for correctness in
    geometry
    +        *   shaders, to ensure that empty GS waves do not send
    GS_EMIT and
    +        *   GS_CUT messages.
              *
    -        * The same thing for monolithic shaders is done in
    -        * si_build_wrapper_function.
    +        * For monolithic merged shaders, the first shader is
    wrapped in an
    +        * if-block together with its prolog in
    si_build_wrapper_function.
              */
    -       if (ctx->screen->b.chip_class >= GFX9 && !is_monolithic) {
    -               if (sel->info.num_instructions > 1 && /* not empty
    shader */
    +       if (ctx->screen->b.chip_class >= GFX9) {
    +               if (!is_monolithic &&
    +                   sel->info.num_instructions > 1 && /* not empty
    shader */
                         (shader->key.as_es || shader->key.as_ls) &&
                         (ctx->type == PIPE_SHADER_TESS_EVAL ||
                          (ctx->type == PIPE_SHADER_VERTEX &&
    @@ -5518,9 +5563,19 @@ static bool si_compile_tgsi_main(struct
    si_shader_context *ctx,

ctx->param_merged_wave_info, 0);

                     } else if (ctx->type == PIPE_SHADER_TESS_CTRL ||
                                ctx->type == PIPE_SHADER_GEOMETRY) {
    -                       si_init_exec_from_input(ctx,

-ctx->param_merged_wave_info, 8);

    +                       if (!is_monolithic)
    +                               si_init_exec_full_mask(ctx);
    +
    +                       /* The barrier must execute for all shaders in a
    +                        * threadgroup.
    +                        */
                             si_llvm_emit_barrier(NULL, bld_base, NULL);
    +
    +                       LLVMValueRef num_threads = unpack_param(ctx,
    ctx->param_merged_wave_info, 8, 8);
    +                       LLVMValueRef ena =
    +                               LLVMBuildICmp(ctx->ac.builder,
    LLVMIntULT,

+ac_get_thread_id(&ctx->ac), num_threads, "");

    +                       lp_build_if(&ctx->merged_wrap_if_state,
    &ctx->gallivm, ena);
                     }
             }

    @@ -5991,15 +6046,9 @@ static void si_build_wrapper_function(struct
    si_shader_context *ctx,

                     /* Merged shaders are executed conditionally depending
                      * on the number of enabled threads passed in the
    input SGPRs. */
    -               if (is_merged_shader(ctx->shader) &&
    -                   (part == 0 || part == next_shader_first_part)) {
    +               if (is_merged_shader(ctx->shader) && part == 0) {
                             LLVMValueRef ena, count = initial[3];

    -                       /* The thread count for the 2nd shader is at
    bit-offset 8. */
    -                       if (part == next_shader_first_part) {
    -                               count = LLVMBuildLShr(builder, count,

-LLVMConstInt(ctx->i32, 8, 0), "");

    -                       }
                             count = LLVMBuildAnd(builder, count,
                                                  LLVMConstInt(ctx->i32,
    0x7f, 0), "");
                             ena = LLVMBuildICmp(builder, LLVMIntULT,
    @@ -6056,26 +6105,20 @@ static void si_build_wrapper_function(struct
    si_shader_context *ctx,
                     ret = LLVMBuildCall(builder, parts[part], in,
    num_params, "");

                     if (is_merged_shader(ctx->shader) &&
    -                   (part + 1 == next_shader_first_part ||
    -                    part + 1 == num_parts)) {
    +                   part + 1 == next_shader_first_part) {
                             lp_build_endif(&if_state);

    -                       if (part + 1 == next_shader_first_part) {
    -                               /* A barrier is required between 2
    merged shaders. */
    -                               si_llvm_emit_barrier(NULL,
    &ctx->bld_base, NULL);
    -
    -                               /* The second half of the merged
    shader should use
    -                                * the inputs from the toplevel
    (wrapper) function,
    -                                * not the return value from the
    last call.
    -                                *
    -                                * That's because the last call was
    executed condi-
    -                                * tionally, so we can't consume it
    in the main
    -                                * block.
    -                                */
    -                               memcpy(out, initial, sizeof(initial));
    -                               num_out = initial_num_out;
    -                               num_out_sgpr = initial_num_out_sgpr;
    -                       }
    +                       /* The second half of the merged shader
    should use
    +                        * the inputs from the toplevel (wrapper)
    function,
    +                        * not the return value from the last call.
    +                        *
    +                        * That's because the last call was executed
    condi-
    +                        * tionally, so we can't consume it in the main
    +                        * block.
    +                        */
    +                       memcpy(out, initial, sizeof(initial));
    +                       num_out = initial_num_out;
    +                       num_out_sgpr = initial_num_out_sgpr;
                             continue;
                     }

    diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h
    b/src/gallium/drivers/radeonsi/si_shader_internal.h
    index 6e86e0b..6b98bca 100644
    --- a/src/gallium/drivers/radeonsi/si_shader_internal.h
    +++ b/src/gallium/drivers/radeonsi/si_shader_internal.h
    @@ -25,6 +25,7 @@
      #define SI_SHADER_PRIVATE_H

      #include "si_shader.h"
    +#include "gallivm/lp_bld_flow.h"
      #include "gallivm/lp_bld_init.h"
      #include "gallivm/lp_bld_tgsi.h"
      #include "tgsi/tgsi_parse.h"
    @@ -105,6 +106,8 @@ struct si_shader_context {
             unsigned flow_depth;
             unsigned flow_depth_max;

    +       struct lp_build_if_state merged_wrap_if_state;
    +
             struct tgsi_array_info *temp_arrays;
             LLVMValueRef *temp_array_allocas;

    --
    2.9.3

    _______________________________________________
    mesa-stable mailing list
    mesa-sta...@lists.freedesktop.org
    <mailto:mesa-sta...@lists.freedesktop.org>
    https://lists.freedesktop.org/mailman/listinfo/mesa-stable
    <https://lists.freedesktop.org/mailman/listinfo/mesa-stable>



--
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.
_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [Mesa-stable] [PATCH v2 2/5] radeonsi/gfx9: always wrap GS and TCS in an if-block (v2)

Reply via email to