[Mesa-dev] [PATCH 07/14] radv: drop tess offchip layout for tcs.

Dave Airlie Tue, 20 Feb 2018 17:36:26 -0800

From: Dave Airlie <airl...@redhat.com>

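This removes the last TCS-specific user SGPR.

The number of patches is now computed at shader compile time
(get_tcs_num_patches) and the TCS offsets and strides derived from it
are emitted as constants; other stages still unpack them from the
offchip layout SGPR.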


Signed-off-by: Dave Airlie <airl...@redhat.com>
---
 src/amd/common/ac_nir_to_llvm.c | 118 ++++++++++++++++++++++++++++++----------
 src/amd/common/ac_nir_to_llvm.h |   2 +-
 src/amd/vulkan/radv_pipeline.c  |   9 ---
 src/amd/vulkan/radv_shader.c    |   2 +-
 4 files changed, 92 insertions(+), 39 deletions(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 396b98698e6..90b27603266 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -125,6 +125,7 @@ struct radv_shader_context {
        uint64_t tcs_outputs_read;
        uint32_t tcs_vertices_per_patch;
        uint32_t tcs_num_inputs;
+       uint32_t tcs_num_patches;
 };
 
 static inline struct radv_shader_context *
@@ -319,6 +320,46 @@ static LLVMValueRef get_rel_patch_id(struct 
radv_shader_context *ctx)
        }
 }
 
+static unsigned
+get_tcs_num_patches(struct radv_shader_context *ctx)
+{
+       unsigned num_tcs_input_cp = ctx->options->key.tcs.input_vertices;
+       unsigned num_tcs_output_cp = ctx->tcs_vertices_per_patch;
+       uint32_t input_vertex_size = ctx->tcs_num_inputs * 16;
+       uint32_t input_patch_size = ctx->options->key.tcs.input_vertices * 
input_vertex_size;
+       uint32_t num_tcs_outputs = 
util_last_bit64(ctx->shader_info->info.tcs.outputs_written);
+       uint32_t num_tcs_patch_outputs = 
util_last_bit64(ctx->shader_info->info.tcs.patch_outputs_written);
+       uint32_t output_vertex_size = num_tcs_outputs * 16;
+       uint32_t pervertex_output_patch_size = ctx->tcs_vertices_per_patch * 
output_vertex_size;
+       uint32_t output_patch_size = pervertex_output_patch_size + 
num_tcs_patch_outputs * 16;
+       unsigned num_patches;
+       unsigned hardware_lds_size;
+
+       /* Ensure that we only need one wave per SIMD so we don't need to check
+        * resource usage. Also ensures that the number of tcs in and out
+        * vertices per threadgroup are at most 256.
+        */
+       num_patches = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp) * 4;
+       /* Make sure that the data fits in LDS. This assumes the shaders only
+        * use LDS for the inputs and outputs.
+        */
+       hardware_lds_size = ctx->options->chip_class >= CIK ? 65536 : 32768;
+       num_patches = MIN2(num_patches, hardware_lds_size / (input_patch_size + 
output_patch_size));
+       /* Make sure the output data fits in the offchip buffer */
+       num_patches = MIN2(num_patches, 
(ctx->options->tess_offchip_block_dw_size * 4) / output_patch_size);
+       /* Not necessary for correctness, but improves performance. The
+        * specific value is taken from the proprietary driver.
+        */
+       num_patches = MIN2(num_patches, 40);
+
+       /* SI bug workaround - limit LS-HS threadgroups to only one wave. */
+       if (ctx->options->chip_class == SI) {
+               unsigned one_wave = 64 / MAX2(num_tcs_input_cp, 
num_tcs_output_cp);
+               num_patches = MIN2(num_patches, one_wave);
+       }
+       return num_patches;
+}
+
 /* Tessellation shaders pass outputs to the next shader using LDS.
  *
  * LS outputs = TCS inputs
@@ -378,17 +419,17 @@ get_tcs_out_patch0_offset(struct radv_shader_context *ctx)
        uint32_t input_vertex_size = ctx->tcs_num_inputs * 16;
        uint32_t input_patch_size = ctx->options->key.tcs.input_vertices * 
input_vertex_size;
        uint32_t output_patch0_offset = input_patch_size;
-       LLVMValueRef num_patches = unpack_param(&ctx->ac, 
ctx->tcs_offchip_layout, 0, 9);
+       unsigned num_patches = ctx->tcs_num_patches;
 
+       output_patch0_offset *= num_patches;
        output_patch0_offset /= 4;
-       return LLVMBuildMul(ctx->ac.builder,
-                           num_patches,
-                           LLVMConstInt(ctx->ac.i32, output_patch0_offset, 
false), "");
+       return LLVMConstInt(ctx->ac.i32, output_patch0_offset, false);
 }
 
 static LLVMValueRef
 get_tcs_out_patch0_patch_data_offset(struct radv_shader_context *ctx)
 {
+       assert (ctx->stage == MESA_SHADER_TESS_CTRL);
        uint32_t input_vertex_size = ctx->tcs_num_inputs * 16;
        uint32_t input_patch_size = ctx->options->key.tcs.input_vertices * 
input_vertex_size;
        uint32_t output_patch0_offset = input_patch_size;
@@ -396,15 +437,13 @@ get_tcs_out_patch0_patch_data_offset(struct 
radv_shader_context *ctx)
        uint32_t num_tcs_outputs = 
util_last_bit64(ctx->shader_info->info.tcs.outputs_written);
        uint32_t output_vertex_size = num_tcs_outputs * 16;
        uint32_t pervertex_output_patch_size = ctx->tcs_vertices_per_patch * 
output_vertex_size;
-       LLVMValueRef num_patches = unpack_param(&ctx->ac, 
ctx->tcs_offchip_layout, 0, 9);
+       unsigned num_patches = ctx->tcs_num_patches;
 
+       output_patch0_offset *= num_patches;
+       output_patch0_offset += pervertex_output_patch_size;
        output_patch0_offset /= 4;
-       LLVMValueRef value = LLVMBuildMul(ctx->ac.builder,
-                           num_patches,
-                           LLVMConstInt(ctx->ac.i32, output_patch0_offset, 
false), "");
-       return LLVMBuildAdd(ctx->ac.builder,
-                           value,
-                           LLVMConstInt(ctx->ac.i32, 
pervertex_output_patch_size / 4, false), "");
+
+       return LLVMConstInt(ctx->ac.i32, output_patch0_offset, false);
 }
 
 static LLVMValueRef
@@ -560,7 +599,6 @@ static void allocate_user_sgprs(struct radv_shader_context 
*ctx,
                        if (previous_stage == MESA_SHADER_VERTEX)
                                user_sgpr_info->sgpr_count += 
count_vs_user_sgprs(ctx);
                }
-               user_sgpr_info->sgpr_count += 1;
                break;
        case MESA_SHADER_TESS_EVAL:
                user_sgpr_info->sgpr_count += 1;
@@ -827,8 +865,6 @@ static void create_function(struct radv_shader_context *ctx,
                                                        has_previous_stage,
                                                        previous_stage, &args);
 
-                       add_arg(&args, ARG_SGPR, ctx->ac.i32,
-                               &ctx->tcs_offchip_layout);
                        if (needs_view_index)
                                add_arg(&args, ARG_SGPR, ctx->ac.i32,
                                        &ctx->abi.view_index);
@@ -846,8 +882,6 @@ static void create_function(struct radv_shader_context *ctx,
                                                   &user_sgpr_info, &args,
                                                   &desc_sets);
 
-                       add_arg(&args, ARG_SGPR, ctx->ac.i32,
-                               &ctx->tcs_offchip_layout);
                        if (needs_view_index)
                                add_arg(&args, ARG_SGPR, ctx->ac.i32,
                                        &ctx->abi.view_index);
@@ -1056,7 +1090,6 @@ static void create_function(struct radv_shader_context 
*ctx,
        case MESA_SHADER_TESS_CTRL:
                set_vs_specific_input_locs(ctx, stage, has_previous_stage,
                                           previous_stage, &user_sgpr_idx);
-               set_loc_shader(ctx, AC_UD_TCS_OFFCHIP_LAYOUT, &user_sgpr_idx, 
1);
                if (ctx->abi.view_index)
                        set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 
1);
                break;
@@ -2748,30 +2781,59 @@ out:
  *
  * Note that every attribute has 4 components.
  */
+static LLVMValueRef get_non_vertex_index_offset(struct radv_shader_context 
*ctx)
+{
+       if (ctx->stage == MESA_SHADER_TESS_CTRL) {
+               uint32_t num_tcs_outputs = 
util_last_bit64(ctx->shader_info->info.tcs.outputs_written);
+               uint32_t output_vertex_size = num_tcs_outputs * 16;
+               uint32_t pervertex_output_patch_size = 
ctx->tcs_vertices_per_patch * output_vertex_size;
+               uint32_t num_patches = ctx->tcs_num_patches;
+
+               return LLVMConstInt(ctx->ac.i32, pervertex_output_patch_size * 
num_patches, false);
+       } else
+               return unpack_param(&ctx->ac, ctx->tcs_offchip_layout, 16, 16);
+}
+
+static LLVMValueRef calc_param_stride(struct radv_shader_context *ctx,
+                                     LLVMValueRef vertex_index)
+{
+       LLVMValueRef param_stride;
+       if (ctx->stage == MESA_SHADER_TESS_CTRL) {
+               if (vertex_index)
+                       param_stride = LLVMConstInt(ctx->ac.i32, 
ctx->tcs_vertices_per_patch * ctx->tcs_num_patches, false);
+               else
+                       param_stride = LLVMConstInt(ctx->ac.i32, 
ctx->tcs_num_patches, false);
+       } else {
+               LLVMValueRef num_patches = unpack_param(&ctx->ac, 
ctx->tcs_offchip_layout, 0, 9);
+               LLVMValueRef vertices_per_patch = LLVMConstInt(ctx->ac.i32, 
ctx->tcs_vertices_per_patch, false);
+               if (vertex_index)
+                       param_stride = LLVMBuildMul(ctx->ac.builder, 
vertices_per_patch,
+                                                   num_patches, "");
+               else
+                       param_stride = num_patches;
+       }
+       return param_stride;
+
+}
+
 static LLVMValueRef get_tcs_tes_buffer_address(struct radv_shader_context *ctx,
                                                LLVMValueRef vertex_index,
                                                LLVMValueRef param_index)
 {
-       LLVMValueRef base_addr, vertices_per_patch, num_patches;
+       LLVMValueRef base_addr;
        LLVMValueRef param_stride, constant16;
        LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
-
-       vertices_per_patch = LLVMConstInt(ctx->ac.i32, 
ctx->tcs_vertices_per_patch, false);
-       num_patches = unpack_param(&ctx->ac, ctx->tcs_offchip_layout, 0, 9);
-
+       LLVMValueRef vertices_per_patch = LLVMConstInt(ctx->ac.i32, 
ctx->tcs_vertices_per_patch, false);
        constant16 = LLVMConstInt(ctx->ac.i32, 16, false);
+       param_stride = calc_param_stride(ctx, vertex_index);
        if (vertex_index) {
                base_addr = LLVMBuildMul(ctx->ac.builder, rel_patch_id,
                                         vertices_per_patch, "");
 
                base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr,
                                         vertex_index, "");
-
-               param_stride = LLVMBuildMul(ctx->ac.builder, vertices_per_patch,
-                                           num_patches, "");
        } else {
                base_addr = rel_patch_id;
-               param_stride = num_patches;
        }
 
        base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr,
@@ -2781,8 +2843,7 @@ static LLVMValueRef get_tcs_tes_buffer_address(struct 
radv_shader_context *ctx,
        base_addr = LLVMBuildMul(ctx->ac.builder, base_addr, constant16, "");
 
        if (!vertex_index) {
-               LLVMValueRef patch_data_offset =
-                          unpack_param(&ctx->ac, ctx->tcs_offchip_layout, 16, 
16);
+               LLVMValueRef patch_data_offset = 
get_non_vertex_index_offset(ctx);
 
                base_addr = LLVMBuildAdd(ctx->ac.builder, base_addr,
                                         patch_data_offset, "");
@@ -6888,6 +6949,7 @@ LLVMModuleRef 
ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
                                ctx.tcs_num_inputs = 
ctx.options->key.tcs.num_inputs;
                        else
                                ctx.tcs_num_inputs = 
util_last_bit64(shader_info->info.vs.ls_outputs_written);
+                       ctx.tcs_num_patches = get_tcs_num_patches(&ctx);
                } else if (shaders[i]->info.stage == MESA_SHADER_TESS_EVAL) {
                        ctx.tes_primitive_mode = 
shaders[i]->info.tess.primitive_mode;
                        ctx.abi.load_tess_varyings = load_tes_input;
diff --git a/src/amd/common/ac_nir_to_llvm.h b/src/amd/common/ac_nir_to_llvm.h
index 5f8dcd3b9b4..d81123144df 100644
--- a/src/amd/common/ac_nir_to_llvm.h
+++ b/src/amd/common/ac_nir_to_llvm.h
@@ -87,6 +87,7 @@ struct ac_nir_compiler_options {
        bool dump_preoptir;
        enum radeon_family family;
        enum chip_class chip_class;
+       uint32_t tess_offchip_block_dw_size;
 };
 
 struct ac_userdata_info {
@@ -111,7 +112,6 @@ enum ac_ud_index {
        AC_UD_CS_MAX_UD,
        AC_UD_GS_VS_RING_STRIDE_ENTRIES = AC_UD_VS_MAX_UD,
        AC_UD_GS_MAX_UD,
-       AC_UD_TCS_OFFCHIP_LAYOUT = AC_UD_VS_MAX_UD,
        AC_UD_TCS_MAX_UD,
        AC_UD_TES_OFFCHIP_LAYOUT = AC_UD_SHADER_START,
        AC_UD_TES_MAX_UD,
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 3a8de4717a9..06b2db8455f 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -2605,15 +2605,6 @@ radv_pipeline_generate_tess_shaders(struct 
radeon_winsys_cs *cs,
 
        struct ac_userdata_info *loc;
 
-       loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_TESS_CTRL, 
AC_UD_TCS_OFFCHIP_LAYOUT);
-       if (loc->sgpr_idx != -1) {
-               uint32_t base_reg = 
pipeline->user_data_0[MESA_SHADER_TESS_CTRL];
-               assert(loc->num_sgprs == 1);
-               assert(!loc->indirect);
-               radeon_set_sh_reg_seq(cs, base_reg + loc->sgpr_idx * 4, 1);
-               radeon_emit(cs, tess->offchip_layout);
-       }
-
        loc = radv_lookup_user_sgpr(pipeline, MESA_SHADER_TESS_EVAL, 
AC_UD_TES_OFFCHIP_LAYOUT);
        if (loc->sgpr_idx != -1) {
                uint32_t base_reg = 
pipeline->user_data_0[MESA_SHADER_TESS_EVAL];
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index d9b8e209a99..de1075cb762 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -487,7 +487,7 @@ shader_variant_create(struct radv_device *device,
        options->chip_class = device->physical_device->rad_info.chip_class;
        options->dump_preoptir = radv_can_dump_shader(device, module) &&
                                 device->instance->debug_flags & 
RADV_DEBUG_PREOPTIR;
-
+       options->tess_offchip_block_dw_size = 
device->tess_offchip_block_dw_size;
        if (options->supports_spill)
                tm_options |= AC_TM_SUPPORTS_SPILL;
        if (device->instance->perftest_flags & RADV_PERFTEST_SISCHED)
-- 
2.14.3

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 07/14] radv: drop tess offchip layout for tcs.

Reply via email to