[Mesa-dev] [PATCH 07/14] radv: drop tess offchip layout for tcs.
From: Dave Airlie
This removes the last TCS specific user sgpr. Signed-off-by: Dave Airlie --- src/amd/vulkan/radv_nir_to_llvm.c | 116 +- src/amd/vulkan/radv_pipeline.c| 9 --- src/amd/vulkan/radv_shader.c | 2 +- src/amd/vulkan/radv_shader.h | 2 +- 4 files changed, 90 insertions(+), 39 deletions(-) diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index 8414247b54a..4bdc0e6e9ec 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -106,6 +106,7 @@ struct radv_shader_context { uint64_t tcs_outputs_read; uint32_t tcs_vertices_per_patch; uint32_t tcs_num_inputs; + uint32_t tcs_num_patches; }; enum radeon_llvm_calling_convention { @@ -136,6 +137,46 @@ static LLVMValueRef get_rel_patch_id(struct radv_shader_context *ctx) } } +static unsigned +get_tcs_num_patches(struct radv_shader_context *ctx) +{ + unsigned num_tcs_input_cp = ctx->options->key.tcs.input_vertices; + unsigned num_tcs_output_cp = ctx->tcs_vertices_per_patch; + uint32_t input_vertex_size = ctx->tcs_num_inputs * 16; + uint32_t input_patch_size = ctx->options->key.tcs.input_vertices * input_vertex_size; + uint32_t num_tcs_outputs = util_last_bit64(ctx->shader_info->info.tcs.outputs_written); + uint32_t num_tcs_patch_outputs = util_last_bit64(ctx->shader_info->info.tcs.patch_outputs_written); + uint32_t output_vertex_size = num_tcs_outputs * 16; + uint32_t pervertex_output_patch_size = ctx->tcs_vertices_per_patch * output_vertex_size; + uint32_t output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16; + unsigned num_patches; + unsigned hardware_lds_size; + + /* Ensure that we only need one wave per SIMD so we don't need to check +* resource usage. Also ensures that the number of tcs in and out +* vertices per threadgroup are at most 256. +*/ + num_patches = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp) * 4; + /* Make sure that the data fits in LDS. This assumes the shaders only +* use LDS for the inputs and outputs. 
+*/ + hardware_lds_size = ctx->options->chip_class >= CIK ? 65536 : 32768; + num_patches = MIN2(num_patches, hardware_lds_size / (input_patch_size + output_patch_size)); + /* Make sure the output data fits in the offchip buffer */ + num_patches = MIN2(num_patches, (ctx->options->tess_offchip_block_dw_size * 4) / output_patch_size); + /* Not necessary for correctness, but improves performance. The +* specific value is taken from the proprietary driver. +*/ + num_patches = MIN2(num_patches, 40); + + /* SI bug workaround - limit LS-HS threadgroups to only one wave. */ + if (ctx->options->chip_class == SI) { + unsigned one_wave = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp); + num_patches = MIN2(num_patches, one_wave); + } + return num_patches; +} + /* Tessellation shaders pass outputs to the next shader using LDS. * * LS outputs = TCS inputs @@ -195,17 +236,17 @@ get_tcs_out_patch0_offset(struct radv_shader_context *ctx) uint32_t input_vertex_size = ctx->tcs_num_inputs * 16; uint32_t input_patch_size = ctx->options->key.tcs.input_vertices * input_vertex_size; uint32_t output_patch0_offset = input_patch_size; - LLVMValueRef num_patches = ac_unpack_param(>ac, ctx->tcs_offchip_layout, 0, 9); + unsigned num_patches = ctx->tcs_num_patches; + output_patch0_offset *= num_patches; output_patch0_offset /= 4; - return LLVMBuildMul(ctx->ac.builder, - num_patches, - LLVMConstInt(ctx->ac.i32, output_patch0_offset, false), ""); + return LLVMConstInt(ctx->ac.i32, output_patch0_offset, false); } static LLVMValueRef get_tcs_out_patch0_patch_data_offset(struct radv_shader_context *ctx) { + assert (ctx->stage == MESA_SHADER_TESS_CTRL); uint32_t input_vertex_size = ctx->tcs_num_inputs * 16; uint32_t input_patch_size = ctx->options->key.tcs.input_vertices * input_vertex_size; uint32_t output_patch0_offset = input_patch_size; @@ -213,15 +254,12 @@ get_tcs_out_patch0_patch_data_offset(struct radv_shader_context *ctx) uint32_t num_tcs_outputs = 
util_last_bit64(ctx->shader_info->info.tcs.outputs_written); uint32_t output_vertex_size = num_tcs_outputs * 16; uint32_t pervertex_output_patch_size = ctx->tcs_vertices_per_patch * output_vertex_size; - LLVMValueRef num_patches = ac_unpack_param(&ctx->ac, ctx->tcs_offchip_layout, 0, 9); + unsigned num_patches = ctx->tcs_num_patches; + output_patch0_offset *= num_patches; + output_patch0_offset += pervertex_output_patch_size;
[Mesa-dev] [PATCH 07/14] radv: drop tess offchip layout for tcs.
From: Dave Airlie
This removes the last TCS specific user sgpr. Signed-off-by: Dave Airlie --- src/amd/common/ac_nir_to_llvm.c | 118 ++-- src/amd/common/ac_nir_to_llvm.h | 2 +- src/amd/vulkan/radv_pipeline.c | 9 --- src/amd/vulkan/radv_shader.c| 2 +- 4 files changed, 92 insertions(+), 39 deletions(-) diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c index 396b98698e6..90b27603266 100644 --- a/src/amd/common/ac_nir_to_llvm.c +++ b/src/amd/common/ac_nir_to_llvm.c @@ -125,6 +125,7 @@ struct radv_shader_context { uint64_t tcs_outputs_read; uint32_t tcs_vertices_per_patch; uint32_t tcs_num_inputs; + uint32_t tcs_num_patches; }; static inline struct radv_shader_context * @@ -319,6 +320,46 @@ static LLVMValueRef get_rel_patch_id(struct radv_shader_context *ctx) } } +static unsigned +get_tcs_num_patches(struct radv_shader_context *ctx) +{ + unsigned num_tcs_input_cp = ctx->options->key.tcs.input_vertices; + unsigned num_tcs_output_cp = ctx->tcs_vertices_per_patch; + uint32_t input_vertex_size = ctx->tcs_num_inputs * 16; + uint32_t input_patch_size = ctx->options->key.tcs.input_vertices * input_vertex_size; + uint32_t num_tcs_outputs = util_last_bit64(ctx->shader_info->info.tcs.outputs_written); + uint32_t num_tcs_patch_outputs = util_last_bit64(ctx->shader_info->info.tcs.patch_outputs_written); + uint32_t output_vertex_size = num_tcs_outputs * 16; + uint32_t pervertex_output_patch_size = ctx->tcs_vertices_per_patch * output_vertex_size; + uint32_t output_patch_size = pervertex_output_patch_size + num_tcs_patch_outputs * 16; + unsigned num_patches; + unsigned hardware_lds_size; + + /* Ensure that we only need one wave per SIMD so we don't need to check +* resource usage. Also ensures that the number of tcs in and out +* vertices per threadgroup are at most 256. +*/ + num_patches = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp) * 4; + /* Make sure that the data fits in LDS. This assumes the shaders only +* use LDS for the inputs and outputs. 
+*/ + hardware_lds_size = ctx->options->chip_class >= CIK ? 65536 : 32768; + num_patches = MIN2(num_patches, hardware_lds_size / (input_patch_size + output_patch_size)); + /* Make sure the output data fits in the offchip buffer */ + num_patches = MIN2(num_patches, (ctx->options->tess_offchip_block_dw_size * 4) / output_patch_size); + /* Not necessary for correctness, but improves performance. The +* specific value is taken from the proprietary driver. +*/ + num_patches = MIN2(num_patches, 40); + + /* SI bug workaround - limit LS-HS threadgroups to only one wave. */ + if (ctx->options->chip_class == SI) { + unsigned one_wave = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp); + num_patches = MIN2(num_patches, one_wave); + } + return num_patches; +} + /* Tessellation shaders pass outputs to the next shader using LDS. * * LS outputs = TCS inputs @@ -378,17 +419,17 @@ get_tcs_out_patch0_offset(struct radv_shader_context *ctx) uint32_t input_vertex_size = ctx->tcs_num_inputs * 16; uint32_t input_patch_size = ctx->options->key.tcs.input_vertices * input_vertex_size; uint32_t output_patch0_offset = input_patch_size; - LLVMValueRef num_patches = unpack_param(>ac, ctx->tcs_offchip_layout, 0, 9); + unsigned num_patches = ctx->tcs_num_patches; + output_patch0_offset *= num_patches; output_patch0_offset /= 4; - return LLVMBuildMul(ctx->ac.builder, - num_patches, - LLVMConstInt(ctx->ac.i32, output_patch0_offset, false), ""); + return LLVMConstInt(ctx->ac.i32, output_patch0_offset, false); } static LLVMValueRef get_tcs_out_patch0_patch_data_offset(struct radv_shader_context *ctx) { + assert (ctx->stage == MESA_SHADER_TESS_CTRL); uint32_t input_vertex_size = ctx->tcs_num_inputs * 16; uint32_t input_patch_size = ctx->options->key.tcs.input_vertices * input_vertex_size; uint32_t output_patch0_offset = input_patch_size; @@ -396,15 +437,13 @@ get_tcs_out_patch0_patch_data_offset(struct radv_shader_context *ctx) uint32_t num_tcs_outputs = 
util_last_bit64(ctx->shader_info->info.tcs.outputs_written); uint32_t output_vertex_size = num_tcs_outputs * 16; uint32_t pervertex_output_patch_size = ctx->tcs_vertices_per_patch * output_vertex_size; - LLVMValueRef num_patches = unpack_param(&ctx->ac, ctx->tcs_offchip_layout, 0, 9); + unsigned num_patches = ctx->tcs_num_patches; + output_patch0_offset *= num_patches; + output_patch0_offset += pervertex_output_patch_size; output_patch0_offset /= 4; -