[Mesa-dev] [PATCH 07/14] radv: drop tess offchip layout for tcs.

2018-03-13 Thread Dave Airlie
From: Dave Airlie 

This removes the last TCS specific user sgpr.

Signed-off-by: Dave Airlie 
---
 src/amd/vulkan/radv_nir_to_llvm.c | 116 +-
 src/amd/vulkan/radv_pipeline.c|   9 ---
 src/amd/vulkan/radv_shader.c  |   2 +-
 src/amd/vulkan/radv_shader.h  |   2 +-
 4 files changed, 90 insertions(+), 39 deletions(-)

diff --git a/src/amd/vulkan/radv_nir_to_llvm.c 
b/src/amd/vulkan/radv_nir_to_llvm.c
index 8414247b54a..4bdc0e6e9ec 100644
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -106,6 +106,7 @@ struct radv_shader_context {
uint64_t tcs_outputs_read;
uint32_t tcs_vertices_per_patch;
uint32_t tcs_num_inputs;
+   uint32_t tcs_num_patches;
 };
 
 enum radeon_llvm_calling_convention {
@@ -136,6 +137,46 @@ static LLVMValueRef get_rel_patch_id(struct 
radv_shader_context *ctx)
}
 }
 
+static unsigned
+get_tcs_num_patches(struct radv_shader_context *ctx)
+{
+   unsigned num_tcs_input_cp = ctx->options->key.tcs.input_vertices;
+   unsigned num_tcs_output_cp = ctx->tcs_vertices_per_patch;
+   uint32_t input_vertex_size = ctx->tcs_num_inputs * 16;
+   uint32_t input_patch_size = ctx->options->key.tcs.input_vertices * 
input_vertex_size;
+   uint32_t num_tcs_outputs = 
util_last_bit64(ctx->shader_info->info.tcs.outputs_written);
+   uint32_t num_tcs_patch_outputs = 
util_last_bit64(ctx->shader_info->info.tcs.patch_outputs_written);
+   uint32_t output_vertex_size = num_tcs_outputs * 16;
+   uint32_t pervertex_output_patch_size = ctx->tcs_vertices_per_patch * 
output_vertex_size;
+   uint32_t output_patch_size = pervertex_output_patch_size + 
num_tcs_patch_outputs * 16;
+   unsigned num_patches;
+   unsigned hardware_lds_size;
+
+   /* Ensure that we only need one wave per SIMD so we don't need to check
+* resource usage. Also ensures that the number of tcs in and out
+* vertices per threadgroup are at most 256.
+*/
+   num_patches = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp) * 4;
+   /* Make sure that the data fits in LDS. This assumes the shaders only
+* use LDS for the inputs and outputs.
+*/
+   hardware_lds_size = ctx->options->chip_class >= CIK ? 65536 : 32768;
+   num_patches = MIN2(num_patches, hardware_lds_size / (input_patch_size + 
output_patch_size));
+   /* Make sure the output data fits in the offchip buffer */
+   num_patches = MIN2(num_patches, 
(ctx->options->tess_offchip_block_dw_size * 4) / output_patch_size);
+   /* Not necessary for correctness, but improves performance. The
+* specific value is taken from the proprietary driver.
+*/
+   num_patches = MIN2(num_patches, 40);
+
+   /* SI bug workaround - limit LS-HS threadgroups to only one wave. */
+   if (ctx->options->chip_class == SI) {
+   unsigned one_wave = 64 / MAX2(num_tcs_input_cp, 
num_tcs_output_cp);
+   num_patches = MIN2(num_patches, one_wave);
+   }
+   return num_patches;
+}
+
 /* Tessellation shaders pass outputs to the next shader using LDS.
  *
  * LS outputs = TCS inputs
@@ -195,17 +236,17 @@ get_tcs_out_patch0_offset(struct radv_shader_context *ctx)
uint32_t input_vertex_size = ctx->tcs_num_inputs * 16;
uint32_t input_patch_size = ctx->options->key.tcs.input_vertices * 
input_vertex_size;
uint32_t output_patch0_offset = input_patch_size;
-   LLVMValueRef num_patches = ac_unpack_param(&ctx->ac, 
ctx->tcs_offchip_layout, 0, 9);
+   unsigned num_patches = ctx->tcs_num_patches;
 
+   output_patch0_offset *= num_patches;
output_patch0_offset /= 4;
-   return LLVMBuildMul(ctx->ac.builder,
-   num_patches,
-   LLVMConstInt(ctx->ac.i32, output_patch0_offset, 
false), "");
+   return LLVMConstInt(ctx->ac.i32, output_patch0_offset, false);
 }
 
 static LLVMValueRef
 get_tcs_out_patch0_patch_data_offset(struct radv_shader_context *ctx)
 {
+   assert (ctx->stage == MESA_SHADER_TESS_CTRL);
uint32_t input_vertex_size = ctx->tcs_num_inputs * 16;
uint32_t input_patch_size = ctx->options->key.tcs.input_vertices * 
input_vertex_size;
uint32_t output_patch0_offset = input_patch_size;
@@ -213,15 +254,12 @@ get_tcs_out_patch0_patch_data_offset(struct 
radv_shader_context *ctx)
uint32_t num_tcs_outputs = 
util_last_bit64(ctx->shader_info->info.tcs.outputs_written);
uint32_t output_vertex_size = num_tcs_outputs * 16;
uint32_t pervertex_output_patch_size = ctx->tcs_vertices_per_patch * 
output_vertex_size;
-   LLVMValueRef num_patches = ac_unpack_param(&ctx->ac, 
ctx->tcs_offchip_layout, 0, 9);
+   unsigned num_patches = ctx->tcs_num_patches;
 
+   output_patch0_offset *= num_patches;
+   output_patch0_offset += pervertex_output_patch_size;

[Mesa-dev] [PATCH 07/14] radv: drop tess offchip layout for tcs.

2018-02-20 Thread Dave Airlie
From: Dave Airlie 

This removes the last TCS specific user sgpr.

Signed-off-by: Dave Airlie 
---
 src/amd/common/ac_nir_to_llvm.c | 118 ++--
 src/amd/common/ac_nir_to_llvm.h |   2 +-
 src/amd/vulkan/radv_pipeline.c  |   9 ---
 src/amd/vulkan/radv_shader.c|   2 +-
 4 files changed, 92 insertions(+), 39 deletions(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 396b98698e6..90b27603266 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -125,6 +125,7 @@ struct radv_shader_context {
uint64_t tcs_outputs_read;
uint32_t tcs_vertices_per_patch;
uint32_t tcs_num_inputs;
+   uint32_t tcs_num_patches;
 };
 
 static inline struct radv_shader_context *
@@ -319,6 +320,46 @@ static LLVMValueRef get_rel_patch_id(struct 
radv_shader_context *ctx)
}
 }
 
+static unsigned
+get_tcs_num_patches(struct radv_shader_context *ctx)
+{
+   unsigned num_tcs_input_cp = ctx->options->key.tcs.input_vertices;
+   unsigned num_tcs_output_cp = ctx->tcs_vertices_per_patch;
+   uint32_t input_vertex_size = ctx->tcs_num_inputs * 16;
+   uint32_t input_patch_size = ctx->options->key.tcs.input_vertices * 
input_vertex_size;
+   uint32_t num_tcs_outputs = 
util_last_bit64(ctx->shader_info->info.tcs.outputs_written);
+   uint32_t num_tcs_patch_outputs = 
util_last_bit64(ctx->shader_info->info.tcs.patch_outputs_written);
+   uint32_t output_vertex_size = num_tcs_outputs * 16;
+   uint32_t pervertex_output_patch_size = ctx->tcs_vertices_per_patch * 
output_vertex_size;
+   uint32_t output_patch_size = pervertex_output_patch_size + 
num_tcs_patch_outputs * 16;
+   unsigned num_patches;
+   unsigned hardware_lds_size;
+
+   /* Ensure that we only need one wave per SIMD so we don't need to check
+* resource usage. Also ensures that the number of tcs in and out
+* vertices per threadgroup are at most 256.
+*/
+   num_patches = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp) * 4;
+   /* Make sure that the data fits in LDS. This assumes the shaders only
+* use LDS for the inputs and outputs.
+*/
+   hardware_lds_size = ctx->options->chip_class >= CIK ? 65536 : 32768;
+   num_patches = MIN2(num_patches, hardware_lds_size / (input_patch_size + 
output_patch_size));
+   /* Make sure the output data fits in the offchip buffer */
+   num_patches = MIN2(num_patches, 
(ctx->options->tess_offchip_block_dw_size * 4) / output_patch_size);
+   /* Not necessary for correctness, but improves performance. The
+* specific value is taken from the proprietary driver.
+*/
+   num_patches = MIN2(num_patches, 40);
+
+   /* SI bug workaround - limit LS-HS threadgroups to only one wave. */
+   if (ctx->options->chip_class == SI) {
+   unsigned one_wave = 64 / MAX2(num_tcs_input_cp, 
num_tcs_output_cp);
+   num_patches = MIN2(num_patches, one_wave);
+   }
+   return num_patches;
+}
+
 /* Tessellation shaders pass outputs to the next shader using LDS.
  *
  * LS outputs = TCS inputs
@@ -378,17 +419,17 @@ get_tcs_out_patch0_offset(struct radv_shader_context *ctx)
uint32_t input_vertex_size = ctx->tcs_num_inputs * 16;
uint32_t input_patch_size = ctx->options->key.tcs.input_vertices * 
input_vertex_size;
uint32_t output_patch0_offset = input_patch_size;
-   LLVMValueRef num_patches = unpack_param(&ctx->ac, 
ctx->tcs_offchip_layout, 0, 9);
+   unsigned num_patches = ctx->tcs_num_patches;
 
+   output_patch0_offset *= num_patches;
output_patch0_offset /= 4;
-   return LLVMBuildMul(ctx->ac.builder,
-   num_patches,
-   LLVMConstInt(ctx->ac.i32, output_patch0_offset, 
false), "");
+   return LLVMConstInt(ctx->ac.i32, output_patch0_offset, false);
 }
 
 static LLVMValueRef
 get_tcs_out_patch0_patch_data_offset(struct radv_shader_context *ctx)
 {
+   assert (ctx->stage == MESA_SHADER_TESS_CTRL);
uint32_t input_vertex_size = ctx->tcs_num_inputs * 16;
uint32_t input_patch_size = ctx->options->key.tcs.input_vertices * 
input_vertex_size;
uint32_t output_patch0_offset = input_patch_size;
@@ -396,15 +437,13 @@ get_tcs_out_patch0_patch_data_offset(struct 
radv_shader_context *ctx)
uint32_t num_tcs_outputs = 
util_last_bit64(ctx->shader_info->info.tcs.outputs_written);
uint32_t output_vertex_size = num_tcs_outputs * 16;
uint32_t pervertex_output_patch_size = ctx->tcs_vertices_per_patch * 
output_vertex_size;
-   LLVMValueRef num_patches = unpack_param(&ctx->ac, 
ctx->tcs_offchip_layout, 0, 9);
+   unsigned num_patches = ctx->tcs_num_patches;
 
+   output_patch0_offset *= num_patches;
+   output_patch0_offset += pervertex_output_patch_size;
output_patch0_offset /= 4;
-