Re: [Mesa-dev] [PATCH 4/4] radv: reduce the number of loaded channels for vertex input fetches

2019-02-13 Thread Samuel Pitoiset


On 2/13/19 10:59 PM, Bas Nieuwenhuizen wrote:

On Tue, Feb 12, 2019 at 3:07 PM Samuel Pitoiset
 wrote:

It's unnecessary to load more channels than the vertex attribute
format. The remaining channels are filled with 0 for y and z,
and 1 for w.

29077 shaders in 15096 tests
Totals:
SGPRS: 1321605 -> 1318869 (-0.21 %)
VGPRS: 935236 -> 932252 (-0.32 %)
Spilled SGPRs: 24860 -> 24776 (-0.34 %)
Code Size: 49832348 -> 49819464 (-0.03 %) bytes
Max Waves: 242101 -> 242611 (0.21 %)

Totals from affected shaders:
SGPRS: 93675 -> 90939 (-2.92 %)
VGPRS: 58016 -> 55032 (-5.14 %)
Spilled SGPRs: 172 -> 88 (-48.84 %)
Code Size: 2862740 -> 2849856 (-0.45 %) bytes
Max Waves: 15474 -> 15984 (3.30 %)

This mostly helps Croteam games (Talos/Sam2017).

Signed-off-by: Samuel Pitoiset 
---
  src/amd/vulkan/radv_nir_to_llvm.c | 83 ++-
  1 file changed, 81 insertions(+), 2 deletions(-)

diff --git a/src/amd/vulkan/radv_nir_to_llvm.c 
b/src/amd/vulkan/radv_nir_to_llvm.c
index 7f74678d5f1..b1e0c64e4e1 100644
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -1967,6 +1967,72 @@ adjust_vertex_fetch_alpha(struct radv_shader_context 
*ctx,
 return alpha;
  }

+static unsigned
+get_num_channels_from_data_format(unsigned data_format)
+{
+   switch (data_format) {
+   case V_008F0C_BUF_DATA_FORMAT_8:
+   case V_008F0C_BUF_DATA_FORMAT_16:
+   case V_008F0C_BUF_DATA_FORMAT_32:
+   return 1;
+   case V_008F0C_BUF_DATA_FORMAT_8_8:
+   case V_008F0C_BUF_DATA_FORMAT_16_16:
+   case V_008F0C_BUF_DATA_FORMAT_32_32:
+   return 2;
+   case V_008F0C_BUF_DATA_FORMAT_10_11_11:
+   case V_008F0C_BUF_DATA_FORMAT_11_11_10:
+   case V_008F0C_BUF_DATA_FORMAT_32_32_32:
+   return 3;
+   case V_008F0C_BUF_DATA_FORMAT_8_8_8_8:
+   case V_008F0C_BUF_DATA_FORMAT_10_10_10_2:
+   case V_008F0C_BUF_DATA_FORMAT_2_10_10_10:
+   case V_008F0C_BUF_DATA_FORMAT_16_16_16_16:
+   case V_008F0C_BUF_DATA_FORMAT_32_32_32_32:
+   return 4;
+   default:
+   break;
+   }
+
+   return 4;
+}
+
+static LLVMValueRef
+radv_fixup_vertex_input_fetches(struct radv_shader_context *ctx,
+   LLVMValueRef value,
+   unsigned num_channels,
+   bool is_float)
+{
+   LLVMValueRef zero = is_float ? ctx->ac.f32_0 : ctx->ac.i32_0;
+   LLVMValueRef one = is_float ? ctx->ac.f32_1 : ctx->ac.i32_1;
+   LLVMTypeRef elemtype;
+   LLVMValueRef chan[4];
+
+   if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
+   unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));
+
+   if (num_channels == 4 && vec_size == 4)
+   return value;

Just num_channels == vec_size ?

Ok.



+
+   num_channels = MIN2(num_channels, vec_size);
+
+   for (unsigned i = 0; i < num_channels; i++)
+   chan[i] = ac_llvm_extract_elem(&ctx->ac, value, i);
+
+   elemtype = LLVMGetElementType(LLVMTypeOf(value));
+   } else {
+   if (num_channels) {
+   assert(num_channels == 1);
+   chan[0] = value;
+   }
+   elemtype = LLVMTypeOf(value);
+   }
+
+   for (unsigned i = num_channels; i < 4; i++)
+   chan[i] = i == 3 ? one : zero;
+
+   return ac_build_gather_values(&ctx->ac, chan, 4);
+}
+
  static void
  handle_vs_input_decl(struct radv_shader_context *ctx,
  struct nir_variable *variable)
@@ -1979,7 +2045,7 @@ handle_vs_input_decl(struct radv_shader_context *ctx,
 unsigned attrib_count = glsl_count_attribute_slots(variable->type, 
true);
 uint8_t input_usage_mask =
 
ctx->shader_info->info.vs.input_usage_mask[variable->data.location];
-   unsigned num_channels = util_last_bit(input_usage_mask);
+   unsigned num_input_channels = util_last_bit(input_usage_mask);

 variable->data.driver_location = variable->data.location * 4;

@@ -1987,6 +2053,10 @@ handle_vs_input_decl(struct radv_shader_context *ctx,
 for (unsigned i = 0; i < attrib_count; ++i) {
 LLVMValueRef output[4];
 unsigned attrib_index = variable->data.location + i - 
VERT_ATTRIB_GENERIC0;
+   unsigned attrib_format = 
ctx->options->key.vs.vertex_attribute_formats[attrib_index];
+   unsigned data_format = attrib_format & 0x0f;
+   unsigned num_format = (attrib_format >> 4) & 0x07;
+   bool is_float = num_format == V_008F0C_BUF_NUM_FORMAT_FLOAT;

 if (ctx->options->key.vs.instance_rate_inputs & (1u << 
attrib_index)) {
 uint32_t divisor = 
ctx->options->key.vs.instance_rate_divisors[attrib_index];
@@ -2018,12 +2088,21 @@ handle_vs_input_decl(struct radv_shader_context *ctx,

 

Re: [Mesa-dev] [PATCH 4/4] radv: reduce the number of loaded channels for vertex input fetches

2019-02-13 Thread Bas Nieuwenhuizen
On Tue, Feb 12, 2019 at 3:07 PM Samuel Pitoiset
 wrote:
>
> It's unnecessary to load more channels than the vertex attribute
> format. The remaining channels are filled with 0 for y and z,
> and 1 for w.
>
> 29077 shaders in 15096 tests
> Totals:
> SGPRS: 1321605 -> 1318869 (-0.21 %)
> VGPRS: 935236 -> 932252 (-0.32 %)
> Spilled SGPRs: 24860 -> 24776 (-0.34 %)
> Code Size: 49832348 -> 49819464 (-0.03 %) bytes
> Max Waves: 242101 -> 242611 (0.21 %)
>
> Totals from affected shaders:
> SGPRS: 93675 -> 90939 (-2.92 %)
> VGPRS: 58016 -> 55032 (-5.14 %)
> Spilled SGPRs: 172 -> 88 (-48.84 %)
> Code Size: 2862740 -> 2849856 (-0.45 %) bytes
> Max Waves: 15474 -> 15984 (3.30 %)
>
> This mostly helps Croteam games (Talos/Sam2017).
>
> Signed-off-by: Samuel Pitoiset 
> ---
>  src/amd/vulkan/radv_nir_to_llvm.c | 83 ++-
>  1 file changed, 81 insertions(+), 2 deletions(-)
>
> diff --git a/src/amd/vulkan/radv_nir_to_llvm.c 
> b/src/amd/vulkan/radv_nir_to_llvm.c
> index 7f74678d5f1..b1e0c64e4e1 100644
> --- a/src/amd/vulkan/radv_nir_to_llvm.c
> +++ b/src/amd/vulkan/radv_nir_to_llvm.c
> @@ -1967,6 +1967,72 @@ adjust_vertex_fetch_alpha(struct radv_shader_context 
> *ctx,
> return alpha;
>  }
>
> +static unsigned
> +get_num_channels_from_data_format(unsigned data_format)
> +{
> +   switch (data_format) {
> +   case V_008F0C_BUF_DATA_FORMAT_8:
> +   case V_008F0C_BUF_DATA_FORMAT_16:
> +   case V_008F0C_BUF_DATA_FORMAT_32:
> +   return 1;
> +   case V_008F0C_BUF_DATA_FORMAT_8_8:
> +   case V_008F0C_BUF_DATA_FORMAT_16_16:
> +   case V_008F0C_BUF_DATA_FORMAT_32_32:
> +   return 2;
> +   case V_008F0C_BUF_DATA_FORMAT_10_11_11:
> +   case V_008F0C_BUF_DATA_FORMAT_11_11_10:
> +   case V_008F0C_BUF_DATA_FORMAT_32_32_32:
> +   return 3;
> +   case V_008F0C_BUF_DATA_FORMAT_8_8_8_8:
> +   case V_008F0C_BUF_DATA_FORMAT_10_10_10_2:
> +   case V_008F0C_BUF_DATA_FORMAT_2_10_10_10:
> +   case V_008F0C_BUF_DATA_FORMAT_16_16_16_16:
> +   case V_008F0C_BUF_DATA_FORMAT_32_32_32_32:
> +   return 4;
> +   default:
> +   break;
> +   }
> +
> +   return 4;
> +}
> +
> +static LLVMValueRef
> +radv_fixup_vertex_input_fetches(struct radv_shader_context *ctx,
> +   LLVMValueRef value,
> +   unsigned num_channels,
> +   bool is_float)
> +{
> +   LLVMValueRef zero = is_float ? ctx->ac.f32_0 : ctx->ac.i32_0;
> +   LLVMValueRef one = is_float ? ctx->ac.f32_1 : ctx->ac.i32_1;
> +   LLVMTypeRef elemtype;
> +   LLVMValueRef chan[4];
> +
> +   if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
> +   unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));
> +
> +   if (num_channels == 4 && vec_size == 4)
> +   return value;

Just num_channels == vec_size ?

> +
> +   num_channels = MIN2(num_channels, vec_size);
> +
> +   for (unsigned i = 0; i < num_channels; i++)
> +   chan[i] = ac_llvm_extract_elem(&ctx->ac, value, i);
> +
> +   elemtype = LLVMGetElementType(LLVMTypeOf(value));
> +   } else {
> +   if (num_channels) {
> +   assert(num_channels == 1);
> +   chan[0] = value;
> +   }
> +   elemtype = LLVMTypeOf(value);
> +   }
> +
> +   for (unsigned i = num_channels; i < 4; i++)
> +   chan[i] = i == 3 ? one : zero;
> +
> +   return ac_build_gather_values(&ctx->ac, chan, 4);
> +}
> +
>  static void
>  handle_vs_input_decl(struct radv_shader_context *ctx,
>  struct nir_variable *variable)
> @@ -1979,7 +2045,7 @@ handle_vs_input_decl(struct radv_shader_context *ctx,
> unsigned attrib_count = glsl_count_attribute_slots(variable->type, 
> true);
> uint8_t input_usage_mask =
> 
> ctx->shader_info->info.vs.input_usage_mask[variable->data.location];
> -   unsigned num_channels = util_last_bit(input_usage_mask);
> +   unsigned num_input_channels = util_last_bit(input_usage_mask);
>
> variable->data.driver_location = variable->data.location * 4;
>
> @@ -1987,6 +2053,10 @@ handle_vs_input_decl(struct radv_shader_context *ctx,
> for (unsigned i = 0; i < attrib_count; ++i) {
> LLVMValueRef output[4];
> unsigned attrib_index = variable->data.location + i - 
> VERT_ATTRIB_GENERIC0;
> +   unsigned attrib_format = 
> ctx->options->key.vs.vertex_attribute_formats[attrib_index];
> +   unsigned data_format = attrib_format & 0x0f;
> +   unsigned num_format = (attrib_format >> 4) & 0x07;
> +   bool is_float = num_format == V_008F0C_BUF_NUM_FORMAT_FLOAT;
>
> if (ctx->options->key.vs.instance_rate_inputs & (1u << 
> attrib_index)) {
>

[Mesa-dev] [PATCH 4/4] radv: reduce the number of loaded channels for vertex input fetches

2019-02-12 Thread Samuel Pitoiset
It's unnecessary to load more channels than the vertex attribute
format. The remaining channels are filled with 0 for y and z,
and 1 for w.

29077 shaders in 15096 tests
Totals:
SGPRS: 1321605 -> 1318869 (-0.21 %)
VGPRS: 935236 -> 932252 (-0.32 %)
Spilled SGPRs: 24860 -> 24776 (-0.34 %)
Code Size: 49832348 -> 49819464 (-0.03 %) bytes
Max Waves: 242101 -> 242611 (0.21 %)

Totals from affected shaders:
SGPRS: 93675 -> 90939 (-2.92 %)
VGPRS: 58016 -> 55032 (-5.14 %)
Spilled SGPRs: 172 -> 88 (-48.84 %)
Code Size: 2862740 -> 2849856 (-0.45 %) bytes
Max Waves: 15474 -> 15984 (3.30 %)

This mostly helps Croteam games (Talos/Sam2017).

Signed-off-by: Samuel Pitoiset 
---
 src/amd/vulkan/radv_nir_to_llvm.c | 83 ++-
 1 file changed, 81 insertions(+), 2 deletions(-)

diff --git a/src/amd/vulkan/radv_nir_to_llvm.c 
b/src/amd/vulkan/radv_nir_to_llvm.c
index 7f74678d5f1..b1e0c64e4e1 100644
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -1967,6 +1967,72 @@ adjust_vertex_fetch_alpha(struct radv_shader_context 
*ctx,
return alpha;
 }
 
+static unsigned
+get_num_channels_from_data_format(unsigned data_format)
+{
+   switch (data_format) {
+   case V_008F0C_BUF_DATA_FORMAT_8:
+   case V_008F0C_BUF_DATA_FORMAT_16:
+   case V_008F0C_BUF_DATA_FORMAT_32:
+   return 1;
+   case V_008F0C_BUF_DATA_FORMAT_8_8:
+   case V_008F0C_BUF_DATA_FORMAT_16_16:
+   case V_008F0C_BUF_DATA_FORMAT_32_32:
+   return 2;
+   case V_008F0C_BUF_DATA_FORMAT_10_11_11:
+   case V_008F0C_BUF_DATA_FORMAT_11_11_10:
+   case V_008F0C_BUF_DATA_FORMAT_32_32_32:
+   return 3;
+   case V_008F0C_BUF_DATA_FORMAT_8_8_8_8:
+   case V_008F0C_BUF_DATA_FORMAT_10_10_10_2:
+   case V_008F0C_BUF_DATA_FORMAT_2_10_10_10:
+   case V_008F0C_BUF_DATA_FORMAT_16_16_16_16:
+   case V_008F0C_BUF_DATA_FORMAT_32_32_32_32:
+   return 4;
+   default:
+   break;
+   }
+
+   return 4;
+}
+
+static LLVMValueRef
+radv_fixup_vertex_input_fetches(struct radv_shader_context *ctx,
+   LLVMValueRef value,
+   unsigned num_channels,
+   bool is_float)
+{
+   LLVMValueRef zero = is_float ? ctx->ac.f32_0 : ctx->ac.i32_0;
+   LLVMValueRef one = is_float ? ctx->ac.f32_1 : ctx->ac.i32_1;
+   LLVMTypeRef elemtype;
+   LLVMValueRef chan[4];
+
+   if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
+   unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));
+
+   if (num_channels == 4 && vec_size == 4)
+   return value;
+
+   num_channels = MIN2(num_channels, vec_size);
+
+   for (unsigned i = 0; i < num_channels; i++)
+   chan[i] = ac_llvm_extract_elem(&ctx->ac, value, i);
+
+   elemtype = LLVMGetElementType(LLVMTypeOf(value));
+   } else {
+   if (num_channels) {
+   assert(num_channels == 1);
+   chan[0] = value;
+   }
+   elemtype = LLVMTypeOf(value);
+   }
+
+   for (unsigned i = num_channels; i < 4; i++)
+   chan[i] = i == 3 ? one : zero;
+
+   return ac_build_gather_values(&ctx->ac, chan, 4);
+}
+
 static void
 handle_vs_input_decl(struct radv_shader_context *ctx,
 struct nir_variable *variable)
@@ -1979,7 +2045,7 @@ handle_vs_input_decl(struct radv_shader_context *ctx,
unsigned attrib_count = glsl_count_attribute_slots(variable->type, 
true);
uint8_t input_usage_mask =

ctx->shader_info->info.vs.input_usage_mask[variable->data.location];
-   unsigned num_channels = util_last_bit(input_usage_mask);
+   unsigned num_input_channels = util_last_bit(input_usage_mask);
 
variable->data.driver_location = variable->data.location * 4;
 
@@ -1987,6 +2053,10 @@ handle_vs_input_decl(struct radv_shader_context *ctx,
for (unsigned i = 0; i < attrib_count; ++i) {
LLVMValueRef output[4];
unsigned attrib_index = variable->data.location + i - 
VERT_ATTRIB_GENERIC0;
+   unsigned attrib_format = 
ctx->options->key.vs.vertex_attribute_formats[attrib_index];
+   unsigned data_format = attrib_format & 0x0f;
+   unsigned num_format = (attrib_format >> 4) & 0x07;
+   bool is_float = num_format == V_008F0C_BUF_NUM_FORMAT_FLOAT;
 
if (ctx->options->key.vs.instance_rate_inputs & (1u << 
attrib_index)) {
uint32_t divisor = 
ctx->options->key.vs.instance_rate_divisors[attrib_index];
@@ -2018,12 +2088,21 @@ handle_vs_input_decl(struct radv_shader_context *ctx,
 
t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset);
 
+   /* Adjust the number of channels to load based on the vertex
+