nir: Don't treat each element as a vec4 in compute shared memory

Connor Abbott Mon, 03 Jul 2017 20:33:07 -0700

I'm going to do a full CTS run later this week before I send the
patches to the list, but can you confirm that this series fixes your
problems: https://cgit.freedesktop.org/~cwabbott0/mesa/log/?h=radv-rewrite-vars
(it passes all the CTS dEQP.compute.* tests, but that's not saying
much...)


On Mon, Jul 3, 2017 at 6:33 PM, Connor Abbott <[email protected]> wrote:
> I think a better approach would be to translate the NIR variables
> directly to LLVM variables instead of creating one giant LLVM
> variable. I have a series coming up that does a similar thing for
> local variables, I can try and convert shared variables too.
>
> On Tue, Jun 27, 2017 at 8:53 AM, Alex Smith <[email protected]> 
> wrote:
>> Currently every element in shared memory (including individual elements
>> of an array) are treated as a vec4. For example, the following:
>>
>>     shared uint array[1024];
>>
>> gets treated as an array of 1024 vec4s with only the first component
>> ever used.
>>
>> This can be highly wasteful of LDS space. We have a shader which only
>> really requires ~24KB of shared memory, but since scalars are being
>> padded to vec4, it actually ends up needing 96KB. This is more than
>> the hardware supports, and leads to an LLVM compilation failure.
>>
>> Therefore, instead pack data in LDS according to the std430 layout.
>>
>> Signed-off-by: Alex Smith <[email protected]>
>> ---
>> RFC since I'm not sure whether this is the best approach to handle this,
>> and there may be something I've missed.
>>
>> No regressions seen in Mad Max or Dawn of War 3, which both have a few
>> shaders which use shared memory. Also no regressions in the CTS
>> 'dEQP-VK.compute.*' tests.
>>
>> I've also checked over various test cases using arrays/structs/etc. in
>> shared memory, and as far as I can tell the generated LLVM/GCN code is
>> correct.
>> ---
>>  src/amd/common/ac_nir_to_llvm.c | 55 
>> ++++++++++++++++++++++-------------------
>>  1 file changed, 29 insertions(+), 26 deletions(-)
>>
>> diff --git a/src/amd/common/ac_nir_to_llvm.c 
>> b/src/amd/common/ac_nir_to_llvm.c
>> index 468ce4d..a48bb3b 100644
>> --- a/src/amd/common/ac_nir_to_llvm.c
>> +++ b/src/amd/common/ac_nir_to_llvm.c
>> @@ -397,7 +397,7 @@ static LLVMValueRef get_shared_memory_ptr(struct 
>> nir_to_llvm_context *ctx,
>>         LLVMValueRef ptr;
>>         int addr_space;
>>
>> -       offset = LLVMConstInt(ctx->i32, idx * 16, false);
>> +       offset = LLVMConstInt(ctx->i32, idx, false);
>>
>>         ptr = ctx->shared_memory;
>>         ptr = LLVMBuildGEP(ctx->builder, ptr, &offset, 1, "");
>> @@ -2404,7 +2404,7 @@ static LLVMValueRef visit_load_ubo_buffer(struct 
>> nir_to_llvm_context *ctx,
>>
>>  static void
>>  radv_get_deref_offset(struct nir_to_llvm_context *ctx, nir_deref_var *deref,
>> -                     bool vs_in, unsigned *vertex_index_out,
>> +                     bool vs_in, bool compute_shared, unsigned 
>> *vertex_index_out,
>>                       LLVMValueRef *vertex_index_ref,
>>                       unsigned *const_out, LLVMValueRef *indir_out)
>>  {
>> @@ -2445,7 +2445,14 @@ radv_get_deref_offset(struct nir_to_llvm_context 
>> *ctx, nir_deref_var *deref,
>>                 if (tail->deref_type == nir_deref_type_array) {
>>                         nir_deref_array *deref_array = 
>> nir_deref_as_array(tail);
>>                         LLVMValueRef index, stride, local_offset;
>> -                       unsigned size = 
>> glsl_count_attribute_slots(tail->type, vs_in);
>> +
>> +                       /* For compute LDS, data is packed rather than 
>> treating
>> +                        * each individual element as a vec4. The returned
>> +                        * index is used to index into an i32 array so divide
>> +                        * by 4.
>> +                        */
>> +                       unsigned size = compute_shared ? 
>> glsl_get_std430_array_stride(tail->type, false) / 4
>> +                                                      : 
>> glsl_count_attribute_slots(tail->type, vs_in);
>>
>>                         const_offset += size * deref_array->base_offset;
>>                         if (deref_array->deref_array_type == 
>> nir_deref_array_type_direct)
>> @@ -2465,7 +2472,11 @@ radv_get_deref_offset(struct nir_to_llvm_context 
>> *ctx, nir_deref_var *deref,
>>
>>                         for (unsigned i = 0; i < deref_struct->index; i++) {
>>                                 const struct glsl_type *ft = 
>> glsl_get_struct_field(parent_type, i);
>> -                               const_offset += 
>> glsl_count_attribute_slots(ft, vs_in);
>> +
>> +                               /* Same as above. */
>> +                               unsigned size = compute_shared ? 
>> glsl_get_std430_array_stride(ft, false) / 4
>> +                                                              : 
>> glsl_count_attribute_slots(tail->type, vs_in);
>> +                               const_offset += size;
>>                         }
>>                 } else
>>                         unreachable("unsupported deref type");
>> @@ -2641,7 +2652,7 @@ load_tcs_input(struct nir_to_llvm_context *ctx,
>>         const bool is_compact = instr->variables[0]->var->data.compact;
>>         param = 
>> shader_io_get_unique_index(instr->variables[0]->var->data.location);
>>         radv_get_deref_offset(ctx, instr->variables[0],
>> -                             false, NULL, per_vertex ? &vertex_index : NULL,
>> +                             false, false, NULL, per_vertex ? &vertex_index 
>> : NULL,
>>                               &const_index, &indir_index);
>>
>>         stride = unpack_param(ctx, ctx->tcs_in_layout, 13, 8);
>> @@ -2673,7 +2684,7 @@ load_tcs_output(struct nir_to_llvm_context *ctx,
>>         const bool is_compact = instr->variables[0]->var->data.compact;
>>         param = 
>> shader_io_get_unique_index(instr->variables[0]->var->data.location);
>>         radv_get_deref_offset(ctx, instr->variables[0],
>> -                             false, NULL, per_vertex ? &vertex_index : NULL,
>> +                             false, false, NULL, per_vertex ? &vertex_index 
>> : NULL,
>>                               &const_index, &indir_index);
>>
>>         if (!instr->variables[0]->var->data.patch) {
>> @@ -2712,7 +2723,7 @@ store_tcs_output(struct nir_to_llvm_context *ctx,
>>         const bool is_compact = instr->variables[0]->var->data.compact;
>>
>>         radv_get_deref_offset(ctx, instr->variables[0],
>> -                             false, NULL, per_vertex ? &vertex_index : NULL,
>> +                             false, false, NULL, per_vertex ? &vertex_index 
>> : NULL,
>>                               &const_index, &indir_index);
>>
>>         param = 
>> shader_io_get_unique_index(instr->variables[0]->var->data.location);
>> @@ -2779,7 +2790,7 @@ load_tes_input(struct nir_to_llvm_context *ctx,
>>         const bool is_compact = instr->variables[0]->var->data.compact;
>>
>>         radv_get_deref_offset(ctx, instr->variables[0],
>> -                             false, NULL, per_vertex ? &vertex_index : NULL,
>> +                             false, false, NULL, per_vertex ? &vertex_index 
>> : NULL,
>>                               &const_index, &indir_index);
>>         param = 
>> shader_io_get_unique_index(instr->variables[0]->var->data.location);
>>         if (instr->variables[0]->var->data.location == 
>> VARYING_SLOT_CLIP_DIST0 &&
>> @@ -2808,7 +2819,7 @@ load_gs_input(struct nir_to_llvm_context *ctx,
>>         LLVMValueRef value[4], result;
>>         unsigned vertex_index;
>>         radv_get_deref_offset(ctx, instr->variables[0],
>> -                             false, &vertex_index, NULL,
>> +                             false, false, &vertex_index, NULL,
>>                               &const_index, &indir_index);
>>         vtx_offset_param = vertex_index;
>>         assert(vtx_offset_param < 6);
>> @@ -2849,8 +2860,9 @@ static LLVMValueRef visit_load_var(struct 
>> nir_to_llvm_context *ctx,
>>         unsigned const_index;
>>         bool vs_in = ctx->stage == MESA_SHADER_VERTEX &&
>>                      instr->variables[0]->var->data.mode == 
>> nir_var_shader_in;
>> -       radv_get_deref_offset(ctx, instr->variables[0], vs_in, NULL, NULL,
>> -                                     &const_index, &indir_index);
>> +       bool compute_shared = instr->variables[0]->var->data.mode == 
>> nir_var_shared;
>> +       radv_get_deref_offset(ctx, instr->variables[0], vs_in, 
>> compute_shared,
>> +                             NULL, NULL, &const_index, &indir_index);
>>
>>         if (instr->dest.ssa.bit_size == 64)
>>                 ve *= 2;
>> @@ -2925,9 +2937,6 @@ static LLVMValueRef visit_load_var(struct 
>> nir_to_llvm_context *ctx,
>>                 LLVMValueRef ptr = get_shared_memory_ptr(ctx, idx, ctx->i32);
>>                 LLVMValueRef derived_ptr;
>>
>> -               if (indir_index)
>> -                       indir_index = LLVMBuildMul(ctx->builder, 
>> indir_index, LLVMConstInt(ctx->i32, 4, false), "");
>> -
>>                 for (unsigned chan = 0; chan < ve; chan++) {
>>                         LLVMValueRef index = LLVMConstInt(ctx->i32, chan, 
>> false);
>>                         if (indir_index)
>> @@ -2955,7 +2964,8 @@ visit_store_var(struct nir_to_llvm_context *ctx,
>>         int writemask = instr->const_index[0];
>>         LLVMValueRef indir_index;
>>         unsigned const_index;
>> -       radv_get_deref_offset(ctx, instr->variables[0], false,
>> +       bool compute_shared = instr->variables[0]->var->data.mode == 
>> nir_var_shared;
>> +       radv_get_deref_offset(ctx, instr->variables[0], false, 
>> compute_shared,
>>                               NULL, NULL, &const_index, &indir_index);
>>
>>         if (get_elem_bits(ctx, LLVMTypeOf(src)) == 64) {
>> @@ -3040,9 +3050,6 @@ visit_store_var(struct nir_to_llvm_context *ctx,
>>         case nir_var_shared: {
>>                 LLVMValueRef ptr = get_shared_memory_ptr(ctx, idx, ctx->i32);
>>
>> -               if (indir_index)
>> -                       indir_index = LLVMBuildMul(ctx->builder, 
>> indir_index, LLVMConstInt(ctx->i32, 4, false), "");
>> -
>>                 for (unsigned chan = 0; chan < 8; chan++) {
>>                         if (!(writemask & (1 << chan)))
>>                                 continue;
>> @@ -5756,9 +5763,9 @@ handle_shader_outputs_post(struct nir_to_llvm_context 
>> *ctx)
>>
>>  static void
>>  handle_shared_compute_var(struct nir_to_llvm_context *ctx,
>> -                         struct nir_variable *variable, uint32_t *offset, 
>> int idx)
>> +                         struct nir_variable *variable, uint32_t *offset)
>>  {
>> -       unsigned size = glsl_count_attribute_slots(variable->type, false);
>> +       unsigned size = glsl_get_std430_array_stride(variable->type, false);
>>         variable->data.driver_location = *offset;
>>         *offset += size;
>>  }
>> @@ -5926,16 +5933,12 @@ LLVMModuleRef 
>> ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
>>                 nir_foreach_variable(variable, &nir->shared)
>>                         num_shared++;
>>                 if (num_shared) {
>> -                       int idx = 0;
>>                         uint32_t shared_size = 0;
>>                         LLVMValueRef var;
>>                         LLVMTypeRef i8p = LLVMPointerType(ctx.i8, 
>> LOCAL_ADDR_SPACE);
>> -                       nir_foreach_variable(variable, &nir->shared) {
>> -                               handle_shared_compute_var(&ctx, variable, 
>> &shared_size, idx);
>> -                               idx++;
>> -                       }
>> +                       nir_foreach_variable(variable, &nir->shared)
>> +                               handle_shared_compute_var(&ctx, variable, 
>> &shared_size);
>>
>> -                       shared_size *= 16;
>>                         var = LLVMAddGlobalInAddressSpace(ctx.module,
>>                                                           
>> LLVMArrayType(ctx.i8, shared_size),
>>                                                           "compute_lds",
>> --
>> 2.9.4
>>
>> _______________________________________________
>> mesa-dev mailing list
>> [email protected]
>> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
_______________________________________________
mesa-dev mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [RFC PATCH 2/2] ac/nir: Don't treat each element as a vec4 in compute shared memory

Reply via email to