I'm going to do a full CTS run later this week before I send the patches to the list, but can you confirm that this series fixes your problems: https://cgit.freedesktop.org/~cwabbott0/mesa/log/?h=radv-rewrite-vars (it passes all the CTS dEQP-VK.compute.* tests, but that's not saying much...)
On Mon, Jul 3, 2017 at 6:33 PM, Connor Abbott <[email protected]> wrote: > I think a better approach would be to translate the NIR variables > directly to LLVM variables instead of creating one giant LLVM > variable. I have a series coming up that does a similar thing for > local variables, I can try and convert shared variables too. > > On Tue, Jun 27, 2017 at 8:53 AM, Alex Smith <[email protected]> > wrote: >> Currently every element in shared memory (including individual elements >> of an array) are treated as a vec4. For example, the following: >> >> shared uint array[1024]; >> >> gets treated as an array of 1024 vec4s with only the first component >> ever used. >> >> This can be highly wasteful of LDS space. We have a shader which only >> really requires ~24KB of shared memory, but since scalars are being >> padded to vec4, it actually ends up needing 96KB. This is more than >> the hardware supports, and leads to an LLVM compilation failure. >> >> Therefore, instead pack data in LDS according to the std430 layout. >> >> Signed-off-by: Alex Smith <[email protected]> >> --- >> RFC since I'm not sure whether this is the best approach to handle this, >> and there may be something I've missed. >> >> No regressions seen in Mad Max or Dawn of War 3, which both have a few >> shaders which use shared memory. Also no regressions in the CTS >> 'dEQP-VK.compute.*' tests. >> >> I've also checked over various test cases using arrays/structs/etc. in >> shared memory, and as far as I can tell the generated LLVM/GCN code is >> correct. 
>> --- >> src/amd/common/ac_nir_to_llvm.c | 55 >> ++++++++++++++++++++++------------------- >> 1 file changed, 29 insertions(+), 26 deletions(-) >> >> diff --git a/src/amd/common/ac_nir_to_llvm.c >> b/src/amd/common/ac_nir_to_llvm.c >> index 468ce4d..a48bb3b 100644 >> --- a/src/amd/common/ac_nir_to_llvm.c >> +++ b/src/amd/common/ac_nir_to_llvm.c >> @@ -397,7 +397,7 @@ static LLVMValueRef get_shared_memory_ptr(struct >> nir_to_llvm_context *ctx, >> LLVMValueRef ptr; >> int addr_space; >> >> - offset = LLVMConstInt(ctx->i32, idx * 16, false); >> + offset = LLVMConstInt(ctx->i32, idx, false); >> >> ptr = ctx->shared_memory; >> ptr = LLVMBuildGEP(ctx->builder, ptr, &offset, 1, ""); >> @@ -2404,7 +2404,7 @@ static LLVMValueRef visit_load_ubo_buffer(struct >> nir_to_llvm_context *ctx, >> >> static void >> radv_get_deref_offset(struct nir_to_llvm_context *ctx, nir_deref_var *deref, >> - bool vs_in, unsigned *vertex_index_out, >> + bool vs_in, bool compute_shared, unsigned >> *vertex_index_out, >> LLVMValueRef *vertex_index_ref, >> unsigned *const_out, LLVMValueRef *indir_out) >> { >> @@ -2445,7 +2445,14 @@ radv_get_deref_offset(struct nir_to_llvm_context >> *ctx, nir_deref_var *deref, >> if (tail->deref_type == nir_deref_type_array) { >> nir_deref_array *deref_array = >> nir_deref_as_array(tail); >> LLVMValueRef index, stride, local_offset; >> - unsigned size = >> glsl_count_attribute_slots(tail->type, vs_in); >> + >> + /* For compute LDS, data is packed rather than >> treating >> + * each individual element as a vec4. The returned >> + * index is used to index into an i32 array so divide >> + * by 4. >> + */ >> + unsigned size = compute_shared ? 
>> glsl_get_std430_array_stride(tail->type, false) / 4 >> + : >> glsl_count_attribute_slots(tail->type, vs_in); >> >> const_offset += size * deref_array->base_offset; >> if (deref_array->deref_array_type == >> nir_deref_array_type_direct) >> @@ -2465,7 +2472,11 @@ radv_get_deref_offset(struct nir_to_llvm_context >> *ctx, nir_deref_var *deref, >> >> for (unsigned i = 0; i < deref_struct->index; i++) { >> const struct glsl_type *ft = >> glsl_get_struct_field(parent_type, i); >> - const_offset += >> glsl_count_attribute_slots(ft, vs_in); >> + >> + /* Same as above. */ >> + unsigned size = compute_shared ? >> glsl_get_std430_array_stride(ft, false) / 4 >> + : >> glsl_count_attribute_slots(ft, vs_in); >> + const_offset += size; >> } >> } else >> unreachable("unsupported deref type"); >> @@ -2641,7 +2652,7 @@ load_tcs_input(struct nir_to_llvm_context *ctx, >> const bool is_compact = instr->variables[0]->var->data.compact; >> param = >> shader_io_get_unique_index(instr->variables[0]->var->data.location); >> radv_get_deref_offset(ctx, instr->variables[0], >> - false, NULL, per_vertex ? &vertex_index : NULL, >> + false, false, NULL, per_vertex ? &vertex_index >> : NULL, >> &const_index, &indir_index); >> >> stride = unpack_param(ctx, ctx->tcs_in_layout, 13, 8); >> @@ -2673,7 +2684,7 @@ load_tcs_output(struct nir_to_llvm_context *ctx, >> const bool is_compact = instr->variables[0]->var->data.compact; >> param = >> shader_io_get_unique_index(instr->variables[0]->var->data.location); >> radv_get_deref_offset(ctx, instr->variables[0], >> - false, NULL, per_vertex ? &vertex_index : NULL, >> + false, false, NULL, per_vertex ? &vertex_index >> : NULL, >> &const_index, &indir_index); >> >> if (!instr->variables[0]->var->data.patch) { >> @@ -2712,7 +2723,7 @@ store_tcs_output(struct nir_to_llvm_context *ctx, >> const bool is_compact = instr->variables[0]->var->data.compact; >> >> radv_get_deref_offset(ctx, instr->variables[0], >> - false, NULL, per_vertex ? 
&vertex_index : NULL, >> + false, false, NULL, per_vertex ? &vertex_index >> : NULL, >> &const_index, &indir_index); >> >> param = >> shader_io_get_unique_index(instr->variables[0]->var->data.location); >> @@ -2779,7 +2790,7 @@ load_tes_input(struct nir_to_llvm_context *ctx, >> const bool is_compact = instr->variables[0]->var->data.compact; >> >> radv_get_deref_offset(ctx, instr->variables[0], >> - false, NULL, per_vertex ? &vertex_index : NULL, >> + false, false, NULL, per_vertex ? &vertex_index >> : NULL, >> &const_index, &indir_index); >> param = >> shader_io_get_unique_index(instr->variables[0]->var->data.location); >> if (instr->variables[0]->var->data.location == >> VARYING_SLOT_CLIP_DIST0 && >> @@ -2808,7 +2819,7 @@ load_gs_input(struct nir_to_llvm_context *ctx, >> LLVMValueRef value[4], result; >> unsigned vertex_index; >> radv_get_deref_offset(ctx, instr->variables[0], >> - false, &vertex_index, NULL, >> + false, false, &vertex_index, NULL, >> &const_index, &indir_index); >> vtx_offset_param = vertex_index; >> assert(vtx_offset_param < 6); >> @@ -2849,8 +2860,9 @@ static LLVMValueRef visit_load_var(struct >> nir_to_llvm_context *ctx, >> unsigned const_index; >> bool vs_in = ctx->stage == MESA_SHADER_VERTEX && >> instr->variables[0]->var->data.mode == >> nir_var_shader_in; >> - radv_get_deref_offset(ctx, instr->variables[0], vs_in, NULL, NULL, >> - &const_index, &indir_index); >> + bool compute_shared = instr->variables[0]->var->data.mode == >> nir_var_shared; >> + radv_get_deref_offset(ctx, instr->variables[0], vs_in, >> compute_shared, >> + NULL, NULL, &const_index, &indir_index); >> >> if (instr->dest.ssa.bit_size == 64) >> ve *= 2; >> @@ -2925,9 +2937,6 @@ static LLVMValueRef visit_load_var(struct >> nir_to_llvm_context *ctx, >> LLVMValueRef ptr = get_shared_memory_ptr(ctx, idx, ctx->i32); >> LLVMValueRef derived_ptr; >> >> - if (indir_index) >> - indir_index = LLVMBuildMul(ctx->builder, >> indir_index, LLVMConstInt(ctx->i32, 4, false), ""); >> - >> for 
(unsigned chan = 0; chan < ve; chan++) { >> LLVMValueRef index = LLVMConstInt(ctx->i32, chan, >> false); >> if (indir_index) >> @@ -2955,7 +2964,8 @@ visit_store_var(struct nir_to_llvm_context *ctx, >> int writemask = instr->const_index[0]; >> LLVMValueRef indir_index; >> unsigned const_index; >> - radv_get_deref_offset(ctx, instr->variables[0], false, >> + bool compute_shared = instr->variables[0]->var->data.mode == >> nir_var_shared; >> + radv_get_deref_offset(ctx, instr->variables[0], false, >> compute_shared, >> NULL, NULL, &const_index, &indir_index); >> >> if (get_elem_bits(ctx, LLVMTypeOf(src)) == 64) { >> @@ -3040,9 +3050,6 @@ visit_store_var(struct nir_to_llvm_context *ctx, >> case nir_var_shared: { >> LLVMValueRef ptr = get_shared_memory_ptr(ctx, idx, ctx->i32); >> >> - if (indir_index) >> - indir_index = LLVMBuildMul(ctx->builder, >> indir_index, LLVMConstInt(ctx->i32, 4, false), ""); >> - >> for (unsigned chan = 0; chan < 8; chan++) { >> if (!(writemask & (1 << chan))) >> continue; >> @@ -5756,9 +5763,9 @@ handle_shader_outputs_post(struct nir_to_llvm_context >> *ctx) >> >> static void >> handle_shared_compute_var(struct nir_to_llvm_context *ctx, >> - struct nir_variable *variable, uint32_t *offset, >> int idx) >> + struct nir_variable *variable, uint32_t *offset) >> { >> - unsigned size = glsl_count_attribute_slots(variable->type, false); >> + unsigned size = glsl_get_std430_array_stride(variable->type, false); >> variable->data.driver_location = *offset; >> *offset += size; >> } >> @@ -5926,16 +5933,12 @@ LLVMModuleRef >> ac_translate_nir_to_llvm(LLVMTargetMachineRef tm, >> nir_foreach_variable(variable, &nir->shared) >> num_shared++; >> if (num_shared) { >> - int idx = 0; >> uint32_t shared_size = 0; >> LLVMValueRef var; >> LLVMTypeRef i8p = LLVMPointerType(ctx.i8, >> LOCAL_ADDR_SPACE); >> - nir_foreach_variable(variable, &nir->shared) { >> - handle_shared_compute_var(&ctx, variable, >> &shared_size, idx); >> - idx++; >> - } >> + 
nir_foreach_variable(variable, &nir->shared) >> + handle_shared_compute_var(&ctx, variable, >> &shared_size); >> >> - shared_size *= 16; >> var = LLVMAddGlobalInAddressSpace(ctx.module, >> >> LLVMArrayType(ctx.i8, shared_size), >> "compute_lds", >> -- >> 2.9.4 >> >> _______________________________________________ >> mesa-dev mailing list >> [email protected] >> https://lists.freedesktop.org/mailman/listinfo/mesa-dev _______________________________________________ mesa-dev mailing list [email protected] https://lists.freedesktop.org/mailman/listinfo/mesa-dev
