Hi Connor,

Can confirm this does work here, thanks!

Makes this patch redundant too, since that problem's also fixed by your
changes: https://lists.freedesktop.org/archives/mesa-dev/2017-June/161559.html

Alex

On 4 July 2017 at 04:32, Connor Abbott <[email protected]> wrote:
> I'm going to do a full CTS run later this week before I send the
> patches to the list, but can you confirm that this series fixes your
> problems:
> https://cgit.freedesktop.org/~cwabbott0/mesa/log/?h=radv-rewrite-vars
> (it passes all the CTS dEQP.compute.* tests, but that's not saying
> much...)
>
> On Mon, Jul 3, 2017 at 6:33 PM, Connor Abbott <[email protected]> wrote:
>> I think a better approach would be to translate the NIR variables
>> directly to LLVM variables instead of creating one giant LLVM
>> variable. I have a series coming up that does a similar thing for
>> local variables, I can try and convert shared variables too.
>>
>> On Tue, Jun 27, 2017 at 8:53 AM, Alex Smith <[email protected]> wrote:
>>> Currently every element in shared memory (including individual elements
>>> of an array) is treated as a vec4. For example, the following:
>>>
>>> shared uint array[1024];
>>>
>>> gets treated as an array of 1024 vec4s with only the first component
>>> ever used.
>>>
>>> This can be highly wasteful of LDS space. We have a shader which only
>>> really requires ~24KB of shared memory, but since scalars are being
>>> padded to vec4, it actually ends up needing 96KB. This is more than
>>> the hardware supports, and leads to an LLVM compilation failure.
>>>
>>> Therefore, instead pack data in LDS according to the std430 layout.
>>>
>>> Signed-off-by: Alex Smith <[email protected]>
>>> ---
>>> RFC since I'm not sure whether this is the best approach to handle this,
>>> and there may be something I've missed.
>>>
>>> No regressions seen in Mad Max or Dawn of War 3, which both have a few
>>> shaders which use shared memory. Also no regressions in the CTS
>>> 'dEQP-VK.compute.*' tests.
>>>
>>> I've also checked over various test cases using arrays/structs/etc. in
>>> shared memory, and as far as I can tell the generated LLVM/GCN code is
>>> correct.
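For what it's worth, the arithmetic behind those numbers, written out as a
standalone C sketch. This is illustrative only, not driver code; it just
assumes a vec4 slot is 16 bytes and a std430 uint has a 4-byte stride:

#include <stdio.h>

int main(void)
{
        const unsigned elems = 1024;     /* shared uint array[1024];                   */
        const unsigned vec4_slot = 16;   /* old layout: every element padded to a vec4 */
        const unsigned std430_uint = 4;  /* new layout: std430 stride of a scalar uint */

        printf("vec4-padded:   %u bytes\n", elems * vec4_slot);    /* 16384 (16 KiB) */
        printf("std430-packed: %u bytes\n", elems * std430_uint);  /*  4096 ( 4 KiB) */
        return 0;
}

The same 4x factor is what turns the ~24KB of shared memory the real shader
needs into 96KB once every scalar is padded out to a vec4.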
>>> ---
>>>  src/amd/common/ac_nir_to_llvm.c | 55 ++++++++++++++++++++++-------------------
>>>  1 file changed, 29 insertions(+), 26 deletions(-)
>>>
>>> diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
>>> index 468ce4d..a48bb3b 100644
>>> --- a/src/amd/common/ac_nir_to_llvm.c
>>> +++ b/src/amd/common/ac_nir_to_llvm.c
>>> @@ -397,7 +397,7 @@ static LLVMValueRef get_shared_memory_ptr(struct nir_to_llvm_context *ctx,
>>>          LLVMValueRef ptr;
>>>          int addr_space;
>>>
>>> -        offset = LLVMConstInt(ctx->i32, idx * 16, false);
>>> +        offset = LLVMConstInt(ctx->i32, idx, false);
>>>
>>>          ptr = ctx->shared_memory;
>>>          ptr = LLVMBuildGEP(ctx->builder, ptr, &offset, 1, "");
>>> @@ -2404,7 +2404,7 @@ static LLVMValueRef visit_load_ubo_buffer(struct nir_to_llvm_context *ctx,
>>>
>>>  static void
>>>  radv_get_deref_offset(struct nir_to_llvm_context *ctx, nir_deref_var *deref,
>>> -                      bool vs_in, unsigned *vertex_index_out,
>>> +                      bool vs_in, bool compute_shared, unsigned *vertex_index_out,
>>>                        LLVMValueRef *vertex_index_ref,
>>>                        unsigned *const_out, LLVMValueRef *indir_out)
>>>  {
>>> @@ -2445,7 +2445,14 @@ radv_get_deref_offset(struct nir_to_llvm_context *ctx, nir_deref_var *deref,
>>>          if (tail->deref_type == nir_deref_type_array) {
>>>                  nir_deref_array *deref_array = nir_deref_as_array(tail);
>>>                  LLVMValueRef index, stride, local_offset;
>>> -                unsigned size = glsl_count_attribute_slots(tail->type, vs_in);
>>> +
>>> +                /* For compute LDS, data is packed rather than treating
>>> +                 * each individual element as a vec4. The returned
>>> +                 * index is used to index into an i32 array so divide
>>> +                 * by 4.
>>> +                 */
>>> +                unsigned size = compute_shared ? glsl_get_std430_array_stride(tail->type, false) / 4
>>> +                                               : glsl_count_attribute_slots(tail->type, vs_in);
>>>
>>>                  const_offset += size * deref_array->base_offset;
>>>                  if (deref_array->deref_array_type == nir_deref_array_type_direct)
>>> @@ -2465,7 +2472,11 @@ radv_get_deref_offset(struct nir_to_llvm_context *ctx, nir_deref_var *deref,
>>>
>>>                  for (unsigned i = 0; i < deref_struct->index; i++) {
>>>                          const struct glsl_type *ft = glsl_get_struct_field(parent_type, i);
>>> -                        const_offset += glsl_count_attribute_slots(ft, vs_in);
>>> +
>>> +                        /* Same as above. */
>>> +                        unsigned size = compute_shared ? glsl_get_std430_array_stride(ft, false) / 4
>>> +                                                       : glsl_count_attribute_slots(tail->type, vs_in);
>>> +                        const_offset += size;
>>>                  }
>>>          } else
>>>                  unreachable("unsupported deref type");
>>> @@ -2641,7 +2652,7 @@ load_tcs_input(struct nir_to_llvm_context *ctx,
>>>          const bool is_compact = instr->variables[0]->var->data.compact;
>>>          param = shader_io_get_unique_index(instr->variables[0]->var->data.location);
>>>          radv_get_deref_offset(ctx, instr->variables[0],
>>> -                              false, NULL, per_vertex ? &vertex_index : NULL,
>>> +                              false, false, NULL, per_vertex ? &vertex_index : NULL,
>>>                                &const_index, &indir_index);
>>>
>>>          stride = unpack_param(ctx, ctx->tcs_in_layout, 13, 8);
>>> @@ -2673,7 +2684,7 @@ load_tcs_output(struct nir_to_llvm_context *ctx,
>>>          const bool is_compact = instr->variables[0]->var->data.compact;
>>>          param = shader_io_get_unique_index(instr->variables[0]->var->data.location);
>>>          radv_get_deref_offset(ctx, instr->variables[0],
>>> -                              false, NULL, per_vertex ? &vertex_index : NULL,
>>> +                              false, false, NULL, per_vertex ? &vertex_index : NULL,
>>>                                &const_index, &indir_index);
>>>
>>>          if (!instr->variables[0]->var->data.patch) {
>>> @@ -2712,7 +2723,7 @@ store_tcs_output(struct nir_to_llvm_context *ctx,
>>>          const bool is_compact = instr->variables[0]->var->data.compact;
>>>
>>>          radv_get_deref_offset(ctx, instr->variables[0],
>>> -                              false, NULL, per_vertex ? &vertex_index : NULL,
>>> +                              false, false, NULL, per_vertex ? &vertex_index : NULL,
>>>                                &const_index, &indir_index);
>>>
>>>          param = shader_io_get_unique_index(instr->variables[0]->var->data.location);
>>> @@ -2779,7 +2790,7 @@ load_tes_input(struct nir_to_llvm_context *ctx,
>>>          const bool is_compact = instr->variables[0]->var->data.compact;
>>>
>>>          radv_get_deref_offset(ctx, instr->variables[0],
>>> -                              false, NULL, per_vertex ? &vertex_index : NULL,
>>> +                              false, false, NULL, per_vertex ? &vertex_index : NULL,
>>>                                &const_index, &indir_index);
>>>          param = shader_io_get_unique_index(instr->variables[0]->var->data.location);
>>>          if (instr->variables[0]->var->data.location == VARYING_SLOT_CLIP_DIST0 &&
>>> @@ -2808,7 +2819,7 @@ load_gs_input(struct nir_to_llvm_context *ctx,
>>>          LLVMValueRef value[4], result;
>>>          unsigned vertex_index;
>>>          radv_get_deref_offset(ctx, instr->variables[0],
>>> -                              false, &vertex_index, NULL,
>>> +                              false, false, &vertex_index, NULL,
>>>                                &const_index, &indir_index);
>>>          vtx_offset_param = vertex_index;
>>>          assert(vtx_offset_param < 6);
>>> @@ -2849,8 +2860,9 @@ static LLVMValueRef visit_load_var(struct nir_to_llvm_context *ctx,
>>>          unsigned const_index;
>>>          bool vs_in = ctx->stage == MESA_SHADER_VERTEX &&
>>>                       instr->variables[0]->var->data.mode == nir_var_shader_in;
>>> -        radv_get_deref_offset(ctx, instr->variables[0], vs_in, NULL, NULL,
>>> -                              &const_index, &indir_index);
>>> +        bool compute_shared = instr->variables[0]->var->data.mode == nir_var_shared;
>>> +        radv_get_deref_offset(ctx, instr->variables[0], vs_in, compute_shared,
>>> +                              NULL, NULL, &const_index, &indir_index);
>>>
>>>          if (instr->dest.ssa.bit_size == 64)
>>>                  ve *= 2;
>>> @@ -2925,9 +2937,6 @@ static LLVMValueRef visit_load_var(struct nir_to_llvm_context *ctx,
>>>                  LLVMValueRef ptr = get_shared_memory_ptr(ctx, idx, ctx->i32);
>>>                  LLVMValueRef derived_ptr;
>>>
>>> -                if (indir_index)
>>> -                        indir_index = LLVMBuildMul(ctx->builder, indir_index, LLVMConstInt(ctx->i32, 4, false), "");
>>> -
>>>                  for (unsigned chan = 0; chan < ve; chan++) {
>>>                          LLVMValueRef index = LLVMConstInt(ctx->i32, chan, false);
>>>                          if (indir_index)
>>> @@ -2955,7 +2964,8 @@ visit_store_var(struct nir_to_llvm_context *ctx,
>>>          int writemask = instr->const_index[0];
>>>          LLVMValueRef indir_index;
>>>          unsigned const_index;
>>> -        radv_get_deref_offset(ctx, instr->variables[0], false,
>>> +        bool compute_shared = instr->variables[0]->var->data.mode == nir_var_shared;
>>> +        radv_get_deref_offset(ctx, instr->variables[0], false, compute_shared,
>>>                                NULL, NULL, &const_index, &indir_index);
>>>
>>>          if (get_elem_bits(ctx, LLVMTypeOf(src)) == 64) {
>>> @@ -3040,9 +3050,6 @@ visit_store_var(struct nir_to_llvm_context *ctx,
>>>          case nir_var_shared: {
>>>                  LLVMValueRef ptr = get_shared_memory_ptr(ctx, idx, ctx->i32);
>>>
>>> -                if (indir_index)
>>> -                        indir_index = LLVMBuildMul(ctx->builder, indir_index, LLVMConstInt(ctx->i32, 4, false), "");
>>> -
>>>                  for (unsigned chan = 0; chan < 8; chan++) {
>>>                          if (!(writemask & (1 << chan)))
>>>                                  continue;
>>> @@ -5756,9 +5763,9 @@ handle_shader_outputs_post(struct nir_to_llvm_context *ctx)
>>>
>>>  static void
>>>  handle_shared_compute_var(struct nir_to_llvm_context *ctx,
>>> -                          struct nir_variable *variable, uint32_t *offset, int idx)
>>> +                          struct nir_variable *variable, uint32_t *offset)
>>>  {
>>> -        unsigned size = glsl_count_attribute_slots(variable->type, false);
>>> +        unsigned size = glsl_get_std430_array_stride(variable->type, false);
>>>          variable->data.driver_location = *offset;
>>>          *offset += size;
>>>  }
>>> @@ -5926,16 +5933,12 @@ LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
>>>          nir_foreach_variable(variable, &nir->shared)
>>>                  num_shared++;
>>>          if (num_shared) {
>>> -                int idx = 0;
>>>                  uint32_t shared_size = 0;
>>>                  LLVMValueRef var;
>>>                  LLVMTypeRef i8p = LLVMPointerType(ctx.i8, LOCAL_ADDR_SPACE);
>>> -                nir_foreach_variable(variable, &nir->shared) {
>>> -                        handle_shared_compute_var(&ctx, variable, &shared_size, idx);
>>> -                        idx++;
>>> -                }
>>> +                nir_foreach_variable(variable, &nir->shared)
>>> +                        handle_shared_compute_var(&ctx, variable, &shared_size);
>>>
>>> -                shared_size *= 16;
>>>                  var = LLVMAddGlobalInAddressSpace(ctx.module,
>>>                                                    LLVMArrayType(ctx.i8, shared_size),
>>>                                                    "compute_lds",
>>> --
>>> 2.9.4
>>>
>>> _______________________________________________
>>> mesa-dev mailing list
>>> [email protected]
>>> https://lists.freedesktop.org/mailman/listinfo/mesa-dev

_______________________________________________
mesa-dev mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/mesa-dev
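To make the indexing change in the patch above concrete, here is a minimal
standalone sketch (all names below are illustrative, not taken from Mesa) of
the word index into the i32 view of shared memory for "shared uint array[N]":
under the old scheme every element occupied a whole vec4 slot, i.e. 4 words,
while packed per std430 a scalar uint occupies a single word.

#include <assert.h>

#define WORDS_PER_VEC4_SLOT 4u   /* old layout: each array element padded to a vec4      */
#define STD430_UINT_WORDS   1u   /* new layout: std430 stride of a uint (4 bytes) over 4 */

/* Word index of array[i] in the LDS i32 view under each scheme. */
static unsigned lds_word_index_vec4(unsigned i)   { return i * WORDS_PER_VEC4_SLOT; }
static unsigned lds_word_index_std430(unsigned i) { return i * STD430_UINT_WORDS; }

int main(void)
{
        /* Last element of "shared uint array[1024]" under each layout. */
        assert(lds_word_index_vec4(1023)   == 4092);  /* byte offset 16368 */
        assert(lds_word_index_std430(1023) == 1023);  /* byte offset 4092  */
        return 0;
}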
