Module: Mesa Branch: main Commit: 04956d54ce5b623e7e9276a0eead49c5db15965a URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=04956d54ce5b623e7e9276a0eead49c5db15965a
Author: Georg Lehmann <dadschoo...@gmail.com> Date: Tue Oct 31 15:42:31 2023 +0100 aco: force uniform result for LDS load with uniform address if it can be non uniform Because a LDS load is 2 separate loads on gfx10+ with wave64, a different wave can write LDS in between and cause a non uniform result. Use v_readfirst_lane instead of p_as_uniform because it cannot be copy propagated. Fixes a OpenCL CTS test with zink+rusticl. Totals from 136 (0.17% of 78196) affected shaders: MaxWaves: 3236 -> 3244 (+0.25%) Instrs: 130069 -> 131221 (+0.89%) CodeSize: 698048 -> 703436 (+0.77%) VGPRs: 5464 -> 5440 (-0.44%) SpillSGPRs: 94 -> 96 (+2.13%) Latency: 5361017 -> 5363781 (+0.05%); split: -0.00%, +0.05% InvThroughput: 883010 -> 884100 (+0.12%) SClause: 3822 -> 3821 (-0.03%); split: -0.05%, +0.03% Copies: 14220 -> 14314 (+0.66%); split: -0.01%, +0.68% Branches: 4549 -> 4551 (+0.04%) PreSGPRs: 4934 -> 4940 (+0.12%) PreVGPRs: 4666 -> 4655 (-0.24%) Reviewed-by: Daniel Schürmann <dan...@schuermann.dev> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25973> --- src/amd/compiler/aco_instruction_selection.cpp | 74 +++++++++++++++++++++----- 1 file changed, 60 insertions(+), 14 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index cb39a68dcd9..b55d60afab9 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -3941,6 +3941,44 @@ visit_load_const(isel_context* ctx, nir_load_const_instr* instr) } } +Temp +emit_readfirstlane(isel_context* ctx, Temp src, Temp dst) +{ + Builder bld(ctx->program, ctx->block); + + if (src.regClass().type() == RegType::sgpr) { + bld.copy(Definition(dst), src); + } else if (src.size() == 1) { + bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(dst), src); + } else { + aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>( + aco_opcode::p_split_vector, Format::PSEUDO, 1, src.size())}; + split->operands[0] = Operand(src); + + for (unsigned i = 0; i < src.size(); i++) { + split->definitions[i] = + bld.def(RegClass::get(RegType::vgpr, MIN2(src.bytes() - i * 4, 4))); + } + + Instruction* split_raw = split.get(); + ctx->block->instructions.emplace_back(std::move(split)); + + aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>( + aco_opcode::p_create_vector, Format::PSEUDO, src.size(), 1)}; + vec->definitions[0] = Definition(dst); + for (unsigned i = 0; i < src.size(); i++) { + vec->operands[i] = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), + split_raw->definitions[i].getTemp()); + } + + ctx->block->instructions.emplace_back(std::move(vec)); + if (src.bytes() % 4 == 0) + emit_split_vector(ctx, dst, src.size()); + } + + return dst; +} + bool can_use_byte_align_for_global_load(unsigned num_components, unsigned component_size, unsigned align_, bool support_12_byte) @@ -3974,6 +4012,7 @@ struct LoadEmitInfo { bool glc = false; bool slc = false; bool split_by_component_stride = true; + bool readfirstlane_for_uniform = false; unsigned swizzle_component_size = 0; memory_sync_info sync; Temp soffset = Temp(0, s1); @@ -4220,8 +4259,14 @@ emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info, /* try to p_as_uniform early so we can create more optimizable code and * also update allocated_vec */ for (unsigned j = start; j < components_split; j++) { - if (allocated_vec[j].bytes() % 4 == 0 && info.dst.type() == RegType::sgpr) - allocated_vec[j] = bld.as_uniform(allocated_vec[j]); + if (allocated_vec[j].bytes() % 4 == 0 && info.dst.type() == RegType::sgpr) { + if (info.readfirstlane_for_uniform) { + allocated_vec[j] = emit_readfirstlane( + ctx, allocated_vec[j], bld.tmp(RegClass(RegType::sgpr, allocated_vec[j].size()))); + } else { + allocated_vec[j] = bld.as_uniform(allocated_vec[j]); + } + } has_vgprs |= allocated_vec[j].type() == RegType::vgpr; } } @@ -4243,7 +4288,10 @@ emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info, Temp tmp = bld.tmp(RegType::vgpr, info.dst.size()); vec->definitions[0] = Definition(tmp); bld.insert(std::move(vec)); - bld.pseudo(aco_opcode::p_as_uniform, Definition(info.dst), tmp); + if (info.readfirstlane_for_uniform) + emit_readfirstlane(ctx, tmp, info.dst); + else + bld.pseudo(aco_opcode::p_as_uniform, Definition(info.dst), tmp); } else { vec->definitions[0] = Definition(info.dst); bld.insert(std::move(vec)); @@ -4771,6 +4819,13 @@ load_lds(isel_context* ctx, unsigned elem_size_bytes, unsigned num_components, T info.align_offset = 0; info.sync = memory_sync_info(storage_shared); info.const_offset = base_offset; + /* The 2 separate loads for gfx10+ wave64 can see different values, even for uniform addresses, + * if another wave writes LDS in between. Use v_readfirstlane instead of p_as_uniform in order + * to avoid copy-propagation. + */ + info.readfirstlane_for_uniform = ctx->options->gfx_level >= GFX10 && + ctx->program->wave_size == 64 && + ctx->program->workgroup_size > 64; emit_load(ctx, bld, info, lds_load_params); return dst; @@ -8496,22 +8551,13 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) case nir_intrinsic_read_first_invocation: { Temp src = get_ssa_temp(ctx, instr->src[0].ssa); Temp dst = get_ssa_temp(ctx, &instr->def); - if (src.regClass() == v1b || src.regClass() == v2b || src.regClass() == v1) { - bld.vop1(aco_opcode::v_readfirstlane_b32, Definition(dst), src); - } else if (src.regClass() == v2) { - Temp lo = bld.tmp(v1), hi = bld.tmp(v1); - bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); - lo = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), lo); - hi = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), hi); - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); - emit_split_vector(ctx, dst, 2); - } else if (instr->def.bit_size == 1) { + if (instr->def.bit_size == 1) { assert(src.regClass() == bld.lm); Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src, bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm))); bool_to_vector_condition(ctx, tmp, dst); } else { - bld.copy(Definition(dst), src); + emit_readfirstlane(ctx, src, dst); } set_wqm(ctx); break;