Module: Mesa Branch: master Commit: f53d4e5f6087b5a2d09d4513332919592e1c8242 URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=f53d4e5f6087b5a2d09d4513332919592e1c8242
Author: Rhys Perry <[email protected]> Date: Thu Nov 26 14:18:18 2020 +0000 aco: use v_lshrrev_b64 for 64-bit VGPR copies on GFX10+ This isn't worth it on GFX9-, but the proprietary compiler uses it on GFX10. fossil-db (Navi): Totals from 23825 (17.17% of 138791) affected shaders: CodeSize: 130623632 -> 130623800 (+0.00%); split: -0.00%, +0.00% Instrs: 25185559 -> 25108597 (-0.31%) Cycles: 709864740 -> 708910860 (-0.13%) VMEM: 7205343 -> 7168839 (-0.51%); split: +0.00%, -0.51% SMEM: 1584946 -> 1575183 (-0.62%) Copies: 2043134 -> 1966230 (-3.76%) Signed-off-by: Rhys Perry <[email protected]> Reviewed-by: Daniel Schürmann <[email protected]> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7798> --- src/amd/compiler/aco_lower_to_hw_instr.cpp | 35 +++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index 1d98b931763..c42e2bb671e 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -932,16 +932,20 @@ void split_copy(lower_context *ctx, unsigned offset, Definition *def, Operand *o def_reg.reg_b += offset; op_reg.reg_b += offset; - max_size = MIN2(max_size, src.def.regClass().type() == RegType::vgpr ? 4 : 8); + /* 64-bit VGPR copies (implemented with v_lshrrev_b64) are slow before GFX10 */ + if (ctx->program->chip_class < GFX10 && + src.def.regClass().type() == RegType::vgpr) + max_size = MIN2(max_size, 4); + unsigned max_align = src.def.regClass().type() == RegType::vgpr ? 4 : 16; /* make sure the size is a power of two and reg % bytes == 0 */ unsigned bytes = 1; for (; bytes <= max_size; bytes *= 2) { unsigned next = bytes * 2u; - bool can_increase = def_reg.reg_b % next == 0 && + bool can_increase = def_reg.reg_b % MIN2(next, max_align) == 0 && offset + next <= src.bytes && next <= max_size; if (!src.op.isConstant() && can_increase) - can_increase = op_reg.reg_b % next == 0; + can_increase = op_reg.reg_b % MIN2(next, max_align) == 0; for (unsigned i = 0; !ignore_uses && can_increase && (i < bytes); i++) can_increase = (src.uses[offset + bytes + i] == 0) == (src.uses[offset] == 0); if (!can_increase) @@ -1007,7 +1011,16 @@ void copy_constant(lower_context *ctx, Builder& bld, Definition dst, Operand op) if (dst.regClass() == s1) { bld.sop1(aco_opcode::s_mov_b32, dst, op); } else if (dst.regClass() == s2) { + /* s_ashr_i64 writes SCC, so we can't use it */ + assert(Operand::is_constant_representable(op.constantValue64(), 8, true, false)); bld.sop1(aco_opcode::s_mov_b64, dst, op); + } else if (dst.regClass() == v2) { + if (Operand::is_constant_representable(op.constantValue64(), 8, true, false)) { + bld.vop3(aco_opcode::v_lshrrev_b64, dst, Operand(0u), op); + } else { + assert(Operand::is_constant_representable(op.constantValue64(), 8, false, true)); + bld.vop3(aco_opcode::v_ashrrev_i64, dst, Operand(0u), op); + } } else if (dst.regClass() == v1) { bld.vop1(aco_opcode::v_mov_b32, dst, op); } else if (dst.regClass() == v1b) { @@ -1076,6 +1089,8 @@ bool do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool copy_constant(ctx, bld, def, op); } else if (def.regClass() == v1) { bld.vop1(aco_opcode::v_mov_b32, def, op); + } else if (def.regClass() == v2) { + bld.vop3(aco_opcode::v_lshrrev_b64, def, Operand(0u), op); } else if (def.regClass() == s1) { bld.sop1(aco_opcode::s_mov_b32, def, op); } else if (def.regClass() == s2) { @@ -1155,7 +1170,8 @@ void do_swap(lower_context *ctx, Builder& bld, const copy_operation& copy, bool for (; offset < copy.bytes;) { Definition def; Operand op; - split_copy(ctx, offset, &def, &op, copy, true, 8); + unsigned max_size = copy.def.regClass().type() == RegType::vgpr ? 4 : 8; + split_copy(ctx, offset, &def, &op, copy, true, max_size); assert(op.regClass() == def.regClass()); Operand def_as_op = Operand(def.physReg(), def.regClass()); @@ -1353,9 +1369,16 @@ void handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context* } /* try to coalesce copies */ + unsigned next_def_align = util_next_power_of_two(it->second.bytes + 1); + unsigned next_op_align = next_def_align; + if (it->second.def.regClass().type() == RegType::vgpr) + next_def_align = MIN2(next_def_align, 4); + if (it->second.op.regClass().type() == RegType::vgpr) + next_op_align = MIN2(next_op_align, 4); + if (it->second.bytes < 8 && !it->second.op.isConstant() && - it->first.reg_b % util_next_power_of_two(it->second.bytes + 1) == 0 && - it->second.op.physReg().reg_b % util_next_power_of_two(it->second.bytes + 1) == 0) { + it->first.reg_b % next_def_align == 0 && + it->second.op.physReg().reg_b % next_op_align == 0) { // TODO try more relaxed alignment for subdword copies PhysReg other_def_reg = it->first; other_def_reg.reg_b += it->second.bytes; _______________________________________________ mesa-commit mailing list [email protected] https://lists.freedesktop.org/mailman/listinfo/mesa-commit
