Mesa (master): aco: use v_lshrrev_b64 for 64-bit VGPR copies on GFX10+

GitLab Mirror Fri, 04 Dec 2020 06:54:14 -0800

Module: Mesa
Branch: master
Commit: f53d4e5f6087b5a2d09d4513332919592e1c8242
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=f53d4e5f6087b5a2d09d4513332919592e1c8242


Author: Rhys Perry <[email protected]>
Date:   Thu Nov 26 14:18:18 2020 +0000

aco: use v_lshrrev_b64 for 64-bit VGPR copies on GFX10+

This isn't worth it on GFX9 and older, but the proprietary compiler uses it
on GFX10.

fossil-db (Navi):
Totals from 23825 (17.17% of 138791) affected shaders:
CodeSize: 130623632 -> 130623800 (+0.00%); split: -0.00%, +0.00%
Instrs: 25185559 -> 25108597 (-0.31%)
Cycles: 709864740 -> 708910860 (-0.13%)
VMEM: 7205343 -> 7168839 (-0.51%); split: +0.00%, -0.51%
SMEM: 1584946 -> 1575183 (-0.62%)
Copies: 2043134 -> 1966230 (-3.76%)

Signed-off-by: Rhys Perry <[email protected]>
Reviewed-by: Daniel Schürmann <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7798>

---

 src/amd/compiler/aco_lower_to_hw_instr.cpp | 35 +++++++++++++++++++++++++-----
 1 file changed, 29 insertions(+), 6 deletions(-)

diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index 1d98b931763..c42e2bb671e 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -932,16 +932,20 @@ void split_copy(lower_context *ctx, unsigned offset, Definition *def, Operand *o
    def_reg.reg_b += offset;
    op_reg.reg_b += offset;
 
-   max_size = MIN2(max_size, src.def.regClass().type() == RegType::vgpr ? 4 : 8);
+   /* 64-bit VGPR copies (implemented with v_lshrrev_b64) are slow before GFX10 */
+   if (ctx->program->chip_class < GFX10 &&
+       src.def.regClass().type() == RegType::vgpr)
+      max_size = MIN2(max_size, 4);
+   unsigned max_align = src.def.regClass().type() == RegType::vgpr ? 4 : 16;
 
    /* make sure the size is a power of two and reg % bytes == 0 */
    unsigned bytes = 1;
    for (; bytes <= max_size; bytes *= 2) {
       unsigned next = bytes * 2u;
-      bool can_increase = def_reg.reg_b % next == 0 &&
+      bool can_increase = def_reg.reg_b % MIN2(next, max_align) == 0 &&
                           offset + next <= src.bytes && next <= max_size;
       if (!src.op.isConstant() && can_increase)
-         can_increase = op_reg.reg_b % next == 0;
+         can_increase = op_reg.reg_b % MIN2(next, max_align) == 0;
       for (unsigned i = 0; !ignore_uses && can_increase && (i < bytes); i++)
          can_increase = (src.uses[offset + bytes + i] == 0) == (src.uses[offset] == 0);
       if (!can_increase)
@@ -1007,7 +1011,16 @@ void copy_constant(lower_context *ctx, Builder& bld, Definition dst, Operand op)
    if (dst.regClass() == s1) {
       bld.sop1(aco_opcode::s_mov_b32, dst, op);
    } else if (dst.regClass() == s2) {
+      /* s_ashr_i64 writes SCC, so we can't use it */
+      assert(Operand::is_constant_representable(op.constantValue64(), 8, true, false));
       bld.sop1(aco_opcode::s_mov_b64, dst, op);
+   } else if (dst.regClass() == v2) {
+      if (Operand::is_constant_representable(op.constantValue64(), 8, true, false)) {
+         bld.vop3(aco_opcode::v_lshrrev_b64, dst, Operand(0u), op);
+      } else {
+         assert(Operand::is_constant_representable(op.constantValue64(), 8, false, true));
+         bld.vop3(aco_opcode::v_ashrrev_i64, dst, Operand(0u), op);
+      }
    } else if (dst.regClass() == v1) {
       bld.vop1(aco_opcode::v_mov_b32, dst, op);
    } else if (dst.regClass() == v1b) {
@@ -1076,6 +1089,8 @@ bool do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool
          copy_constant(ctx, bld, def, op);
       } else if (def.regClass() == v1) {
          bld.vop1(aco_opcode::v_mov_b32, def, op);
+      } else if (def.regClass() == v2) {
+         bld.vop3(aco_opcode::v_lshrrev_b64, def, Operand(0u), op);
       } else if (def.regClass() == s1) {
          bld.sop1(aco_opcode::s_mov_b32, def, op);
       } else if (def.regClass() == s2) {
@@ -1155,7 +1170,8 @@ void do_swap(lower_context *ctx, Builder& bld, const copy_operation& copy, bool
    for (; offset < copy.bytes;) {
       Definition def;
       Operand op;
-      split_copy(ctx, offset, &def, &op, copy, true, 8);
+      unsigned max_size = copy.def.regClass().type() == RegType::vgpr ? 4 : 8;
+      split_copy(ctx, offset, &def, &op, copy, true, max_size);
 
       assert(op.regClass() == def.regClass());
       Operand def_as_op = Operand(def.physReg(), def.regClass());
@@ -1353,9 +1369,16 @@ void handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context*
       }
 
       /* try to coalesce copies */
+      unsigned next_def_align = util_next_power_of_two(it->second.bytes + 1);
+      unsigned next_op_align = next_def_align;
+      if (it->second.def.regClass().type() == RegType::vgpr)
+         next_def_align = MIN2(next_def_align, 4);
+      if (it->second.op.regClass().type() == RegType::vgpr)
+         next_op_align = MIN2(next_op_align, 4);
+
       if (it->second.bytes < 8 && !it->second.op.isConstant() &&
-          it->first.reg_b % util_next_power_of_two(it->second.bytes + 1) == 0 &&
-          it->second.op.physReg().reg_b % util_next_power_of_two(it->second.bytes + 1) == 0) {
+          it->first.reg_b % next_def_align == 0 &&
+          it->second.op.physReg().reg_b % next_op_align == 0) {
          // TODO try more relaxed alignment for subdword copies
          PhysReg other_def_reg = it->first;
          other_def_reg.reg_b += it->second.bytes;

_______________________________________________
mesa-commit mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/mesa-commit

Mesa (master): aco: use v_lshrrev_b64 for 64-bit VGPR copies on GFX10+

Reply via email to