Mesa (master): aco: always use p_parallelcopy for pre-RA copies

GitLab Mirror Tue, 27 Oct 2020 08:36:05 -0700

Module: Mesa
Branch: master
Commit: e54c111c4596274e21b9368e6fea503107e9e441
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=e54c111c4596274e21b9368e6fea503107e9e441


Author: Rhys Perry <[email protected]>
Date:   Wed Oct 14 13:50:24 2020 +0100

aco: always use p_parallelcopy for pre-RA copies

Most fossil-db changes are because literals are applied earlier
(in label_instruction), so use counts are more accurate and more literals
are applied.

fossil-db (Navi):
Totals from 79551 (57.89% of 137413) affected shaders:
SGPRs: 4549610 -> 4542802 (-0.15%); split: -0.19%, +0.04%
VGPRs: 3326764 -> 3324172 (-0.08%); split: -0.10%, +0.03%
SpillSGPRs: 38886 -> 34562 (-11.12%); split: -11.14%, +0.02%
CodeSize: 240143456 -> 240001008 (-0.06%); split: -0.11%, +0.05%
MaxWaves: 1078919 -> 1079281 (+0.03%); split: +0.04%, -0.01%
Instrs: 46627073 -> 46528490 (-0.21%); split: -0.22%, +0.01%

fossil-db (Polaris):
Totals from 98463 (70.90% of 138881) affected shaders:
SGPRs: 5164689 -> 5164353 (-0.01%); split: -0.02%, +0.01%
VGPRs: 3920936 -> 3921856 (+0.02%); split: -0.00%, +0.03%
SpillSGPRs: 56298 -> 52259 (-7.17%); split: -7.22%, +0.04%
CodeSize: 258680092 -> 258692712 (+0.00%); split: -0.02%, +0.03%
MaxWaves: 620863 -> 620823 (-0.01%); split: +0.00%, -0.01%
Instrs: 50776289 -> 50757577 (-0.04%); split: -0.04%, +0.00%

Signed-off-by: Rhys Perry <[email protected]>
Reviewed-by: Timur Kristóf <[email protected]>
Reviewed-by: Daniel Schürmann <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7216>

---

 src/amd/compiler/aco_builder_h.py            | 80 +---------------------------
 src/amd/compiler/aco_insert_exec_mask.cpp    |  1 +
 src/amd/compiler/aco_opt_value_numbering.cpp |  1 +
 src/amd/compiler/aco_optimizer.cpp           |  4 +-
 src/amd/compiler/aco_spill.cpp               |  7 +--
 src/amd/compiler/tests/test_isel.cpp         |  4 +-
 6 files changed, 13 insertions(+), 84 deletions(-)

diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py
index bcfdb91aecf..8d687fdb96a 100644
--- a/src/amd/compiler/aco_builder_h.py
+++ b/src/amd/compiler/aco_builder_h.py
@@ -85,8 +85,6 @@ ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask)
 
 aco_ptr<Instruction> create_s_mov(Definition dst, Operand src);
 
-extern uint8_t int8_mul_table[512];
-
 enum sendmsg {
    sendmsg_none = 0,
    _sendmsg_gs = 2,
@@ -386,82 +384,8 @@ public:
       return v_mul_imm(dst, tmp, imm, true);
    }
 
-   Result copy(Definition dst, Op op_) {
-      Operand op = op_.op;
-      assert(op.bytes() == dst.bytes());
-      if (dst.regClass() == s1 && op.size() == 1 && op.isLiteral()) {
-         uint32_t imm = op.constantValue();
-         if (imm == 0x3e22f983) {
-            if (program->chip_class >= GFX8)
-               op.setFixed(PhysReg{248}); /* it can be an inline constant on 
GFX8+ */
-         } else if (imm >= 0xffff8000 || imm <= 0x7fff) {
-            return sopk(aco_opcode::s_movk_i32, dst, imm & 0xFFFFu);
-         } else if (util_bitreverse(imm) <= 64 || util_bitreverse(imm) >= 
0xFFFFFFF0) {
-            uint32_t rev = util_bitreverse(imm);
-            return dst.regClass() == v1 ?
-                   vop1(aco_opcode::v_bfrev_b32, dst, Operand(rev)) :
-                   sop1(aco_opcode::s_brev_b32, dst, Operand(rev));
-         } else if (imm != 0) {
-            unsigned start = (ffs(imm) - 1) & 0x1f;
-            unsigned size = util_bitcount(imm) & 0x1f;
-            if ((((1u << size) - 1u) << start) == imm)
-                return sop2(aco_opcode::s_bfm_b32, dst, Operand(size), 
Operand(start));
-         }
-      }
-
-      if (dst.regClass() == s1) {
-        return sop1(aco_opcode::s_mov_b32, dst, op);
-      } else if (dst.regClass() == s2) {
-        return sop1(aco_opcode::s_mov_b64, dst, op);
-      } else if (dst.regClass() == v1 || dst.regClass() == v1.as_linear()) {
-        return vop1(aco_opcode::v_mov_b32, dst, op);
-      } else if (op.bytes() > 2 || (op.isLiteral() && 
dst.regClass().is_subdword())) {
-         return pseudo(aco_opcode::p_create_vector, dst, op);
-      } else if (op.bytes() == 1 && op.isConstant()) {
-        uint8_t val = op.constantValue();
-        Operand op32((uint32_t)val | (val & 0x80u ? 0xffffff00u : 0u));
-        aco_ptr<SDWA_instruction> sdwa;
-        if (op32.isLiteral()) {
-            
sdwa.reset(create_instruction<SDWA_instruction>(aco_opcode::v_mul_u32_u24, 
asSDWA(Format::VOP2), 2, 1));
-            uint32_t a = (uint32_t)int8_mul_table[val * 2];
-            uint32_t b = (uint32_t)int8_mul_table[val * 2 + 1];
-            sdwa->operands[0] = Operand(a | (a & 0x80u ? 0xffffff00u : 0x0u));
-            sdwa->operands[1] = Operand(b | (b & 0x80u ? 0xffffff00u : 0x0u));
-        } else {
-            
sdwa.reset(create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, 
asSDWA(Format::VOP1), 1, 1));
-            sdwa->operands[0] = op32;
-        }
-        sdwa->definitions[0] = dst;
-        sdwa->sel[0] = sdwa_udword;
-        sdwa->sel[1] = sdwa_udword;
-        sdwa->dst_sel = sdwa_ubyte;
-        sdwa->dst_preserve = true;
-        return insert(std::move(sdwa));
-      } else if (op.bytes() == 2 && op.isConstant() && !op.isLiteral()) {
-        aco_ptr<SDWA_instruction> 
sdwa{create_instruction<SDWA_instruction>(aco_opcode::v_add_f16, 
asSDWA(Format::VOP2), 2, 1)};
-        sdwa->operands[0] = op;
-        sdwa->operands[1] = Operand(0u);
-        sdwa->definitions[0] = dst;
-        sdwa->sel[0] = sdwa_uword;
-        sdwa->sel[1] = sdwa_udword;
-        sdwa->dst_sel = dst.bytes() == 1 ? sdwa_ubyte : sdwa_uword;
-        sdwa->dst_preserve = true;
-        return insert(std::move(sdwa));
-      } else if (dst.regClass().is_subdword()) {
-        if (program->chip_class >= GFX8) {
-            aco_ptr<SDWA_instruction> 
sdwa{create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, 
asSDWA(Format::VOP1), 1, 1)};
-            sdwa->operands[0] = op;
-            sdwa->definitions[0] = dst;
-            sdwa->sel[0] = op.bytes() == 1 ? sdwa_ubyte : sdwa_uword;
-            sdwa->dst_sel = dst.bytes() == 1 ? sdwa_ubyte : sdwa_uword;
-            sdwa->dst_preserve = true;
-            return insert(std::move(sdwa));
-        } else {
-            return vop1(aco_opcode::v_mov_b32, dst, op);
-        }
-      } else {
-        unreachable("Unhandled case in bld.copy()");
-      }
+   Result copy(Definition dst, Op op) {
+      return pseudo(aco_opcode::p_parallelcopy, dst, op);
    }
 
    Result vadd32(Definition dst, Op a, Op b, bool carry_out=false, Op 
carry_in=Op(Operand(s2)), bool post_ra=false) {
diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp
index c5965c631a1..30c408a354e 100644
--- a/src/amd/compiler/aco_insert_exec_mask.cpp
+++ b/src/amd/compiler/aco_insert_exec_mask.cpp
@@ -109,6 +109,7 @@ bool pred_by_exec_mask(aco_ptr<Instruction>& instr) {
       case aco_opcode::p_create_vector:
       case aco_opcode::p_extract_vector:
       case aco_opcode::p_split_vector:
+      case aco_opcode::p_parallelcopy:
          for (Definition def : instr->definitions) {
             if (def.getTemp().type() == RegType::vgpr)
                return true;
diff --git a/src/amd/compiler/aco_opt_value_numbering.cpp b/src/amd/compiler/aco_opt_value_numbering.cpp
index 4c35294e183..12b376fb91c 100644
--- a/src/amd/compiler/aco_opt_value_numbering.cpp
+++ b/src/amd/compiler/aco_opt_value_numbering.cpp
@@ -393,6 +393,7 @@ void process_block(vn_ctx& ctx, Block& block)
          instr->opcode == aco_opcode::s_mov_b32 ||
          instr->opcode == aco_opcode::s_mov_b64 ||
          instr->opcode == aco_opcode::v_mov_b32 ||
+         instr->opcode == aco_opcode::p_parallelcopy ||
          (instr->opcode == aco_opcode::p_create_vector && 
instr->operands.size() == 1);
       if (copy_instr && !instr->definitions[0].isFixed() && 
instr->operands[0].isTemp() &&
           instr->operands[0].regClass() == instr->definitions[0].regClass() &&
diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp
index c839e0696a3..2d7f848ed94 100644
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -853,7 +853,8 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
             case aco_opcode::p_create_vector:
             case aco_opcode::p_split_vector:
             case aco_opcode::p_extract_vector:
-            case aco_opcode::p_phi: {
+            case aco_opcode::p_phi:
+            case aco_opcode::p_parallelcopy: {
                const bool all_vgpr = std::none_of(instr->definitions.begin(), 
instr->definitions.end(),
                                                   [] (const Definition& def) { 
return def.getTemp().type() != RegType::vgpr;});
                if (all_vgpr) {
@@ -1212,6 +1213,7 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr)
    case aco_opcode::s_mov_b32: /* propagate */
    case aco_opcode::s_mov_b64:
    case aco_opcode::v_mov_b32:
+   case aco_opcode::p_parallelcopy:
       if (instr->operands[0].isTemp() && 
ctx.info[instr->operands[0].tempId()].is_vec() &&
           instr->operands[0].regClass() != instr->definitions[0].regClass()) {
          /* We might not be able to copy-propagate if it's a SGPR->VGPR copy, 
so
diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp
index 4196b652405..67c6712e0e4 100644
--- a/src/amd/compiler/aco_spill.cpp
+++ b/src/amd/compiler/aco_spill.cpp
@@ -245,8 +245,9 @@ bool should_rematerialize(aco_ptr<Instruction>& instr)
    /* TODO: rematerialization is only supported for VOP1, SOP1 and PSEUDO */
    if (instr->format != Format::VOP1 && instr->format != Format::SOP1 && 
instr->format != Format::PSEUDO && instr->format != Format::SOPK)
       return false;
-   /* TODO: pseudo-instruction rematerialization is only supported for 
p_create_vector */
-   if (instr->format == Format::PSEUDO && instr->opcode != 
aco_opcode::p_create_vector)
+   /* TODO: pseudo-instruction rematerialization is only supported for 
p_create_vector/p_parallelcopy */
+   if (instr->format == Format::PSEUDO && instr->opcode != 
aco_opcode::p_create_vector &&
+       instr->opcode != aco_opcode::p_parallelcopy)
       return false;
    if (instr->format == Format::SOPK && instr->opcode != 
aco_opcode::s_movk_i32)
       return false;
@@ -270,7 +271,7 @@ aco_ptr<Instruction> do_reload(spill_ctx& ctx, Temp tmp, Temp new_name, uint32_t
    if (remat != ctx.remat.end()) {
       Instruction *instr = remat->second.instr;
       assert((instr->format == Format::VOP1 || instr->format == Format::SOP1 
|| instr->format == Format::PSEUDO || instr->format == Format::SOPK) && 
"unsupported");
-      assert((instr->format != Format::PSEUDO || instr->opcode == 
aco_opcode::p_create_vector) && "unsupported");
+      assert((instr->format != Format::PSEUDO || instr->opcode == 
aco_opcode::p_create_vector || instr->opcode == aco_opcode::p_parallelcopy) && 
"unsupported");
       assert(instr->definitions.size() == 1 && "unsupported");
 
       aco_ptr<Instruction> res;
diff --git a/src/amd/compiler/tests/test_isel.cpp b/src/amd/compiler/tests/test_isel.cpp
index f45fe4311c0..48425b71a03 100644
--- a/src/amd/compiler/tests/test_isel.cpp
+++ b/src/amd/compiler/tests/test_isel.cpp
@@ -67,9 +67,9 @@ BEGIN_TEST(isel.compute.simple)
             uint res;
          };
          void main() {
-            //~gfx7>> v1: %data = v_mov_b32 42
+            //~gfx7>> v1: %data = p_parallelcopy 42
             //~gfx7>> buffer_store_dword %_, v1: undef, 0, %data disable_wqm 
storage:buffer semantics: scope:invocation
-            //~gfx8>> s1: %data = s_mov_b32 42
+            //~gfx8>> s1: %data = p_parallelcopy 42
             //~gfx8>> s_buffer_store_dword %_, 0, %data storage:buffer 
semantics: scope:invocation
             res = 42;
          }

_______________________________________________
mesa-commit mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/mesa-commit

Mesa (master): aco: always use p_parallelcopy for pre-RA copies

Reply via email to