Module: Mesa Branch: master Commit: e54c111c4596274e21b9368e6fea503107e9e441 URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=e54c111c4596274e21b9368e6fea503107e9e441
Author: Rhys Perry <[email protected]> Date: Wed Oct 14 13:50:24 2020 +0100 aco: always use p_parallelcopy for pre-RA copies Most fossil-db changes are because literals are applied earlier (in label_instruction), so use counts are more accurate and more literals are applied. fossil-db (Navi): Totals from 79551 (57.89% of 137413) affected shaders: SGPRs: 4549610 -> 4542802 (-0.15%); split: -0.19%, +0.04% VGPRs: 3326764 -> 3324172 (-0.08%); split: -0.10%, +0.03% SpillSGPRs: 38886 -> 34562 (-11.12%); split: -11.14%, +0.02% CodeSize: 240143456 -> 240001008 (-0.06%); split: -0.11%, +0.05% MaxWaves: 1078919 -> 1079281 (+0.03%); split: +0.04%, -0.01% Instrs: 46627073 -> 46528490 (-0.21%); split: -0.22%, +0.01% fossil-db (Polaris): Totals from 98463 (70.90% of 138881) affected shaders: SGPRs: 5164689 -> 5164353 (-0.01%); split: -0.02%, +0.01% VGPRs: 3920936 -> 3921856 (+0.02%); split: -0.00%, +0.03% SpillSGPRs: 56298 -> 52259 (-7.17%); split: -7.22%, +0.04% CodeSize: 258680092 -> 258692712 (+0.00%); split: -0.02%, +0.03% MaxWaves: 620863 -> 620823 (-0.01%); split: +0.00%, -0.01% Instrs: 50776289 -> 50757577 (-0.04%); split: -0.04%, +0.00% Signed-off-by: Rhys Perry <[email protected]> Reviewed-by: Timur Kristóf <[email protected]> Reviewed-by: Daniel Schürmann <[email protected]> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7216> --- src/amd/compiler/aco_builder_h.py | 80 +--------------------------- src/amd/compiler/aco_insert_exec_mask.cpp | 1 + src/amd/compiler/aco_opt_value_numbering.cpp | 1 + src/amd/compiler/aco_optimizer.cpp | 4 +- src/amd/compiler/aco_spill.cpp | 7 +-- src/amd/compiler/tests/test_isel.cpp | 4 +- 6 files changed, 13 insertions(+), 84 deletions(-) diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py index bcfdb91aecf..8d687fdb96a 100644 --- a/src/amd/compiler/aco_builder_h.py +++ b/src/amd/compiler/aco_builder_h.py @@ -85,8 +85,6 @@ ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask) aco_ptr<Instruction> create_s_mov(Definition dst, Operand src); -extern uint8_t int8_mul_table[512]; - enum sendmsg { sendmsg_none = 0, _sendmsg_gs = 2, @@ -386,82 +384,8 @@ public: return v_mul_imm(dst, tmp, imm, true); } - Result copy(Definition dst, Op op_) { - Operand op = op_.op; - assert(op.bytes() == dst.bytes()); - if (dst.regClass() == s1 && op.size() == 1 && op.isLiteral()) { - uint32_t imm = op.constantValue(); - if (imm == 0x3e22f983) { - if (program->chip_class >= GFX8) - op.setFixed(PhysReg{248}); /* it can be an inline constant on GFX8+ */ - } else if (imm >= 0xffff8000 || imm <= 0x7fff) { - return sopk(aco_opcode::s_movk_i32, dst, imm & 0xFFFFu); - } else if (util_bitreverse(imm) <= 64 || util_bitreverse(imm) >= 0xFFFFFFF0) { - uint32_t rev = util_bitreverse(imm); - return dst.regClass() == v1 ? - vop1(aco_opcode::v_bfrev_b32, dst, Operand(rev)) : - sop1(aco_opcode::s_brev_b32, dst, Operand(rev)); - } else if (imm != 0) { - unsigned start = (ffs(imm) - 1) & 0x1f; - unsigned size = util_bitcount(imm) & 0x1f; - if ((((1u << size) - 1u) << start) == imm) - return sop2(aco_opcode::s_bfm_b32, dst, Operand(size), Operand(start)); - } - } - - if (dst.regClass() == s1) { - return sop1(aco_opcode::s_mov_b32, dst, op); - } else if (dst.regClass() == s2) { - return sop1(aco_opcode::s_mov_b64, dst, op); - } else if (dst.regClass() == v1 || dst.regClass() == v1.as_linear()) { - return vop1(aco_opcode::v_mov_b32, dst, op); - } else if (op.bytes() > 2 || (op.isLiteral() && dst.regClass().is_subdword())) { - return pseudo(aco_opcode::p_create_vector, dst, op); - } else if (op.bytes() == 1 && op.isConstant()) { - uint8_t val = op.constantValue(); - Operand op32((uint32_t)val | (val & 0x80u ? 0xffffff00u : 0u)); - aco_ptr<SDWA_instruction> sdwa; - if (op32.isLiteral()) { - sdwa.reset(create_instruction<SDWA_instruction>(aco_opcode::v_mul_u32_u24, asSDWA(Format::VOP2), 2, 1)); - uint32_t a = (uint32_t)int8_mul_table[val * 2]; - uint32_t b = (uint32_t)int8_mul_table[val * 2 + 1]; - sdwa->operands[0] = Operand(a | (a & 0x80u ? 0xffffff00u : 0x0u)); - sdwa->operands[1] = Operand(b | (b & 0x80u ? 0xffffff00u : 0x0u)); - } else { - sdwa.reset(create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)); - sdwa->operands[0] = op32; - } - sdwa->definitions[0] = dst; - sdwa->sel[0] = sdwa_udword; - sdwa->sel[1] = sdwa_udword; - sdwa->dst_sel = sdwa_ubyte; - sdwa->dst_preserve = true; - return insert(std::move(sdwa)); - } else if (op.bytes() == 2 && op.isConstant() && !op.isLiteral()) { - aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(aco_opcode::v_add_f16, asSDWA(Format::VOP2), 2, 1)}; - sdwa->operands[0] = op; - sdwa->operands[1] = Operand(0u); - sdwa->definitions[0] = dst; - sdwa->sel[0] = sdwa_uword; - sdwa->sel[1] = sdwa_udword; - sdwa->dst_sel = dst.bytes() == 1 ? sdwa_ubyte : sdwa_uword; - sdwa->dst_preserve = true; - return insert(std::move(sdwa)); - } else if (dst.regClass().is_subdword()) { - if (program->chip_class >= GFX8) { - aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)}; - sdwa->operands[0] = op; - sdwa->definitions[0] = dst; - sdwa->sel[0] = op.bytes() == 1 ? sdwa_ubyte : sdwa_uword; - sdwa->dst_sel = dst.bytes() == 1 ? sdwa_ubyte : sdwa_uword; - sdwa->dst_preserve = true; - return insert(std::move(sdwa)); - } else { - return vop1(aco_opcode::v_mov_b32, dst, op); - } - } else { - unreachable("Unhandled case in bld.copy()"); - } + Result copy(Definition dst, Op op) { + return pseudo(aco_opcode::p_parallelcopy, dst, op); } Result vadd32(Definition dst, Op a, Op b, bool carry_out=false, Op carry_in=Op(Operand(s2)), bool post_ra=false) { diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp index c5965c631a1..30c408a354e 100644 --- a/src/amd/compiler/aco_insert_exec_mask.cpp +++ b/src/amd/compiler/aco_insert_exec_mask.cpp @@ -109,6 +109,7 @@ bool pred_by_exec_mask(aco_ptr<Instruction>& instr) { case aco_opcode::p_create_vector: case aco_opcode::p_extract_vector: case aco_opcode::p_split_vector: + case aco_opcode::p_parallelcopy: for (Definition def : instr->definitions) { if (def.getTemp().type() == RegType::vgpr) return true; diff --git a/src/amd/compiler/aco_opt_value_numbering.cpp b/src/amd/compiler/aco_opt_value_numbering.cpp index 4c35294e183..12b376fb91c 100644 --- a/src/amd/compiler/aco_opt_value_numbering.cpp +++ b/src/amd/compiler/aco_opt_value_numbering.cpp @@ -393,6 +393,7 @@ void process_block(vn_ctx& ctx, Block& block) instr->opcode == aco_opcode::s_mov_b32 || instr->opcode == aco_opcode::s_mov_b64 || instr->opcode == aco_opcode::v_mov_b32 || + instr->opcode == aco_opcode::p_parallelcopy || (instr->opcode == aco_opcode::p_create_vector && instr->operands.size() == 1); if (copy_instr && !instr->definitions[0].isFixed() && instr->operands[0].isTemp() && instr->operands[0].regClass() == instr->definitions[0].regClass() && diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index c839e0696a3..2d7f848ed94 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -853,7 +853,8 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr) case aco_opcode::p_create_vector: case aco_opcode::p_split_vector: case aco_opcode::p_extract_vector: - case aco_opcode::p_phi: { + case aco_opcode::p_phi: + case aco_opcode::p_parallelcopy: { const bool all_vgpr = std::none_of(instr->definitions.begin(), instr->definitions.end(), [] (const Definition& def) { return def.getTemp().type() != RegType::vgpr;}); if (all_vgpr) { @@ -1212,6 +1213,7 @@ void label_instruction(opt_ctx &ctx, Block& block, aco_ptr<Instruction>& instr) case aco_opcode::s_mov_b32: /* propagate */ case aco_opcode::s_mov_b64: case aco_opcode::v_mov_b32: + case aco_opcode::p_parallelcopy: if (instr->operands[0].isTemp() && ctx.info[instr->operands[0].tempId()].is_vec() && instr->operands[0].regClass() != instr->definitions[0].regClass()) { /* We might not be able to copy-propagate if it's a SGPR->VGPR copy, so diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp index 4196b652405..67c6712e0e4 100644 --- a/src/amd/compiler/aco_spill.cpp +++ b/src/amd/compiler/aco_spill.cpp @@ -245,8 +245,9 @@ bool should_rematerialize(aco_ptr<Instruction>& instr) /* TODO: rematerialization is only supported for VOP1, SOP1 and PSEUDO */ if (instr->format != Format::VOP1 && instr->format != Format::SOP1 && instr->format != Format::PSEUDO && instr->format != Format::SOPK) return false; - /* TODO: pseudo-instruction rematerialization is only supported for p_create_vector */ - if (instr->format == Format::PSEUDO && instr->opcode != aco_opcode::p_create_vector) + /* TODO: pseudo-instruction rematerialization is only supported for p_create_vector/p_parallelcopy */ + if (instr->format == Format::PSEUDO && instr->opcode != aco_opcode::p_create_vector && + instr->opcode != aco_opcode::p_parallelcopy) return false; if (instr->format == Format::SOPK && instr->opcode != aco_opcode::s_movk_i32) return false; @@ -270,7 +271,7 @@ aco_ptr<Instruction> do_reload(spill_ctx& ctx, Temp tmp, Temp new_name, uint32_t if (remat != ctx.remat.end()) { Instruction *instr = remat->second.instr; assert((instr->format == Format::VOP1 || instr->format == Format::SOP1 || instr->format == Format::PSEUDO || instr->format == Format::SOPK) && "unsupported"); - assert((instr->format != Format::PSEUDO || instr->opcode == aco_opcode::p_create_vector) && "unsupported"); + assert((instr->format != Format::PSEUDO || instr->opcode == aco_opcode::p_create_vector || instr->opcode == aco_opcode::p_parallelcopy) && "unsupported"); assert(instr->definitions.size() == 1 && "unsupported"); aco_ptr<Instruction> res; diff --git a/src/amd/compiler/tests/test_isel.cpp b/src/amd/compiler/tests/test_isel.cpp index f45fe4311c0..48425b71a03 100644 --- a/src/amd/compiler/tests/test_isel.cpp +++ b/src/amd/compiler/tests/test_isel.cpp @@ -67,9 +67,9 @@ BEGIN_TEST(isel.compute.simple) uint res; }; void main() { - //~gfx7>> v1: %data = v_mov_b32 42 + //~gfx7>> v1: %data = p_parallelcopy 42 //~gfx7>> buffer_store_dword %_, v1: undef, 0, %data disable_wqm storage:buffer semantics: scope:invocation - //~gfx8>> s1: %data = s_mov_b32 42 + //~gfx8>> s1: %data = p_parallelcopy 42 //~gfx8>> s_buffer_store_dword %_, 0, %data storage:buffer semantics: scope:invocation res = 42; } _______________________________________________ mesa-commit mailing list [email protected] https://lists.freedesktop.org/mailman/listinfo/mesa-commit
