Module: Mesa
Branch: main
Commit: e83d8e13668076bf1b55b2cea85496c6fdc5309e
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=e83d8e13668076bf1b55b2cea85496c6fdc5309e
Author: Daniel Schürmann <dan...@schuermann.dev>
Date: Sat Jan 6 12:59:51 2024 +0100

aco/insert_exec_mask: replace phi for loop restore mask with explicit copies

Totals from 1785 (2.25% of 79242) affected shaders: (GFX11)
Instrs: 6787574 -> 6787041 (-0.01%); split: -0.01%, +0.00%
CodeSize: 34906500 -> 34904704 (-0.01%); split: -0.01%, +0.01%
SpillSGPRs: 5848 -> 5816 (-0.55%)
Latency: 88616877 -> 88617209 (+0.00%); split: -0.00%, +0.00%
InvThroughput: 16644948 -> 16644717 (-0.00%); split: -0.00%, +0.00%
VClause: 141122 -> 141121 (-0.00%)
SClause: 178929 -> 178906 (-0.01%); split: -0.03%, +0.02%
Copies: 569444 -> 569081 (-0.06%); split: -0.09%, +0.03%
Branches: 186980 -> 186961 (-0.01%); split: -0.01%, +0.00%
PreSGPRs: 133648 -> 133369 (-0.21%)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26937>

---

 src/amd/compiler/aco_insert_exec_mask.cpp | 53 +++++++++++--------------------
 1 file changed, 18 insertions(+), 35 deletions(-)

diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp
index 37201622eef..c3a8f09f018 100644
--- a/src/amd/compiler/aco_insert_exec_mask.cpp
+++ b/src/amd/compiler/aco_insert_exec_mask.cpp
@@ -235,8 +235,7 @@ add_coupling_code(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>
       assert(preds[0] == idx - 1);
       ctx.info[idx].exec = ctx.info[idx - 1].exec;
       loop_info& info = ctx.loop.back();
-      while (ctx.info[idx].exec.size() > info.num_exec_masks)
-         ctx.info[idx].exec.pop_back();
+      assert(ctx.info[idx].exec.size() == info.num_exec_masks);
 
       /* create ssa names for outer exec masks */
       if (info.has_discard) {
@@ -250,17 +249,6 @@ add_coupling_code(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>
          }
       }
 
-      /* create ssa name for restore mask */
-      if (info.has_divergent_break) {
-         // TODO: this phi is unnecessary if we end WQM immediately after the loop
-         /* this phi might be trivial but ensures a parallelcopy on the loop header */
-         aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(
-            aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
-         phi->definitions[0] = bld.def(bld.lm);
-         phi->operands[0] = get_exec_op(ctx.info[preds[0]].exec[info.num_exec_masks - 1].first);
-         ctx.info[idx].exec.back().first = bld.insert(std::move(phi));
-      }
-
       /* create ssa name for loop active mask */
       aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(
          aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
@@ -269,16 +257,8 @@ add_coupling_code(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>
       else
          phi->definitions[0] = Definition(exec, bld.lm);
       phi->operands[0] = get_exec_op(ctx.info[preds[0]].exec.back().first);
-      Temp loop_active = bld.insert(std::move(phi));
-
-      if (info.has_divergent_break) {
-         uint8_t mask_type =
-            (ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact)) | mask_type_loop;
-         ctx.info[idx].exec.emplace_back(loop_active, mask_type);
-      } else {
-         ctx.info[idx].exec.back().first = Operand(loop_active);
-         ctx.info[idx].exec.back().second |= mask_type_loop;
-      }
+      ctx.info[idx].exec.back().first = bld.insert(std::move(phi));
+      ctx.info[idx].exec.back().second |= mask_type_loop;
 
       /* create a parallelcopy to move the active mask to exec */
       if (info.has_divergent_continue) {
@@ -318,13 +298,9 @@ add_coupling_code(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>
 
       if (info.has_divergent_break) {
          restore_exec = true;
-         aco_ptr<Instruction>& phi = header->instructions[instr_idx];
-         assert(phi->opcode == aco_opcode::p_linear_phi);
-         for (unsigned i = 1; i < phi->operands.size(); i++)
-            phi->operands[i] =
-               get_exec_op(ctx.info[header_preds[i]].exec[info.num_exec_masks].first);
+         /* Drop the loop active mask. */
+         info.num_exec_masks--;
       }
-
       assert(!(block->kind & block_kind_top_level) || info.num_exec_masks <= 2);
 
       /* create the loop exit phis if not trivial */
@@ -345,10 +321,6 @@ add_coupling_code(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>
             aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(
                aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
             phi->definitions[0] = bld.def(bld.lm);
-            if (exec_idx == info.num_exec_masks - 1u) {
-               phi->definitions[0] = Definition(exec, bld.lm);
-               restore_exec = false;
-            }
             for (unsigned i = 0; i < phi->operands.size(); i++)
                phi->operands[i] = get_exec_op(ctx.info[preds[i]].exec[exec_idx].first);
             ctx.info[idx].exec.emplace_back(bld.insert(std::move(phi)), type);
@@ -659,9 +631,20 @@ add_branch_code(exec_ctx& ctx, Block* block)
             has_divergent_continue = true;
       }
 
+      if (has_divergent_break) {
+         /* save restore exec mask */
+         uint8_t mask = ctx.info[idx].exec.back().second;
+         if (ctx.info[idx].exec.back().first.constantEquals(-1u)) {
+            ctx.info[idx].exec.emplace_back(Operand(exec, bld.lm), mask);
+         } else {
+            bld.reset(bld.instructions, std::prev(bld.instructions->end()));
+            Operand restore = bld.copy(bld.def(bld.lm), Operand(exec, bld.lm));
+            ctx.info[idx].exec.emplace(std::prev(ctx.info[idx].exec.end()), restore, mask);
+            bld.reset(bld.instructions);
+         }
+         ctx.info[idx].exec.back().second &= (mask_type_wqm | mask_type_exact);
+      }
       unsigned num_exec_masks = ctx.info[idx].exec.size();
-      if (block->kind & block_kind_top_level)
-         num_exec_masks = std::min(num_exec_masks, 2u);
 
       ctx.loop.emplace_back(&ctx.program->blocks[block->linear_succs[0]], num_exec_masks,
                             has_divergent_break, has_divergent_continue, has_discard);