Module: Mesa
Branch: main
Commit: dce695b24f10a2bb01d46aa1c1156f9ce9300b34
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=dce695b24f10a2bb01d46aa1c1156f9ce9300b34
Author: Daniel Schürmann <dan...@schuermann.dev>
Date:   Fri Jan 5 08:22:36 2024 +0100

aco: refactor and speed-up dead code analysis

Assuming that no loop header phis are dead code,
we can perform the dead code analysis in a single iteration.

Totals from 25 (0.03% of 79330) affected shaders: (GFX11)
MaxWaves: 664 -> 662 (-0.30%)
Instrs: 487618 -> 488822 (+0.25%)
CodeSize: 2451548 -> 2459756 (+0.33%)
VGPRs: 1296 -> 1332 (+2.78%)
Latency: 2337256 -> 2338098 (+0.04%); split: -0.00%, +0.04%
InvThroughput: 560682 -> 576158 (+2.76%)
VClause: 15782 -> 15790 (+0.05%)
Copies: 37905 -> 38731 (+2.18%)
PreVGPRs: 1124 -> 1156 (+2.85%)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26901>

---

 src/amd/compiler/aco_dead_code_analysis.cpp | 67 +++++++++++++----------------
 src/amd/compiler/aco_insert_exec_mask.cpp   |  1 +
 2 files changed, 30 insertions(+), 38 deletions(-)

diff --git a/src/amd/compiler/aco_dead_code_analysis.cpp b/src/amd/compiler/aco_dead_code_analysis.cpp
index e1dbcd53621..18313a2f350 100644
--- a/src/amd/compiler/aco_dead_code_analysis.cpp
+++ b/src/amd/compiler/aco_dead_code_analysis.cpp
@@ -30,52 +30,41 @@
 /*
  * Implements an analysis pass to determine the number of uses
  * for each SSA-definition.
+ *
+ * This pass assumes that no loop header phis are dead code.
  */
 
 namespace aco {
 namespace {
 
-struct dce_ctx {
-   int current_block;
-   std::vector<uint16_t> uses;
-   std::vector<std::vector<bool>> live;
-
-   dce_ctx(Program* program)
-       : current_block(program->blocks.size() - 1), uses(program->peekAllocationId())
-   {
-      live.reserve(program->blocks.size());
-      for (Block& block : program->blocks)
-         live.emplace_back(block.instructions.size());
+void
+process_loop_header_phis(std::vector<uint16_t>& uses, Block& block)
+{
+   for (aco_ptr<Instruction>& instr : block.instructions) {
+      if (!is_phi(instr))
+         return;
+      for (const Operand& op : instr->operands) {
+         if (op.isTemp())
+            uses[op.tempId()]++;
+      }
    }
-};
+}
 
 void
-process_block(dce_ctx& ctx, Block& block)
+process_block(std::vector<uint16_t>& uses, Block& block)
 {
-   std::vector<bool>& live = ctx.live[block.index];
-   assert(live.size() == block.instructions.size());
-   bool process_predecessors = false;
-   for (int idx = block.instructions.size() - 1; idx >= 0; idx--) {
-      if (live[idx])
-         continue;
+   for (auto it = block.instructions.rbegin(); it != block.instructions.rend(); it++) {
+      aco_ptr<Instruction>& instr = *it;
+      if ((block.kind & block_kind_loop_header) && is_phi(instr))
+         break;
 
-      aco_ptr<Instruction>& instr = block.instructions[idx];
-      if (!is_dead(ctx.uses, instr.get())) {
+      if (!is_dead(uses, instr.get())) {
          for (const Operand& op : instr->operands) {
-            if (op.isTemp()) {
-               if (ctx.uses[op.tempId()] == 0)
-                  process_predecessors = true;
-               ctx.uses[op.tempId()]++;
-            }
+            if (op.isTemp())
+               uses[op.tempId()]++;
          }
-         live[idx] = true;
       }
    }
-
-   if (process_predecessors) {
-      for (unsigned pred_idx : block.linear_preds)
-         ctx.current_block = std::max(ctx.current_block, (int)pred_idx);
-   }
 }
 
 } /* end namespace */
@@ -83,15 +72,17 @@ process_block(dce_ctx& ctx, Block& block)
 std::vector<uint16_t>
 dead_code_analysis(Program* program)
 {
+   std::vector<uint16_t> uses(program->peekAllocationId());
 
-   dce_ctx ctx(program);
-
-   while (ctx.current_block >= 0) {
-      unsigned next_block = ctx.current_block--;
-      process_block(ctx, program->blocks[next_block]);
+   for (Block& block : program->blocks) {
+      if (block.kind & block_kind_loop_header)
+         process_loop_header_phis(uses, block);
    }
 
-   return ctx.uses;
+   for (auto it = program->blocks.rbegin(); it != program->blocks.rend(); it++)
+      process_block(uses, *it);
+
+   return uses;
 }
 
 } // namespace aco
diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp
index d20bc29b120..4f4fb7eba5e 100644
--- a/src/amd/compiler/aco_insert_exec_mask.cpp
+++ b/src/amd/compiler/aco_insert_exec_mask.cpp
@@ -251,6 +251,7 @@ add_coupling_code(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>
 
       /* create ssa name for restore mask */
       if (info.has_divergent_break) {
+         // TODO: this phi is unnecessary if we end WQM immediately after the loop
          /* this phi might be trivial but ensures a parallelcopy on the loop header */
          aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(
             aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};