Mesa (main): aco: refactor and speed-up dead code analysis

GitLab Mirror Mon, 08 Jan 2024 02:00:36 -0800

Module: Mesa
Branch: main
Commit: dce695b24f10a2bb01d46aa1c1156f9ce9300b34
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=dce695b24f10a2bb01d46aa1c1156f9ce9300b34


Author: Daniel Schürmann <dan...@schuermann.dev>
Date:   Fri Jan  5 08:22:36 2024 +0100

aco: refactor and speed-up dead code analysis

Assuming that no loop header phis are dead code,
we can perform the dead code analysis in a single iteration.

Totals from 25 (0.03% of 79330) affected shaders: (GFX11)

MaxWaves: 664 -> 662 (-0.30%)
Instrs: 487618 -> 488822 (+0.25%)
CodeSize: 2451548 -> 2459756 (+0.33%)
VGPRs: 1296 -> 1332 (+2.78%)
Latency: 2337256 -> 2338098 (+0.04%); split: -0.00%, +0.04%
InvThroughput: 560682 -> 576158 (+2.76%)
VClause: 15782 -> 15790 (+0.05%)
Copies: 37905 -> 38731 (+2.18%)
PreVGPRs: 1124 -> 1156 (+2.85%)
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26901>

---

 src/amd/compiler/aco_dead_code_analysis.cpp | 67 +++++++++++++----------------
 src/amd/compiler/aco_insert_exec_mask.cpp   |  1 +
 2 files changed, 30 insertions(+), 38 deletions(-)

diff --git a/src/amd/compiler/aco_dead_code_analysis.cpp b/src/amd/compiler/aco_dead_code_analysis.cpp
index e1dbcd53621..18313a2f350 100644
--- a/src/amd/compiler/aco_dead_code_analysis.cpp
+++ b/src/amd/compiler/aco_dead_code_analysis.cpp
@@ -30,52 +30,41 @@
 /*
  * Implements an analysis pass to determine the number of uses
  * for each SSA-definition.
+ *
+ * This pass assumes that no loop header phis are dead code.
  */
 
 namespace aco {
 namespace {
 
-struct dce_ctx {
-   int current_block;
-   std::vector<uint16_t> uses;
-   std::vector<std::vector<bool>> live;
-
-   dce_ctx(Program* program)
-       : current_block(program->blocks.size() - 1), uses(program->peekAllocationId())
-   {
-      live.reserve(program->blocks.size());
-      for (Block& block : program->blocks)
-         live.emplace_back(block.instructions.size());
+void
+process_loop_header_phis(std::vector<uint16_t>& uses, Block& block)
+{
+   for (aco_ptr<Instruction>& instr : block.instructions) {
+      if (!is_phi(instr))
+         return;
+      for (const Operand& op : instr->operands) {
+         if (op.isTemp())
+            uses[op.tempId()]++;
+      }
    }
-};
+}
 
 void
-process_block(dce_ctx& ctx, Block& block)
+process_block(std::vector<uint16_t>& uses, Block& block)
 {
-   std::vector<bool>& live = ctx.live[block.index];
-   assert(live.size() == block.instructions.size());
-   bool process_predecessors = false;
-   for (int idx = block.instructions.size() - 1; idx >= 0; idx--) {
-      if (live[idx])
-         continue;
+   for (auto it = block.instructions.rbegin(); it != block.instructions.rend(); it++) {
+      aco_ptr<Instruction>& instr = *it;
+      if ((block.kind & block_kind_loop_header) && is_phi(instr))
+         break;
 
-      aco_ptr<Instruction>& instr = block.instructions[idx];
-      if (!is_dead(ctx.uses, instr.get())) {
+      if (!is_dead(uses, instr.get())) {
          for (const Operand& op : instr->operands) {
-            if (op.isTemp()) {
-               if (ctx.uses[op.tempId()] == 0)
-                  process_predecessors = true;
-               ctx.uses[op.tempId()]++;
-            }
+            if (op.isTemp())
+               uses[op.tempId()]++;
          }
-         live[idx] = true;
       }
    }
-
-   if (process_predecessors) {
-      for (unsigned pred_idx : block.linear_preds)
-         ctx.current_block = std::max(ctx.current_block, (int)pred_idx);
-   }
 }
 
 } /* end namespace */
@@ -83,15 +72,17 @@ process_block(dce_ctx& ctx, Block& block)
 std::vector<uint16_t>
 dead_code_analysis(Program* program)
 {
+   std::vector<uint16_t> uses(program->peekAllocationId());
 
-   dce_ctx ctx(program);
-
-   while (ctx.current_block >= 0) {
-      unsigned next_block = ctx.current_block--;
-      process_block(ctx, program->blocks[next_block]);
+   for (Block& block : program->blocks) {
+      if (block.kind & block_kind_loop_header)
+         process_loop_header_phis(uses, block);
    }
 
-   return ctx.uses;
+   for (auto it = program->blocks.rbegin(); it != program->blocks.rend(); it++)
+      process_block(uses, *it);
+
+   return uses;
 }
 
 } // namespace aco
diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp
index d20bc29b120..4f4fb7eba5e 100644
--- a/src/amd/compiler/aco_insert_exec_mask.cpp
+++ b/src/amd/compiler/aco_insert_exec_mask.cpp
@@ -251,6 +251,7 @@ add_coupling_code(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>
 
       /* create ssa name for restore mask */
       if (info.has_divergent_break) {
+         // TODO: this phi is unnecessary if we end WQM immediately after the loop
          /* this phi might be trivial but ensures a parallelcopy on the loop header */
          aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(
             aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};

Mesa (main): aco: refactor and speed-up dead code analysis

Reply via email to