Module: Mesa
Branch: main
Commit: e83d8e13668076bf1b55b2cea85496c6fdc5309e
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=e83d8e13668076bf1b55b2cea85496c6fdc5309e

Author: Daniel Schürmann <dan...@schuermann.dev>
Date:   Sat Jan  6 12:59:51 2024 +0100

aco/insert_exec_mask: replace phi for loop restore mask with explicit copies

Totals from 1785 (2.25% of 79242) affected shaders: (GFX11)

Instrs: 6787574 -> 6787041 (-0.01%); split: -0.01%, +0.00%
CodeSize: 34906500 -> 34904704 (-0.01%); split: -0.01%, +0.01%
SpillSGPRs: 5848 -> 5816 (-0.55%)
Latency: 88616877 -> 88617209 (+0.00%); split: -0.00%, +0.00%
InvThroughput: 16644948 -> 16644717 (-0.00%); split: -0.00%, +0.00%
VClause: 141122 -> 141121 (-0.00%)
SClause: 178929 -> 178906 (-0.01%); split: -0.03%, +0.02%
Copies: 569444 -> 569081 (-0.06%); split: -0.09%, +0.03%
Branches: 186980 -> 186961 (-0.01%); split: -0.01%, +0.00%
PreSGPRs: 133648 -> 133369 (-0.21%)
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26937>

---

 src/amd/compiler/aco_insert_exec_mask.cpp | 53 +++++++++++--------------------
 1 file changed, 18 insertions(+), 35 deletions(-)

diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp
index 37201622eef..c3a8f09f018 100644
--- a/src/amd/compiler/aco_insert_exec_mask.cpp
+++ b/src/amd/compiler/aco_insert_exec_mask.cpp
@@ -235,8 +235,7 @@ add_coupling_code(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>
       assert(preds[0] == idx - 1);
       ctx.info[idx].exec = ctx.info[idx - 1].exec;
       loop_info& info = ctx.loop.back();
-      while (ctx.info[idx].exec.size() > info.num_exec_masks)
-         ctx.info[idx].exec.pop_back();
+      assert(ctx.info[idx].exec.size() == info.num_exec_masks);
 
       /* create ssa names for outer exec masks */
       if (info.has_discard) {
@@ -250,17 +249,6 @@ add_coupling_code(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>
          }
       }
 
-      /* create ssa name for restore mask */
-      if (info.has_divergent_break) {
-         // TODO: this phi is unnecessary if we end WQM immediately after the 
loop
-         /* this phi might be trivial but ensures a parallelcopy on the loop 
header */
-         aco_ptr<Pseudo_instruction> 
phi{create_instruction<Pseudo_instruction>(
-            aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
-         phi->definitions[0] = bld.def(bld.lm);
-         phi->operands[0] = 
get_exec_op(ctx.info[preds[0]].exec[info.num_exec_masks - 1].first);
-         ctx.info[idx].exec.back().first = bld.insert(std::move(phi));
-      }
-
       /* create ssa name for loop active mask */
       aco_ptr<Pseudo_instruction> phi{create_instruction<Pseudo_instruction>(
          aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
@@ -269,16 +257,8 @@ add_coupling_code(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>
       else
          phi->definitions[0] = Definition(exec, bld.lm);
       phi->operands[0] = get_exec_op(ctx.info[preds[0]].exec.back().first);
-      Temp loop_active = bld.insert(std::move(phi));
-
-      if (info.has_divergent_break) {
-         uint8_t mask_type =
-            (ctx.info[idx].exec.back().second & (mask_type_wqm | 
mask_type_exact)) | mask_type_loop;
-         ctx.info[idx].exec.emplace_back(loop_active, mask_type);
-      } else {
-         ctx.info[idx].exec.back().first = Operand(loop_active);
-         ctx.info[idx].exec.back().second |= mask_type_loop;
-      }
+      ctx.info[idx].exec.back().first = bld.insert(std::move(phi));
+      ctx.info[idx].exec.back().second |= mask_type_loop;
 
       /* create a parallelcopy to move the active mask to exec */
       if (info.has_divergent_continue) {
@@ -318,13 +298,9 @@ add_coupling_code(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>
 
       if (info.has_divergent_break) {
          restore_exec = true;
-         aco_ptr<Instruction>& phi = header->instructions[instr_idx];
-         assert(phi->opcode == aco_opcode::p_linear_phi);
-         for (unsigned i = 1; i < phi->operands.size(); i++)
-            phi->operands[i] =
-               
get_exec_op(ctx.info[header_preds[i]].exec[info.num_exec_masks].first);
+         /* Drop the loop active mask. */
+         info.num_exec_masks--;
       }
-
       assert(!(block->kind & block_kind_top_level) || info.num_exec_masks <= 
2);
 
       /* create the loop exit phis if not trivial */
@@ -345,10 +321,6 @@ add_coupling_code(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>
             aco_ptr<Pseudo_instruction> 
phi{create_instruction<Pseudo_instruction>(
                aco_opcode::p_linear_phi, Format::PSEUDO, preds.size(), 1)};
             phi->definitions[0] = bld.def(bld.lm);
-            if (exec_idx == info.num_exec_masks - 1u) {
-               phi->definitions[0] = Definition(exec, bld.lm);
-               restore_exec = false;
-            }
             for (unsigned i = 0; i < phi->operands.size(); i++)
                phi->operands[i] = 
get_exec_op(ctx.info[preds[i]].exec[exec_idx].first);
             ctx.info[idx].exec.emplace_back(bld.insert(std::move(phi)), type);
@@ -659,9 +631,20 @@ add_branch_code(exec_ctx& ctx, Block* block)
             has_divergent_continue = true;
       }
 
+      if (has_divergent_break) {
+         /* save restore exec mask */
+         uint8_t mask = ctx.info[idx].exec.back().second;
+         if (ctx.info[idx].exec.back().first.constantEquals(-1u)) {
+            ctx.info[idx].exec.emplace_back(Operand(exec, bld.lm), mask);
+         } else {
+            bld.reset(bld.instructions, std::prev(bld.instructions->end()));
+            Operand restore = bld.copy(bld.def(bld.lm), Operand(exec, bld.lm));
+            ctx.info[idx].exec.emplace(std::prev(ctx.info[idx].exec.end()), 
restore, mask);
+            bld.reset(bld.instructions);
+         }
+         ctx.info[idx].exec.back().second &= (mask_type_wqm | mask_type_exact);
+      }
       unsigned num_exec_masks = ctx.info[idx].exec.size();
-      if (block->kind & block_kind_top_level)
-         num_exec_masks = std::min(num_exec_masks, 2u);
 
       ctx.loop.emplace_back(&ctx.program->blocks[block->linear_succs[0]], 
num_exec_masks,
                             has_divergent_break, has_divergent_continue, 
has_discard);

Reply via email to