Module: Mesa
Branch: staging/23.1
Commit: 3aa5e03ca101c64584d1f1d63c32eb8282602574
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=3aa5e03ca101c64584d1f1d63c32eb8282602574

Author: Rhys Perry <[email protected]>
Date:   Fri Sep 29 11:01:45 2023 +0100

aco/optimizer_postRA: don't combine DPP across exec on GFX8/9

GFX8/9 seem to use FI=0 behaviour.

fossil-db (vega10):
Totals from 1 (0.00% of 63053) affected shaders:
Instrs: 542 -> 570 (+5.17%)
CodeSize: 2928 -> 3040 (+3.83%)
Latency: 2087 -> 2118 (+1.49%)
InvThroughput: 1103 -> 1143 (+3.63%)

Affected shader is from Cyberpunk 2077 fossil.

Signed-off-by: Rhys Perry <[email protected]>
Reviewed-by: Georg Lehmann <[email protected]>
Cc: 23.2 <mesa-stable>
Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/9784
(cherry picked from commit e64f895e08f39e0f2c42df1f2aac9f92f94cefd1)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25479>

---

 src/amd/compiler/aco_assembler.cpp               |  2 +-
 src/amd/compiler/aco_optimizer_postRA.cpp        |  5 +++++
 src/amd/compiler/tests/test_optimizer_postRA.cpp | 28 ++++++++++++++++++++++++
 3 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp
index e6e3de055bd..c0a4a49add0 100644
--- a/src/amd/compiler/aco_assembler.cpp
+++ b/src/amd/compiler/aco_assembler.cpp
@@ -858,7 +858,7 @@ emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* inst
          encoding |= dpp.abs[0] << 21;
          encoding |= dpp.neg[0] << 20;
          if (ctx.gfx_level >= GFX10)
-            encoding |= 1 << 18; /* set Fetch Inactive to match GFX9 behaviour */
+            encoding |= 1 << 18; /* set Fetch Inactive */
          encoding |= dpp.bound_ctrl << 19;
          encoding |= dpp.dpp_ctrl << 8;
          encoding |= reg(ctx, dpp_op, 8);
diff --git a/src/amd/compiler/aco_optimizer_postRA.cpp b/src/amd/compiler/aco_optimizer_postRA.cpp
index 3f692663f16..d499f53c5e1 100644
--- a/src/amd/compiler/aco_optimizer_postRA.cpp
+++ b/src/amd/compiler/aco_optimizer_postRA.cpp
@@ -510,6 +510,11 @@ try_combine_dpp(pr_opt_ctx& ctx, aco_ptr<Instruction>& instr)
       if (is_overwritten_since(ctx, mov->operands[0], op_instr_idx))
          continue;
 
+      /* GFX8/9 don't have fetch-inactive. */
+      if (ctx.program->gfx_level < GFX10 &&
+          is_overwritten_since(ctx, Operand(exec, ctx.program->lane_mask), op_instr_idx))
+         continue;
+
       if (i && !can_swap_operands(instr, &instr->opcode))
          continue;
 
diff --git a/src/amd/compiler/tests/test_optimizer_postRA.cpp b/src/amd/compiler/tests/test_optimizer_postRA.cpp
index d6cc3208ecb..487dfb36ad2 100644
--- a/src/amd/compiler/tests/test_optimizer_postRA.cpp
+++ b/src/amd/compiler/tests/test_optimizer_postRA.cpp
@@ -468,6 +468,34 @@ BEGIN_TEST(optimizer_postRA.dpp)
    finish_optimizer_postRA_test();
 END_TEST
 
+BEGIN_TEST(optimizer_postRA.dpp_across_exec)
+   for (amd_gfx_level gfx : {GFX9, GFX10}) {
+      //>> v1: %a:v[0], v1: %b:v[1] = p_startpgm
+      if (!setup_cs("v1 v1", gfx))
+         continue;
+
+      bld.instructions->at(0)->definitions[0].setFixed(PhysReg(256));
+      bld.instructions->at(0)->definitions[1].setFixed(PhysReg(257));
+
+      PhysReg reg_v2(258);
+      Operand a(inputs[0], PhysReg(256));
+      Operand b(inputs[1], PhysReg(257));
+
+      //~gfx9! v1: %tmp0:v[2] = v_mov_b32 %a:v[0] row_mirror bound_ctrl:1
+      //! s2: %0:exec,  s1: %0:scc = s_not_b64 %0:exec
+      //~gfx9! v1: %res0:v[2] = v_add_f32 %tmp0:v[2], %b:v[1]
+      //~gfx10! v1: %res0:v[2] = v_add_f32 %a:v[0], %b:v[1] row_mirror bound_ctrl:1
+      //! p_unit_test 0, %res0:v[2]
+      Temp tmp0 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1, reg_v2), a, dpp_row_mirror);
+      bld.sop1(Builder::s_not, Definition(exec, bld.lm), Definition(scc, s1),
+               Operand(exec, bld.lm));
+      Temp res0 = bld.vop2(aco_opcode::v_add_f32, bld.def(v1, reg_v2), Operand(tmp0, reg_v2), b);
+      writeout(0, Operand(res0, reg_v2));
+
+      finish_optimizer_postRA_test();
+   }
+END_TEST
+
 BEGIN_TEST(optimizer_postRA.dpp_across_cf)
    //>> v1: %a:v[0], v1: %b:v[1], v1: %c:v[2], v1: %d:v[3], s2: %e:s[0-1] = p_startpgm
    if (!setup_cs("v1 v1 v1 v1 s2", GFX10_3))

Reply via email to