Module: Mesa
Branch: main
Commit: b4383821e7ec10d2d6c3cfec6eb8fe54dddb0d38
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=b4383821e7ec10d2d6c3cfec6eb8fe54dddb0d38

Author: Rhys Perry <[email protected]>
Date:   Tue Feb  7 19:45:55 2023 +0000

aco: don't modify exec in p_interp_gfx11

The RDNA3 ISA docs say that lds_param_load writes the entire quad
regardless of exec, so saving exec, switching it to WQM and restoring it
around the load isn't needed.

fossil-db (gfx1100):
Totals from 5291 (3.93% of 134574) affected shaders:
Instrs: 4891396 -> 4789628 (-2.08%)
CodeSize: 25519032 -> 25111960 (-1.60%)
Latency: 36122982 -> 36074300 (-0.13%); split: -0.14%, +0.00%
InvThroughput: 4162436 -> 4161424 (-0.02%); split: -0.02%, +0.00%
Copies: 263862 -> 263838 (-0.01%)
PreSGPRs: 225012 -> 224179 (-0.37%)

Signed-off-by: Rhys Perry <[email protected]>
Reviewed-by: Georg Lehmann <[email protected]>
Reviewed-by: Timur Kristóf <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21171>

---

 src/amd/compiler/aco_builder_h.py              |  2 +-
 src/amd/compiler/aco_instruction_selection.cpp | 11 +++++------
 src/amd/compiler/aco_lower_to_hw_instr.cpp     |  8 --------
 3 files changed, 6 insertions(+), 15 deletions(-)

diff --git a/src/amd/compiler/aco_builder_h.py 
b/src/amd/compiler/aco_builder_h.py
index f7552fd456e..816bc33aa02 100644
--- a/src/amd/compiler/aco_builder_h.py
+++ b/src/amd/compiler/aco_builder_h.py
@@ -530,7 +530,7 @@ public:
    }
 <%
 import itertools
-formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', 
list(itertools.product(range(5), range(6))) + [(8, 1), (1, 8), (2, 6), (3,6)]),
+formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', 
list(itertools.product(range(5), range(6))) + [(8, 1), (1, 8), (2, 6), (3, 6), 
(1, 6)]),
            ("sop1", [Format.SOP1], 'SOP1_instruction', [(0, 1), (1, 0), (1, 
1), (2, 1), (3, 2)]),
            ("sop2", [Format.SOP2], 'SOP2_instruction', itertools.product([1, 
2], [2, 3])),
            ("sopk", [Format.SOPK], 'SOPK_instruction', itertools.product([0, 
1, 2], [0, 1])),
diff --git a/src/amd/compiler/aco_instruction_selection.cpp 
b/src/amd/compiler/aco_instruction_selection.cpp
index d135e5d98b8..b67b173abab 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -5335,9 +5335,8 @@ emit_interp_instr_gfx11(isel_context* ctx, unsigned idx, 
unsigned component, Tem
       prim_mask_op.setLateKill(true); /* we don't want the bld.lm definition 
to use m0 */
       Operand coord2_op(coord2);
       coord2_op.setLateKill(true); /* we re-use the destination reg in the 
middle */
-      bld.pseudo(aco_opcode::p_interp_gfx11, Definition(dst), bld.def(bld.lm), 
bld.def(s1, scc),
-                 Operand(v1.as_linear()), Operand::c32(idx), 
Operand::c32(component), coord1,
-                 coord2_op, prim_mask_op);
+      bld.pseudo(aco_opcode::p_interp_gfx11, Definition(dst), 
Operand(v1.as_linear()),
+                 Operand::c32(idx), Operand::c32(component), coord1, 
coord2_op, prim_mask_op);
       return;
    }
 
@@ -5416,9 +5415,9 @@ emit_interp_mov_instr(isel_context* ctx, unsigned idx, 
unsigned component, unsig
       if (in_exec_divergent_or_in_loop(ctx)) {
          Operand prim_mask_op = bld.m0(prim_mask);
          prim_mask_op.setLateKill(true); /* we don't want the bld.lm 
definition to use m0 */
-         bld.pseudo(aco_opcode::p_interp_gfx11, Definition(dst), 
bld.def(bld.lm), bld.def(s1, scc),
-                    Operand(v1.as_linear()), Operand::c32(idx), 
Operand::c32(component),
-                    Operand::c32(dpp_ctrl), prim_mask_op);
+         bld.pseudo(aco_opcode::p_interp_gfx11, Definition(dst), 
Operand(v1.as_linear()),
+                    Operand::c32(idx), Operand::c32(component), 
Operand::c32(dpp_ctrl),
+                    prim_mask_op);
       } else {
          Temp p =
             bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), 
bld.m0(prim_mask), idx, component);
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp 
b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index ac5f95361f4..a31e9914935 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -2507,15 +2507,11 @@ lower_to_hw_instr(Program* program)
             case aco_opcode::p_interp_gfx11: {
                assert(instr->definitions[0].regClass() == v1 ||
                       instr->definitions[0].regClass() == v2b);
-               assert(instr->definitions[1].regClass() == bld.lm);
-               assert(instr->definitions[2].isFixed() && 
instr->definitions[2].physReg() == scc);
                assert(instr->operands[0].regClass() == v1.as_linear());
                assert(instr->operands[1].isConstant());
                assert(instr->operands[2].isConstant());
                assert(instr->operands.back().physReg() == m0);
                Definition dst = instr->definitions[0];
-               PhysReg exec_tmp = instr->definitions[1].physReg();
-               Definition clobber_scc = instr->definitions[2];
                PhysReg lin_vgpr = instr->operands[0].physReg();
                unsigned attribute = instr->operands[1].constantValue();
                unsigned component = instr->operands[2].constantValue();
@@ -2531,12 +2527,8 @@ lower_to_hw_instr(Program* program)
                   dpp_ctrl = instr->operands[3].constantValue();
                }
 
-               bld.sop1(Builder::s_mov, Definition(exec_tmp, bld.lm), 
Operand(exec, bld.lm));
-               bld.sop1(Builder::s_wqm, Definition(exec, bld.lm), clobber_scc,
-                        Operand(exec, bld.lm));
                bld.ldsdir(aco_opcode::lds_param_load, Definition(lin_vgpr, 
v1), Operand(m0, s1),
                           attribute, component);
-               bld.sop1(Builder::s_mov, Definition(exec, bld.lm), 
Operand(exec_tmp, bld.lm));
 
                Operand p(lin_vgpr, v1);
                Operand dst_op(dst.physReg(), v1);

Reply via email to