Module: Mesa
Branch: main
Commit: 16d2c7ad557b46104f91365ab3405f0a3ed7e36d
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=16d2c7ad557b46104f91365ab3405f0a3ed7e36d

Author: Rhys Perry <[email protected]>
Date:   Wed Oct 26 21:13:15 2022 +0100

aco/gfx11: perform FS input loads in WQM

fossil-db (gfx1100):
Totals from 48184 (35.68% of 135032) affected shaders:
MaxWaves: 1131876 -> 1131960 (+0.01%); split: +0.05%, -0.04%
Instrs: 36755466 -> 36782290 (+0.07%); split: -0.04%, +0.11%
CodeSize: 200812068 -> 200915348 (+0.05%); split: -0.04%, +0.09%
VGPRs: 2163980 -> 2163828 (-0.01%); split: -0.15%, +0.14%
Latency: 484174459 -> 484341018 (+0.03%); split: -0.06%, +0.09%
InvThroughput: 87941284 -> 87944874 (+0.00%); split: -0.04%, +0.04%
VClause: 652984 -> 653085 (+0.02%); split: -0.09%, +0.10%
SClause: 1510995 -> 1528832 (+1.18%); split: -0.40%, +1.58%
Copies: 1997689 -> 2001857 (+0.21%); split: -0.49%, +0.69%
Branches: 676629 -> 676584 (-0.01%); split: -0.02%, +0.01%
PreSGPRs: 2033070 -> 2036725 (+0.18%)
PreVGPRs: 1903922 -> 1903897 (-0.00%)

Signed-off-by: Rhys Perry <[email protected]>
Reviewed-by: Daniel Schürmann <[email protected]>
Fixes: 3730be9873d ("aco: mostly implement FS input loads on GFX11")
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19370>

---

 src/amd/compiler/aco_instruction_selection.cpp | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index d90c99549a9..352f793b11e 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -5319,14 +5319,17 @@ emit_interp_instr_gfx11(isel_context* ctx, unsigned idx, unsigned component, Tem
 
    Temp p = bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), 
bld.m0(prim_mask), idx, component);
 
+   Temp res;
    if (dst.regClass() == v2b) {
       Temp p10 =
          bld.vinterp_inreg(aco_opcode::v_interp_p10_f16_f32_inreg, 
bld.def(v1), p, coord1, p);
-      bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, 
Definition(dst), p, coord2, p10);
+      res = bld.vinterp_inreg(aco_opcode::v_interp_p2_f16_f32_inreg, 
bld.def(v1), p, coord2, p10);
    } else {
       Temp p10 = bld.vinterp_inreg(aco_opcode::v_interp_p10_f32_inreg, 
bld.def(v1), p, coord1, p);
-      bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, Definition(dst), p, 
coord2, p10);
+      res = bld.vinterp_inreg(aco_opcode::v_interp_p2_f32_inreg, bld.def(v1), 
p, coord2, p10);
    }
+   /* lds_param_load must be done in WQM, and the result kept valid for helper 
lanes. */
+   emit_wqm(bld, res, dst, true);
 }
 
 void
@@ -5385,7 +5388,10 @@ emit_interp_mov_instr(isel_context* ctx, unsigned idx, unsigned component, unsig
       //TODO: this doesn't work in quad-divergent control flow and ignores 
vertex_id
       Temp p = bld.ldsdir(aco_opcode::lds_param_load, bld.def(v1), 
bld.m0(prim_mask), idx, component);
       uint16_t dpp_ctrl = dpp_quad_perm(0, 0, 0, 0);
-      bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(dst), p, dpp_ctrl);
+      Temp res = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p, dpp_ctrl);
+
+      /* lds_param_load must be done in WQM, and the result kept valid for 
helper lanes. */
+      emit_wqm(bld, res, dst, true);
    } else {
       bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), 
Operand::c32(vertex_id),
                  bld.m0(prim_mask), idx, component);

Reply via email to