Module: Mesa
Branch: main
Commit: b7cd0eb439904f500b700657bb31572ed86c43b4
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=b7cd0eb439904f500b700657bb31572ed86c43b4

Author: Georg Lehmann <[email protected]>
Date:   Sun Feb 19 16:49:02 2023 +0100

aco: use v_permlane(x)16_b32 for masked swizzle

Should be cheaper than ds_swizzle.

Totals from 8 (0.01% of 134913) affected shaders:
CodeSize: 16316 -> 16388 (+0.44%)
Instrs: 3088 -> 3086 (-0.06%)
Latency: 49558 -> 49508 (-0.10%)
InvThroughput: 9180 -> 9198 (+0.20%)
Copies: 376 -> 384 (+2.13%)

Reviewed-by: Rhys Perry <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21412>

---

 src/amd/compiler/aco_instruction_selection.cpp | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 1f4547d1433..1de943bde4f 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -249,6 +249,10 @@ emit_masked_swizzle(isel_context* ctx, Builder& bld, Temp src, unsigned mask)
 
       uint16_t dpp_ctrl = 0xffff;
 
+      /* DPP16 before DPP8 before v_permlane(x)16_b32
+       * because DPP16 supports modifiers and v_permlane
+       * can't be folded into valu instructions.
+       */
       if (and_mask == 0x1f && or_mask < 4 && xor_mask < 4) {
          unsigned res[4] = {0, 1, 2, 3};
          for (unsigned i = 0; i < 4; i++)
@@ -262,12 +266,22 @@ emit_masked_swizzle(isel_context* ctx, Builder& bld, Temp src, unsigned mask)
          dpp_ctrl = dpp_row_half_mirror;
       } else if (ctx->options->gfx_level >= GFX10 && (and_mask & 0x18) == 0x18 
&& or_mask < 8 &&
                  xor_mask < 8) {
-         // DPP8 comes last, as it does not allow several modifiers like `abs` 
that are available with DPP16
          Builder::Result ret = bld.vop1_dpp8(aco_opcode::v_mov_b32, 
bld.def(v1), src);
          for (unsigned i = 0; i < 8; i++) {
             ret->dpp8().lane_sel[i] = (((i & and_mask) | or_mask) ^ xor_mask) 
& 0x7;
          }
          return ret;
+      } else if (ctx->options->gfx_level >= GFX10 && (and_mask & 0x10) == 0x10 
&& or_mask < 0x10) {
+         uint64_t lane_mask = 0;
+         for (unsigned i = 0; i < 16; i++)
+            lane_mask |= uint64_t(((i & and_mask) | or_mask) ^ (xor_mask & 
0xf)) << i * 4;
+         aco_opcode opcode =
+            xor_mask & 0x10 ? aco_opcode::v_permlanex16_b32 : 
aco_opcode::v_permlane16_b32;
+         Temp op1 = bld.copy(bld.def(s1), Operand::c32(lane_mask & 
0xffffffff));
+         Temp op2 = bld.copy(bld.def(s1), Operand::c32(lane_mask >> 32));
+         Builder::Result ret = bld.vop3(opcode, bld.def(v1), src, op1, op2);
+         ret->vop3().opsel = 0x3; /* set BOUND_CTRL/FETCH_INACTIVE */
+         return ret;
       }
 
       if (dpp_ctrl != 0xffff)

Reply via email to