Mesa (main): aco/gfx11: allow true 16-bit instructions to access v128+

GitLab Mirror Tue, 10 Jan 2023 08:29:58 -0800

Module: Mesa
Branch: main
Commit: 6872f8d861b80377c8bbff2f2fad0de14bc5e8b3
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=6872f8d861b80377c8bbff2f2fad0de14bc5e8b3


Author: Rhys Perry <[email protected]>
Date:   Fri Nov 11 19:58:45 2022 +0000

aco/gfx11: allow true 16-bit instructions to access v128+

It looks like the LLVM assembler promotes true 16-bit instructions to VOP3
when they access v128 or higher, so do the same in ACO's assembler.

No fossil-db changes.

Signed-off-by: Rhys Perry <[email protected]>
Reviewed-by: Georg Lehmann <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/20251>

---

 src/amd/compiler/aco_assembler.cpp        | 114 ++++++++++++++++++++++++------
 src/amd/compiler/aco_ir.cpp               | 106 +++++++++++++++++++++++++++
 src/amd/compiler/aco_ir.h                 |   1 +
 src/amd/compiler/tests/test_assembler.cpp |  86 ++++++++++++++++++++++
 4 files changed, 286 insertions(+), 21 deletions(-)

diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp
index bb4ae1bcad7..8d78351fd65 100644
--- a/src/amd/compiler/aco_assembler.cpp
+++ b/src/amd/compiler/aco_assembler.cpp
@@ -98,8 +98,30 @@ reg(asm_context& ctx, Definition def, unsigned width = 32)
    return reg(ctx, def.physReg()) & BITFIELD_MASK(width);
 }
 
+bool
+needs_vop3_gfx11(asm_context& ctx, Instruction* instr, Operand *dpp_op)
+{
+   if (ctx.gfx_level <= GFX10_3)
+      return false;
+
+   uint8_t mask = get_gfx11_true16_mask(instr->opcode);
+   if (!mask)
+      return false;
+
+   u_foreach_bit (i, mask & 0x3) {
+      if (i == 0 && dpp_op && dpp_op->physReg().reg() >= (256 + 128))
+         return true;
+      if (instr->operands[i].physReg().reg() >= (256 + 128))
+         return true;
+   }
+   if ((mask & 0x8) && instr->definitions[0].physReg().reg() >= (256 + 128))
+      return true;
+   return false;
+}
+
 void
-emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* 
instr)
+emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* 
instr,
+                 Operand *dpp_op_ptr = NULL, DPP16_instruction *dpp16_ptr = 
NULL)
 {
    /* lower remaining pseudo-instructions */
    if (instr->opcode == aco_opcode::p_constaddr_getpc) {
@@ -298,30 +320,80 @@ emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* inst
       return;
    }
    case Format::VOP2: {
-      uint32_t encoding = 0;
-      encoding |= opcode << 25;
-      encoding |= reg(ctx, instr->definitions[0], 8) << 17;
-      encoding |= reg(ctx, instr->operands[1], 8) << 9;
-      encoding |= reg(ctx, instr->operands[0]);
-      out.push_back(encoding);
+      if (needs_vop3_gfx11(ctx, instr, dpp_op_ptr)) {
+         if (instr->opcode == aco_opcode::v_fmaak_f16) {
+            opcode = ctx.opcode[(int)aco_opcode::v_fma_f16];
+         } else if (instr->opcode == aco_opcode::v_fmamk_f16) {
+            std::swap(instr->operands[1], instr->operands[2]);
+            opcode = ctx.opcode[(int)aco_opcode::v_fma_f16];
+         } else {
+            opcode += 0x100;
+         }
+
+         uint32_t encoding = (0b110101 << 26);
+         encoding |= opcode << 16;
+         encoding |= reg(ctx, instr->definitions[0], 8);
+         encoding |= dpp16_ptr ? (dpp16_ptr->abs[0] << 8) | (dpp16_ptr->abs[1] 
<< 9) : 0;
+         out.push_back(encoding);
+
+         encoding = reg(ctx, instr->operands[0]);
+         encoding |= reg(ctx, instr->operands[1]) << 9;
+         if (instr->opcode == aco_opcode::v_fmaak_f16 ||
+             instr->opcode == aco_opcode::v_fmamk_f16)
+            encoding |= reg(ctx, instr->operands[2]) << 18;
+         encoding |= dpp16_ptr ? (dpp16_ptr->neg[0] << 29) | 
(dpp16_ptr->neg[1] << 30) : 0;
+         out.push_back(encoding);
+      } else {
+         uint32_t encoding = 0;
+         encoding |= opcode << 25;
+         encoding |= reg(ctx, instr->definitions[0], 8) << 17;
+         encoding |= reg(ctx, instr->operands[1], 8) << 9;
+         encoding |= reg(ctx, instr->operands[0]);
+         out.push_back(encoding);
+      }
       break;
    }
    case Format::VOP1: {
-      uint32_t encoding = (0b0111111 << 25);
-      if (!instr->definitions.empty())
-         encoding |= reg(ctx, instr->definitions[0], 8) << 17;
-      encoding |= opcode << 9;
-      if (!instr->operands.empty())
-         encoding |= reg(ctx, instr->operands[0]);
-      out.push_back(encoding);
+      if (needs_vop3_gfx11(ctx, instr, dpp_op_ptr)) {
+         uint32_t encoding = (0b110101 << 26);
+         encoding |= (opcode + 0x180) << 16;
+         encoding |= reg(ctx, instr->definitions[0], 8);
+         encoding |= dpp16_ptr ? dpp16_ptr->abs[0] << 8 : 0;
+         out.push_back(encoding);
+
+         encoding = reg(ctx, instr->operands[0]);
+         encoding |= dpp16_ptr ? dpp16_ptr->neg[0] << 29 : 0;
+         out.push_back(encoding);
+      } else {
+         uint32_t encoding = (0b0111111 << 25);
+         if (!instr->definitions.empty())
+            encoding |= reg(ctx, instr->definitions[0], 8) << 17;
+         encoding |= opcode << 9;
+         if (!instr->operands.empty())
+            encoding |= reg(ctx, instr->operands[0]);
+         out.push_back(encoding);
+      }
       break;
    }
    case Format::VOPC: {
-      uint32_t encoding = (0b0111110 << 25);
-      encoding |= opcode << 17;
-      encoding |= reg(ctx, instr->operands[1], 8) << 9;
-      encoding |= reg(ctx, instr->operands[0]);
-      out.push_back(encoding);
+      if (needs_vop3_gfx11(ctx, instr, dpp_op_ptr)) {
+         uint32_t encoding = (0b110101 << 26);
+         encoding |= opcode << 16;
+         encoding |= reg(ctx, instr->definitions[0], 8);
+         encoding |= dpp16_ptr ? (dpp16_ptr->abs[0] << 8) | (dpp16_ptr->abs[1] 
<< 9) : 0;
+         out.push_back(encoding);
+
+         encoding = reg(ctx, instr->operands[0]);
+         encoding |= reg(ctx, instr->operands[1]) << 9;
+         encoding |= dpp16_ptr ? (dpp16_ptr->neg[0] << 29) | 
(dpp16_ptr->neg[1] << 30) : 0;
+         out.push_back(encoding);
+      } else {
+         uint32_t encoding = (0b0111110 << 25);
+         encoding |= opcode << 17;
+         encoding |= reg(ctx, instr->operands[1], 8) << 9;
+         encoding |= reg(ctx, instr->operands[0]);
+         out.push_back(encoding);
+      }
       break;
    }
    case Format::VINTRP: {
@@ -802,7 +874,7 @@ emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* inst
          Operand dpp_op = instr->operands[0];
          instr->operands[0] = Operand(PhysReg{250}, v1);
          instr->format = (Format)((uint16_t)instr->format & 
~(uint16_t)Format::DPP16);
-         emit_instruction(ctx, out, instr);
+         emit_instruction(ctx, out, instr, &dpp_op, &dpp);
          uint32_t encoding = (0xF & dpp.row_mask) << 28;
          encoding |= (0xF & dpp.bank_mask) << 24;
          encoding |= dpp.abs[1] << 23;
@@ -824,7 +896,7 @@ emit_instruction(asm_context& ctx, std::vector<uint32_t>& out, Instruction* inst
          Operand dpp_op = instr->operands[0];
          instr->operands[0] = Operand(PhysReg{234}, v1);
          instr->format = (Format)((uint16_t)instr->format & 
~(uint16_t)Format::DPP8);
-         emit_instruction(ctx, out, instr);
+         emit_instruction(ctx, out, instr, &dpp_op);
          uint32_t encoding = reg(ctx, dpp_op, 8);
          for (unsigned i = 0; i < 8; ++i)
             encoding |= dpp.lane_sel[i] << (8 + i * 3);
diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp
index 44a975b6f4e..eb1daf4d3d1 100644
--- a/src/amd/compiler/aco_ir.cpp
+++ b/src/amd/compiler/aco_ir.cpp
@@ -533,6 +533,112 @@ instr_is_16bit(amd_gfx_level gfx_level, aco_opcode op)
    }
 }
 
+/* On GFX11, for some instructions, bit 7 of the destination/operand vgpr is opsel and the field
+ * only supports v0-v127.
+ */
+// TODO: take advantage of this functionality in the RA and assembler
+uint8_t
+get_gfx11_true16_mask(aco_opcode op)
+{
+   switch (op) {
+   case aco_opcode::v_ceil_f16:
+   case aco_opcode::v_cos_f16:
+   case aco_opcode::v_cvt_f16_i16:
+   case aco_opcode::v_cvt_f16_u16:
+   case aco_opcode::v_cvt_i16_f16:
+   case aco_opcode::v_cvt_u16_f16:
+   case aco_opcode::v_cvt_norm_i16_f16:
+   case aco_opcode::v_cvt_norm_u16_f16:
+   case aco_opcode::v_exp_f16:
+   case aco_opcode::v_floor_f16:
+   case aco_opcode::v_fract_f16:
+   case aco_opcode::v_frexp_exp_i16_f16:
+   case aco_opcode::v_frexp_mant_f16:
+   case aco_opcode::v_log_f16:
+   case aco_opcode::v_not_b16:
+   case aco_opcode::v_rcp_f16:
+   case aco_opcode::v_rndne_f16:
+   case aco_opcode::v_rsq_f16:
+   case aco_opcode::v_sin_f16:
+   case aco_opcode::v_sqrt_f16:
+   case aco_opcode::v_trunc_f16:
+   case aco_opcode::v_mov_b16: return 0x1 | 0x8;
+   case aco_opcode::v_add_f16:
+   case aco_opcode::v_fmaak_f16:
+   case aco_opcode::v_fmac_f16:
+   case aco_opcode::v_fmamk_f16:
+   case aco_opcode::v_ldexp_f16:
+   case aco_opcode::v_max_f16:
+   case aco_opcode::v_min_f16:
+   case aco_opcode::v_mul_f16:
+   case aco_opcode::v_sub_f16:
+   case aco_opcode::v_subrev_f16:
+   case aco_opcode::v_and_b16:
+   case aco_opcode::v_or_b16:
+   case aco_opcode::v_xor_b16: return 0x3 | 0x8;
+   case aco_opcode::v_cmp_class_f16:
+   case aco_opcode::v_cmpx_class_f16:
+   case aco_opcode::v_cvt_f32_f16:
+   case aco_opcode::v_cvt_i32_i16:
+   case aco_opcode::v_cvt_u32_u16: return 0x1;
+   case aco_opcode::v_cmp_eq_f16:
+   case aco_opcode::v_cmp_eq_i16:
+   case aco_opcode::v_cmp_eq_u16:
+   case aco_opcode::v_cmp_ge_f16:
+   case aco_opcode::v_cmp_ge_i16:
+   case aco_opcode::v_cmp_ge_u16:
+   case aco_opcode::v_cmp_gt_f16:
+   case aco_opcode::v_cmp_gt_i16:
+   case aco_opcode::v_cmp_gt_u16:
+   case aco_opcode::v_cmp_le_f16:
+   case aco_opcode::v_cmp_le_i16:
+   case aco_opcode::v_cmp_le_u16:
+   case aco_opcode::v_cmp_lg_f16:
+   case aco_opcode::v_cmp_lg_i16:
+   case aco_opcode::v_cmp_lg_u16:
+   case aco_opcode::v_cmp_lt_f16:
+   case aco_opcode::v_cmp_lt_i16:
+   case aco_opcode::v_cmp_lt_u16:
+   case aco_opcode::v_cmp_neq_f16:
+   case aco_opcode::v_cmp_nge_f16:
+   case aco_opcode::v_cmp_ngt_f16:
+   case aco_opcode::v_cmp_nle_f16:
+   case aco_opcode::v_cmp_nlg_f16:
+   case aco_opcode::v_cmp_nlt_f16:
+   case aco_opcode::v_cmp_o_f16:
+   case aco_opcode::v_cmp_u_f16:
+   case aco_opcode::v_cmpx_eq_f16:
+   case aco_opcode::v_cmpx_eq_i16:
+   case aco_opcode::v_cmpx_eq_u16:
+   case aco_opcode::v_cmpx_ge_f16:
+   case aco_opcode::v_cmpx_ge_i16:
+   case aco_opcode::v_cmpx_ge_u16:
+   case aco_opcode::v_cmpx_gt_f16:
+   case aco_opcode::v_cmpx_gt_i16:
+   case aco_opcode::v_cmpx_gt_u16:
+   case aco_opcode::v_cmpx_le_f16:
+   case aco_opcode::v_cmpx_le_i16:
+   case aco_opcode::v_cmpx_le_u16:
+   case aco_opcode::v_cmpx_lg_f16:
+   case aco_opcode::v_cmpx_lg_i16:
+   case aco_opcode::v_cmpx_lg_u16:
+   case aco_opcode::v_cmpx_lt_f16:
+   case aco_opcode::v_cmpx_lt_i16:
+   case aco_opcode::v_cmpx_lt_u16:
+   case aco_opcode::v_cmpx_neq_f16:
+   case aco_opcode::v_cmpx_nge_f16:
+   case aco_opcode::v_cmpx_ngt_f16:
+   case aco_opcode::v_cmpx_nle_f16:
+   case aco_opcode::v_cmpx_nlg_f16:
+   case aco_opcode::v_cmpx_nlt_f16:
+   case aco_opcode::v_cmpx_o_f16:
+   case aco_opcode::v_cmpx_u_f16: return 0x3;
+   case aco_opcode::v_cvt_f16_f32:
+   case aco_opcode::v_sat_pk_u8_i16: return 0x8;
+   default: return 0x0;
+   }
+}
+
 uint32_t
 get_reduction_identity(ReduceOp op, unsigned idx)
 {
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index cbdd300117c..baf64b04267 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -1872,6 +1872,7 @@ is_dead(const std::vector<uint16_t>& uses, const Instruction* instr)
 
 bool can_use_opsel(amd_gfx_level gfx_level, aco_opcode op, int idx);
 bool instr_is_16bit(amd_gfx_level gfx_level, aco_opcode op);
+uint8_t get_gfx11_true16_mask(aco_opcode op);
 bool can_use_SDWA(amd_gfx_level gfx_level, const aco_ptr<Instruction>& instr, 
bool pre_ra);
 bool can_use_DPP(const aco_ptr<Instruction>& instr, bool pre_ra, bool dpp8);
 /* updates "instr" and returns the old instruction (or NULL if no update was 
needed) */
diff --git a/src/amd/compiler/tests/test_assembler.cpp b/src/amd/compiler/tests/test_assembler.cpp
index 533f69c6ba7..4f4c29a60a7 100644
--- a/src/amd/compiler/tests/test_assembler.cpp
+++ b/src/amd/compiler/tests/test_assembler.cpp
@@ -807,4 +807,90 @@ BEGIN_TEST(assembler.gfx11.ldsdir)
 
    finish_assembler_test();
 END_TEST
+
+BEGIN_TEST(assembler.gfx11.vop12c_v128)
+   if (!setup_cs(NULL, GFX11))
+      return;
+
+   Definition dst_v0 = bld.def(v1);
+   dst_v0.setFixed(PhysReg(256));
+
+   Definition dst_v128 = bld.def(v1);
+   dst_v128.setFixed(PhysReg(256 + 128));
+
+   Operand op_v1(bld.tmp(v1));
+   op_v1.setFixed(PhysReg(256 + 1));
+
+   Operand op_v2(bld.tmp(v1));
+   op_v2.setFixed(PhysReg(256 + 2));
+
+   Operand op_v129(bld.tmp(v1));
+   op_v129.setFixed(PhysReg(256 + 129));
+
+   Operand op_v130(bld.tmp(v1));
+   op_v130.setFixed(PhysReg(256 + 130));
+
+   //>> BB0:
+   //! v_mul_f16_e32 v0, v1, v2 ; Error: VGPR_32_Lo128: unknown register 128 ; 
6a000501
+   bld.vop2(aco_opcode::v_mul_f16, dst_v0, op_v1, op_v2);
+
+   //! v_mul_f16_e64 v128, v1, v2                                  ; d5350080 
00020501
+   bld.vop2(aco_opcode::v_mul_f16, dst_v128, op_v1, op_v2);
+
+   //! v_mul_f16_e64 v0, v129, v2                                  ; d5350000 
00020581
+   bld.vop2(aco_opcode::v_mul_f16, dst_v0, op_v129, op_v2);
+
+   //! v_mul_f16_e64 v0, v1, v130                                  ; d5350000 
00030501
+   bld.vop2(aco_opcode::v_mul_f16, dst_v0, op_v1, op_v130);
+
+   //! v_rcp_f16_e64 v128, v1                                      ; d5d40080 
00000101
+   bld.vop1(aco_opcode::v_rcp_f16, dst_v128, op_v1);
+
+   //! v_cmp_eq_f16_e64 vcc, v129, v2                              ; d402006a 
00020581
+   bld.vopc(aco_opcode::v_cmp_eq_f16, bld.def(s2, vcc), op_v129, op_v2);
+
+   //! v_mul_f16_e64_dpp v128, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 fi:1 ; d5350080 000204fa ff0d2101
+   bld.vop2_dpp(aco_opcode::v_mul_f16, dst_v128, op_v1, op_v2, dpp_row_rr(1));
+
+   //! v_mul_f16_e64_dpp v0, v129, v2 row_ror:1 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 fi:1 ; d5350000 000204fa ff0d2181
+   bld.vop2_dpp(aco_opcode::v_mul_f16, dst_v0, op_v129, op_v2, dpp_row_rr(1));
+
+   //! v_mul_f16_e64_dpp v0, v1, v130 row_ror:1 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 fi:1 ; d5350000 000304fa ff0d2101
+   bld.vop2_dpp(aco_opcode::v_mul_f16, dst_v0, op_v1, op_v130, dpp_row_rr(1));
+
+   //! v_mul_f16_e64_dpp v128, v1, v2 dpp8:[0,0,0,0,0,0,0,0] fi:1  ; d5350080 
000204ea 00000001
+   bld.vop2_dpp8(aco_opcode::v_mul_f16, dst_v128, op_v1, op_v2);
+
+   //! v_mul_f16_e64_dpp v0, v129, v2 dpp8:[0,0,0,0,0,0,0,0] fi:1  ; d5350000 
000204ea 00000081
+   bld.vop2_dpp8(aco_opcode::v_mul_f16, dst_v0, op_v129, op_v2);
+
+   //! v_mul_f16_e64_dpp v0, v1, v130 dpp8:[0,0,0,0,0,0,0,0] fi:1  ; d5350000 
000304ea 00000001
+   bld.vop2_dpp8(aco_opcode::v_mul_f16, dst_v0, op_v1, op_v130);
+
+   //! v_fma_f16 v128, v1, v2, 0x60                                ; d6480080 
03fe0501 00000060
+   bld.vop2(aco_opcode::v_fmaak_f16, dst_v128, op_v1, op_v2, 
Operand::literal32(96));
+
+   //! v_fma_f16 v128, v1, 0x60, v2                                ; d6480080 
0409ff01 00000060
+   bld.vop2(aco_opcode::v_fmamk_f16, dst_v128, op_v1, op_v2, 
Operand::literal32(96));
+
+   //! v_rcp_f16_e64_dpp v128, -v1 row_ror:1 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 fi:1 ; d5d40080 200000fa ff1d2101
+   bld.vop1_dpp(aco_opcode::v_rcp_f16, dst_v128, op_v1, 
dpp_row_rr(1))->dpp16().neg[0] = true;
+
+   //! v_rcp_f16_e64_dpp v128, |v1| row_ror:1 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 fi:1 ; d5d40180 000000fa ff2d2101
+   bld.vop1_dpp(aco_opcode::v_rcp_f16, dst_v128, op_v1, 
dpp_row_rr(1))->dpp16().abs[0] = true;
+
+   //! v_mul_f16_e64_dpp v128, -v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 fi:1 ; d5350080 200204fa ff1d2101
+   bld.vop2_dpp(aco_opcode::v_mul_f16, dst_v128, op_v1, op_v2, 
dpp_row_rr(1))->dpp16().neg[0] = true;
+
+   //! v_mul_f16_e64_dpp v128, |v1|, v2 row_ror:1 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 fi:1 ; d5350180 000204fa ff2d2101
+   bld.vop2_dpp(aco_opcode::v_mul_f16, dst_v128, op_v1, op_v2, 
dpp_row_rr(1))->dpp16().abs[0] = true;
+
+   //! v_cmp_eq_f16_e64_dpp vcc, -v129, v2 row_ror:1 row_mask:0xf 
bank_mask:0xf bound_ctrl:1 fi:1 ; d402006a 200204fa ff1d2181
+   bld.vopc_dpp(aco_opcode::v_cmp_eq_f16, bld.def(s2, vcc), op_v129, op_v2, 
dpp_row_rr(1))->dpp16().neg[0] = true;
+
+   //! v_cmp_eq_f16_e64_dpp vcc, |v129|, v2 row_ror:1 row_mask:0xf 
bank_mask:0xf bound_ctrl:1 fi:1 ; d402016a 000204fa ff2d2181
+   bld.vopc_dpp(aco_opcode::v_cmp_eq_f16, bld.def(s2, vcc), op_v129, op_v2, 
dpp_row_rr(1))->dpp16().abs[0] = true;
+
+   finish_assembler_test();
+END_TEST
 #endif

Mesa (main): aco/gfx11: allow true 16-bit instructions to access v128+

Reply via email to