Module: Mesa Branch: main Commit: f32dde29028a529c42431c387edade0886239aad URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=f32dde29028a529c42431c387edade0886239aad
Author: Rhys Perry <[email protected]> Date: Fri Oct 21 13:01:01 2022 +0100 aco: swap v_perm_b32 operands I misread the ISA doc and got the order wrong. Signed-off-by: Rhys Perry <[email protected]> Reviewed-by: Samuel Pitoiset <[email protected]> Fixes: dae1629778d ("aco: disable sdwa on gfx11") Fixes: e68e6c75ca1 ("aco: use v_perm_b32 to copy 0xff00/0x00ff/0xff/0x00") Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19223> --- src/amd/compiler/aco_lower_to_hw_instr.cpp | 15 +++---- src/amd/compiler/tests/test_to_hw_instr.cpp | 64 ++++++++++++++--------------- 2 files changed, 40 insertions(+), 39 deletions(-) diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index f3a8296ebfa..d9a46ddd4a1 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -1019,20 +1019,21 @@ get_intersection_mask(int a_start, int a_size, int b_start, int b_size) return u_bit_consecutive(intersection_start, intersection_end - intersection_start) & mask; } +/* src1 are bytes 0-3. dst/src0 are bytes 4-7. */ void -create_bperm(Builder& bld, uint8_t swiz[4], Definition dst, Operand src0, - Operand src1 = Operand(v1)) +create_bperm(Builder& bld, uint8_t swiz[4], Definition dst, Operand src1, + Operand src0 = Operand(v1)) { uint32_t swiz_packed = swiz[0] | ((uint32_t)swiz[1] << 8) | ((uint32_t)swiz[2] << 16) | ((uint32_t)swiz[3] << 24); dst = Definition(PhysReg(dst.physReg().reg()), v1); - if (!src0.isConstant()) - src0 = Operand(PhysReg(src0.physReg().reg()), v1); - if (src1.isUndefined()) - src1 = Operand(dst.physReg(), v1); - else if (!src1.isConstant()) + if (!src1.isConstant()) src1 = Operand(PhysReg(src1.physReg().reg()), v1); + if (src0.isUndefined()) + src0 = Operand(dst.physReg(), v1); + else if (!src0.isConstant()) + src0 = Operand(PhysReg(src0.physReg().reg()), v1); bld.vop3(aco_opcode::v_perm_b32, dst, src0, src1, Operand::c32(swiz_packed)); } diff --git a/src/amd/compiler/tests/test_to_hw_instr.cpp b/src/amd/compiler/tests/test_to_hw_instr.cpp index d120e2475dd..5e71d294a88 100644 --- a/src/amd/compiler/tests/test_to_hw_instr.cpp +++ b/src/amd/compiler/tests/test_to_hw_instr.cpp @@ -271,7 +271,7 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx[89]! v2b: %0:v[1][0:16] = v_mov_b32 %0:v[0][0:16] dst_sel:uword0 dst_preserve src0_sel:uword0 //~gfx[89]! v1b: %0:v[1][16:24] = v_mov_b32 %0:v[0][16:24] dst_sel:ubyte2 dst_preserve src0_sel:ubyte2 //~gfx11! v2b: %0:v[1][0:16] = v_add_u16_e64 %0:v[0][0:16], 0 - //~gfx11! v1: %0:v[1] = v_perm_b32 %0:v[0], %0:v[1], 0x7020504 + //~gfx11! v1: %0:v[1] = v_perm_b32 %0:v[1], %0:v[0], 0x7020504 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_b3, v1b), @@ -284,7 +284,7 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx(9|11)! v1: %0:v[0], v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0] //~gfx[89]! v1b: %0:v[1][8:16] = v_mov_b32 %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 //~gfx[89]! v2b: %0:v[1][16:32] = v_mov_b32 %0:v[0][16:32] dst_sel:uword1 dst_preserve src0_sel:uword1 - //~gfx11! v1: %0:v[1] = v_perm_b32 %0:v[0], %0:v[1], 0x7060104 + //~gfx11! v1: %0:v[1] = v_perm_b32 %0:v[1], %0:v[0], 0x7060104 //~gfx11! v2b: %0:v[1][16:32] = v_add_u16_e64 hi(%0:v[0][16:32]), 0 opsel_hi bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4u)); bld.pseudo(aco_opcode::p_parallelcopy, @@ -298,8 +298,8 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx(9|11)! v1: %0:v[1], v1: %0:v[0] = v_swap_b32 %0:v[0], %0:v[1] //~gfx[89]! v1b: %0:v[0][8:16] = v_mov_b32 %0:v[1][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte1 //~gfx[89]! v1b: %0:v[0][24:32] = v_mov_b32 %0:v[1][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 - //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[1], %0:v[0], 0x7060104 - //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[1], %0:v[0], 0x3060504 + //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0x7060104 + //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0x3060504 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(5u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1b), Definition(v0_hi, v1b), Definition(v1_lo, v1), @@ -337,7 +337,7 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx11! v2b: %0:v[0][0:16] = v_add_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) //~gfx11! v2b: %0:v[1][16:32] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) opsel_hi //~gfx11! v2b: %0:v[0][0:16] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) - //~gfx11! v1: %0:v[0] = v_perm_b32 0, %0:v[0], 0x5060704 + //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], 0, 0x5060704 //~gfx11! v2b: %0:v[0][0:16] = v_add_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) //~gfx11! v2b: %0:v[1][16:32] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) opsel_hi //~gfx11! v2b: %0:v[0][0:16] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) @@ -352,7 +352,7 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx8! v1: %0:v[1] = v_xor_b32 %0:v[1], %0:v[0] //~gfx(9|11)! v1: %0:v[0], v1: %0:v[1] = v_swap_b32 %0:v[1], %0:v[0] //~gfx[89]! v1b: %0:v[1][24:32] = v_mov_b32 %0:v[0][24:32] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 - //~gfx11! v1: %0:v[1] = v_perm_b32 %0:v[0], %0:v[1], 0x3060504 + //~gfx11! v1: %0:v[1] = v_perm_b32 %0:v[1], %0:v[0], 0x3060504 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v3b), Definition(v1_lo, v3b), Definition(v0_b3, v1b), @@ -365,7 +365,7 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx11! v2b: %0:v[0][16:32] = v_add_u16_e64 hi(%0:v[0][16:32]), %0:v[1][0:16] opsel_hi //~gfx11! v2b: %0:v[1][0:16] = v_sub_u16_e64 hi(%0:v[0][16:32]), %0:v[1][0:16] //~gfx11! v2b: %0:v[0][16:32] = v_sub_u16_e64 hi(%0:v[0][16:32]), %0:v[1][0:16] opsel_hi - //~gfx11! v1: %0:v[0] = v_perm_b32 0, %0:v[0], 0x5060704 + //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], 0, 0x5060704 //~gfx11! v2b: %0:v[0][16:32] = v_add_u16_e64 hi(%0:v[0][16:32]), %0:v[1][0:16] opsel_hi //~gfx11! v2b: %0:v[1][0:16] = v_sub_u16_e64 hi(%0:v[0][16:32]), %0:v[1][0:16] //~gfx11! v2b: %0:v[0][16:32] = v_sub_u16_e64 hi(%0:v[0][16:32]), %0:v[1][0:16] opsel_hi @@ -375,7 +375,7 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx11! v2b: %0:v[0][0:16] = v_add_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) //~gfx11! v2b: %0:v[1][16:32] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) opsel_hi //~gfx11! v2b: %0:v[0][0:16] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) - //~gfx11! v1: %0:v[0] = v_perm_b32 0, %0:v[0], 0x7040506 + //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], 0, 0x7040506 //~gfx11! v2b: %0:v[0][0:16] = v_add_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) //~gfx11! v2b: %0:v[1][16:32] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) opsel_hi //~gfx11! v2b: %0:v[0][0:16] = v_sub_u16_e64 %0:v[0][0:16], hi(%0:v[1][16:32]) @@ -396,7 +396,7 @@ BEGIN_TEST(to_hw_instr.swap_subdword) //~gfx[89]! v1b: %0:v[0][24:32] = v_xor_b32 %0:v[0][24:32], %0:v[0][8:16] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte1 //~gfx[89]! v1b: %0:v[0][8:16] = v_xor_b32 %0:v[0][24:32], %0:v[0][8:16] dst_sel:ubyte1 dst_preserve src0_sel:ubyte3 src1_sel:ubyte1 //~gfx[89]! v1b: %0:v[0][24:32] = v_xor_b32 %0:v[0][24:32], %0:v[0][8:16] dst_sel:ubyte3 dst_preserve src0_sel:ubyte3 src1_sel:ubyte1 - //~gfx11! v1: %0:v[0] = v_perm_b32 0, %0:v[0], 0x5060704 + //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], 0, 0x5060704 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_b1, v1b), Definition(v0_b3, v1b), @@ -487,7 +487,7 @@ BEGIN_TEST(to_hw_instr.subdword_constant) //! p_unit_test 9 //~gfx(9|10)! v1b: %_:v[0][8:16] = v_mov_b32 0 dst_sel:ubyte1 dst_preserve src0_sel:dword //~gfx(9|10)! v1b: %_:v[0][16:24] = v_mov_b32 56 dst_sel:ubyte2 dst_preserve src0_sel:dword - //~gfx11! v1: %_:v[0] = v_perm_b32 0, %_:v[0], 0x7060c04 + //~gfx11! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x7060c04 //~gfx11! v1: %_:v[0] = v_and_b32 0xff00ffff, %_:v[0] //~gfx11! v1: %_:v[0] = v_or_b32 0x380000, %_:v[0] bld.pseudo(aco_opcode::p_unit_test, Operand::c32(9u)); @@ -515,7 +515,7 @@ BEGIN_TEST(to_hw_instr.subdword_constant) //! p_unit_test 12 //! v1: %_:v[0] = v_mov_b32 0 //~gfx(9|10)! v1b: %_:v[1][0:8] = v_mov_b32 0 dst_sel:ubyte0 dst_preserve src0_sel:dword - //~gfx11! v1: %_:v[1] = v_perm_b32 0, %_:v[1], 0x706050c + //~gfx11! v1: %_:v[1] = v_perm_b32 %_:v[1], 0, 0x706050c bld.pseudo(aco_opcode::p_unit_test, Operand::c32(12u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1), Definition(v1_lo, v1b), Operand::zero(), Operand::zero(1)); @@ -530,7 +530,7 @@ BEGIN_TEST(to_hw_instr.subdword_constant) //>> p_unit_test 13 //~gfx9! v1: %_:v[0] = v_and_b32 0xffff0000, %_:v[0] //~gfx9! v1: %_:v[0] = v_or_b32 0xff, %_:v[0] - //~gfx(10|11)! v1: %_:v[0] = v_perm_b32 0, %_:v[0], 0x7060c0d + //~gfx(10|11)! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x7060c0d bld.pseudo(aco_opcode::p_unit_test, Operand::c32(13u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Operand::c16(0x00ff)); @@ -538,28 +538,28 @@ BEGIN_TEST(to_hw_instr.subdword_constant) //! p_unit_test 14 //~gfx9! v1: %_:v[0] = v_and_b32 0xffff, %_:v[0] //~gfx9! v1: %_:v[0] = v_or_b32 0xff000000, %_:v[0] - //~gfx(10|11)! v1: %_:v[0] = v_perm_b32 0, %_:v[0], 0xd0c0504 + //~gfx(10|11)! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0xd0c0504 bld.pseudo(aco_opcode::p_unit_test, Operand::c32(14u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_hi, v2b), Operand::c16(0xff00)); //! p_unit_test 15 //~gfx(9|10)! v2b: %_:v[0][0:16] = v_mov_b32 0 dst_sel:uword0 dst_preserve src0_sel:dword - //~gfx11! v1: %_:v[0] = v_perm_b32 0, %_:v[0], 0x7060c0c + //~gfx11! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x7060c0c bld.pseudo(aco_opcode::p_unit_test, Operand::c32(15u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Operand::zero(2)); //! p_unit_test 16 //~gfx(9|10)! v1b: %_:v[0][0:8] = v_mov_b32 -1 dst_sel:ubyte0 dst_preserve src0_sel:dword - //~gfx11! v1: %_:v[0] = v_perm_b32 0, %_:v[0], 0x706050d + //~gfx11! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x706050d bld.pseudo(aco_opcode::p_unit_test, Operand::c32(16u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1b), Operand::c8(0xff)); //! p_unit_test 17 //~gfx(9|10)! v1b: %_:v[0][0:8] = v_mov_b32 0 dst_sel:ubyte0 dst_preserve src0_sel:dword - //~gfx11! v1: %_:v[0] = v_perm_b32 0, %_:v[0], 0x706050c + //~gfx11! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x706050c bld.pseudo(aco_opcode::p_unit_test, Operand::c32(17u)); bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v1b), Operand::zero(1)); @@ -664,24 +664,24 @@ BEGIN_TEST(to_hw_instr.extract) bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4u)); //~gfx7.*! v2b: %_:v[0][0:16] = @v_bfe %_:v[1][0:16], 0, 8 //~gfx(8|9).*! v2b: %_:v[0][0:16] = v_mov_b32 %_:v[1][0:16] dst_sel:uword0 dst_preserve src0_sel:@byte(0) - //~gfx11_unsigned! v1: %_:v[0] = v_perm_b32 %_:v[1], %_:v[0], 0x7060c00 - //~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[1], %_:v[0], 0x7060000 - //~gfx11_signed! v1: %_:v[0] = v_perm_b32 0, %_:v[0], 0x7060a04 + //~gfx11_unsigned! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060c00 + //~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060000 + //~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x7060a04 EXT(0, 0) //~gfx(8|9).*! v2b: %_:v[0][0:16] = v_mov_b32 %_:v[1][16:32] dst_sel:uword0 dst_preserve src0_sel:@byte(2) - //~gfx11_unsigned! v1: %_:v[0] = v_perm_b32 %_:v[1], %_:v[0], 0x7060c02 - //~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[1], %_:v[0], 0x7060202 - //~gfx11_signed! v1: %_:v[0] = v_perm_b32 0, %_:v[0], 0x7060a04 + //~gfx11_unsigned! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060c02 + //~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060202 + //~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], 0, 0x7060a04 if (lvl != GFX7) EXT(0, 2) //~gfx7.*! v2b: %_:v[0][0:16] = @v_bfe %_:v[1][0:16], 8, 8 //~gfx(8|9).*! v2b: %_:v[0][0:16] = v_mov_b32 %_:v[1][0:16] dst_sel:uword0 dst_preserve src0_sel:@byte(1) - //~gfx11_unsigned! v1: %_:v[0] = v_perm_b32 %_:v[1], %_:v[0], 0x7060c01 - //~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[1], %_:v[0], 0x7060801 + //~gfx11_unsigned! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060c01 + //~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060801 EXT(1, 0) //~gfx(8|9).*! v2b: %_:v[0][0:16] = v_mov_b32 %_:v[1][16:32] dst_sel:uword0 dst_preserve src0_sel:@byte(3) - //~gfx11_unsigned! v1: %_:v[0] = v_perm_b32 %_:v[1], %_:v[0], 0x7060c03 - //~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[1], %_:v[0], 0x7060903 + //~gfx11_unsigned! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060c03 + //~gfx11_signed! v1: %_:v[0] = v_perm_b32 %_:v[0], %_:v[1], 0x7060903 if (lvl != GFX7) EXT(1, 2) @@ -715,12 +715,12 @@ BEGIN_TEST(to_hw_instr.insert) //~gfx7! v1: %0:v[0] = v_bfe_u32 %0:v[1], 0, 8 //~gfx7! v1: %0:v[0] = v_lshlrev_b32 8, %0:v[0] //~gfx(8|9)! v1: %0:v[0] = v_mov_b32 %0:v[1] dst_sel:ubyte1 src0_sel:dword - //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[1], %0:v[0], 0xc0c000c + //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0xc0c000c INS(1, 8) //~gfx7! v1: %0:v[0] = v_bfe_u32 %0:v[1], 0, 8 //~gfx7! v1: %0:v[0] = v_lshlrev_b32 16, %0:v[0] //~gfx(8|9)! v1: %0:v[0] = v_mov_b32 %0:v[1] dst_sel:ubyte2 src0_sel:dword - //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[1], %0:v[0], 0xc000c0c + //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0xc000c0c INS(2, 8) //! v1: %0:v[0] = v_lshlrev_b32 24, %0:v[1] INS(3, 8) @@ -762,18 +762,18 @@ BEGIN_TEST(to_hw_instr.insert) bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2u)); //~gfx7! v2b: %_:v[0][0:16] = v_bfe_u32 %_:v[1][0:16], 0, 8 //~gfx(8|9)! v2b: %0:v[0][0:16] = v_lshlrev_b32 0, %0:v[1][0:16] dst_sel:uword0 dst_preserve src0_sel:dword src1_sel:ubyte0 - //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[1], %0:v[0], 0x7060c00 + //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0x7060c00 INS(0, 0) //~gfx(8|9)! v2b: %0:v[0][16:32] = v_lshlrev_b32 0, %0:v[1][0:16] dst_sel:uword1 dst_preserve src0_sel:dword src1_sel:ubyte0 - //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[1], %0:v[0], 0xc000504 + //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0xc000504 if (lvl != GFX7) INS(0, 2) //~gfx7! v2b: %_:v[0][0:16] = v_lshlrev_b32 8, %_:v[1][0:16] //~gfx(8|9)! v2b: %0:v[0][0:16] = v_lshlrev_b32 8, %0:v[1][0:16] dst_sel:uword0 dst_preserve src0_sel:dword src1_sel:ubyte0 - //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[1], %0:v[0], 0x706000c + //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0x706000c INS(1, 0) //~gfx(8|9)! v2b: %0:v[0][16:32] = v_lshlrev_b32 8, %0:v[1][0:16] dst_sel:uword1 dst_preserve src0_sel:dword src1_sel:ubyte0 - //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[1], %0:v[0], 0xc0504 + //~gfx11! v1: %0:v[0] = v_perm_b32 %0:v[0], %0:v[1], 0xc0504 if (lvl != GFX7) INS(1, 2)
