Module: Mesa
Branch: main
Commit: 9b6ab40b3be459985227007ca6681a3a9579e90f
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=9b6ab40b3be459985227007ca6681a3a9579e90f
Author: Rhys Perry <[email protected]> Date: Mon Nov 28 19:18:32 2022 +0000 aco: improve do_pack_2x16() with zero constants We can skip the v_or_b32 or use an instruction smaller than v_alignbyte_b32. Signed-off-by: Rhys Perry <[email protected]> Reviewed-by: Daniel Schürmann <[email protected]> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19933> --- src/amd/compiler/aco_lower_to_hw_instr.cpp | 14 ++++---- src/amd/compiler/tests/test_to_hw_instr.cpp | 52 ++++++++++++++++++++++------- 2 files changed, 48 insertions(+), 18 deletions(-) diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp index 95c8d77ffc3..5d4ae43b71c 100644 --- a/src/amd/compiler/aco_lower_to_hw_instr.cpp +++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp @@ -1451,8 +1451,8 @@ do_pack_2x16(lower_context* ctx, Builder& bld, Definition def, Operand lo, Opera /* a single alignbyte can be sufficient: hi can be a 32-bit integer constant */ if (lo.physReg().byte() == 2 && hi.physReg().byte() == 0 && - (!hi.isConstant() || !Operand::c32(hi.constantValue()).isLiteral() || - ctx->program->gfx_level >= GFX10)) { + (!hi.isConstant() || (hi.constantValue() && (!Operand::c32(hi.constantValue()).isLiteral() || + ctx->program->gfx_level >= GFX10)))) { if (hi.isConstant()) bld.vop3(aco_opcode::v_alignbyte_b32, def, Operand::c32(hi.constantValue()), lo, Operand::c32(2u)); @@ -1470,8 +1470,9 @@ do_pack_2x16(lower_context* ctx, Builder& bld, Definition def, Operand lo, Opera bld.vop2(aco_opcode::v_lshlrev_b32, def_hi, Operand::c32(16u), hi); else bld.vop2(aco_opcode::v_and_b32, def_hi, Operand::c32(~0xFFFFu), hi); - bld.vop2(aco_opcode::v_or_b32, def, Operand::c32(lo.constantValue()), - Operand(def.physReg(), v1)); + if (lo.constantValue()) + bld.vop2(aco_opcode::v_or_b32, def, Operand::c32(lo.constantValue()), + Operand(def.physReg(), v1)); return; } if (hi.isConstant()) { @@ -1482,8 +1483,9 @@ do_pack_2x16(lower_context* ctx, Builder& bld, 
Definition def, Operand lo, Opera bld.vop1(aco_opcode::v_cvt_u32_u16, def, lo); else bld.vop2(aco_opcode::v_and_b32, def_lo, Operand::c32(0xFFFFu), lo); - bld.vop2(aco_opcode::v_or_b32, def, Operand::c32(hi.constantValue() << 16u), - Operand(def.physReg(), v1)); + if (hi.constantValue()) + bld.vop2(aco_opcode::v_or_b32, def, Operand::c32(hi.constantValue() << 16u), + Operand(def.physReg(), v1)); return; } diff --git a/src/amd/compiler/tests/test_to_hw_instr.cpp b/src/amd/compiler/tests/test_to_hw_instr.cpp index 91d049e691e..9d2a27201ee 100644 --- a/src/amd/compiler/tests/test_to_hw_instr.cpp +++ b/src/amd/compiler/tests/test_to_hw_instr.cpp @@ -841,26 +841,54 @@ BEGIN_TEST(to_hw_instr.swap_linear_vgpr) finish_to_hw_instr_test(); END_TEST -BEGIN_TEST(to_hw_instr.pack2x16_alignbyte_constant) +BEGIN_TEST(to_hw_instr.pack2x16_constant) PhysReg v0_lo{256}; PhysReg v0_hi{256}; + PhysReg v1_lo{257}; PhysReg v1_hi{257}; v0_hi.reg_b += 2; v1_hi.reg_b += 2; - if (!setup_cs(NULL, GFX10)) - return; + for (amd_gfx_level lvl : {GFX10, GFX11}) { + if (!setup_cs(NULL, lvl)) + continue; - /* prevent usage of v_pack_b32_f16 */ - program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush; + /* prevent usage of v_pack_b32_f16 */ + program->blocks[0].fp_mode.denorm16_64 = fp_denorm_flush; - //>> p_unit_test 0 - //! v1: %_:v[0] = v_alignbyte_b32 0x3800, %_:v[1][16:32], 2 - bld.pseudo(aco_opcode::p_unit_test, Operand::zero()); - bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b), - Operand(v1_hi, v2b), Operand::c16(0x3800)); + //>> p_unit_test 0 + //! v1: %_:v[0] = v_alignbyte_b32 0x3800, %_:v[1][16:32], 2 + bld.pseudo(aco_opcode::p_unit_test, Operand::zero()); + bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b), + Operand(v1_hi, v2b), Operand::c16(0x3800)); - //! s_endpgm + //! p_unit_test 1 + //! 
v2b: %_:v[0][0:16] = v_lshrrev_b32 16, %_:v[1][16:32] + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(1)); + bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b), + Operand(v1_hi, v2b), Operand::zero(2)); - finish_to_hw_instr_test(); + //! p_unit_test 2 + //~gfx10! v2b: %_:v[0][0:16] = v_and_b32 0xffff, %_:v[1][0:16] + //~gfx11! v1: %_:v[0] = v_cvt_u32_u16 %_:v[1][0:16] + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(2)); + bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b), + Operand(v1_lo, v2b), Operand::zero(2)); + + //! p_unit_test 3 + //! v2b: %_:v[0][16:32] = v_and_b32 0xffff0000, %_:v[1][16:32] + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(3)); + bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b), + Operand::zero(2), Operand(v1_hi, v2b)); + + //! p_unit_test 4 + //! v2b: %_:v[0][16:32] = v_lshlrev_b32 16, %_:v[1][0:16] + bld.pseudo(aco_opcode::p_unit_test, Operand::c32(4)); + bld.pseudo(aco_opcode::p_parallelcopy, Definition(v0_lo, v2b), Definition(v0_hi, v2b), + Operand::zero(2), Operand(v1_lo, v2b)); + + //! s_endpgm + + finish_to_hw_instr_test(); + } END_TEST
