Module: Mesa
Branch: main
Commit: 917cfd587c4a735816ab27884128c13396a526f3
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=917cfd587c4a735816ab27884128c13396a526f3
Author: Rhys Perry <[email protected]> Date: Wed Nov 16 17:42:20 2022 +0000 aco: use v_minmax/v_maxmin opcodes fossil-db (gfx1100): Totals from 29868 (22.12% of 135032) affected shaders: MaxWaves: 741336 -> 741344 (+0.00%) Instrs: 34624902 -> 34539766 (-0.25%); split: -0.25%, +0.00% CodeSize: 187196804 -> 187192100 (-0.00%); split: -0.01%, +0.01% VGPRs: 1816860 -> 1816788 (-0.00%); split: -0.01%, +0.01% Latency: 502597202 -> 502245627 (-0.07%); split: -0.08%, +0.01% InvThroughput: 84813176 -> 84586122 (-0.27%); split: -0.28%, +0.01% VClause: 633826 -> 633749 (-0.01%); split: -0.02%, +0.01% SClause: 1317738 -> 1317047 (-0.05%); split: -0.06%, +0.01% Copies: 2130610 -> 2130954 (+0.02%); split: -0.03%, +0.05% Branches: 766093 -> 765969 (-0.02%); split: -0.02%, +0.00% PreSGPRs: 1630250 -> 1630034 (-0.01%); split: -0.02%, +0.00% PreVGPRs: 1590777 -> 1590664 (-0.01%); split: -0.01%, +0.00% Signed-off-by: Rhys Perry <[email protected]> Reviewed-by: Daniel Schürmann <[email protected]> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19933> --- src/amd/compiler/aco_optimizer.cpp | 87 ++++++++++++++++++++++++------- src/amd/compiler/tests/helpers.cpp | 10 ++++ src/amd/compiler/tests/helpers.h | 2 + src/amd/compiler/tests/test_optimizer.cpp | 65 ++++++++++++++++++----- 4 files changed, 133 insertions(+), 31 deletions(-) diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp index 1902acee531..4b8d77a0806 100644 --- a/src/amd/compiler/aco_optimizer.cpp +++ b/src/amd/compiler/aco_optimizer.cpp @@ -2622,14 +2622,43 @@ combine_add_or_then_and_lshl(opt_ctx& ctx, aco_ptr<Instruction>& instr) } bool -combine_minmax(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode opposite, aco_opcode minmax3) +combine_minmax(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode opposite, aco_opcode op3src, + aco_opcode minmax) { /* TODO: this can handle SDWA min/max instructions by using opsel */ - if (combine_three_valu_op(ctx, instr, 
instr->opcode, minmax3, "120", 1 | 2)) - return true; - /* min(-max(a, b), c) -> min3(-a, -b, c) * - * max(-min(a, b), c) -> max3(-a, -b, c) */ + /* min(min(a, b), c) -> min3(a, b, c) + * max(max(a, b), c) -> max3(a, b, c) + * gfx11: min(-min(a, b), c) -> maxmin(-a, -b, c) + * gfx11: max(-max(a, b), c) -> minmax(-a, -b, c) + */ + for (unsigned swap = 0; swap < 2; swap++) { + Operand operands[3]; + bool neg[3], abs[3], clamp, precise; + uint8_t opsel = 0, omod = 0; + bool inbetween_neg; + if (match_op3_for_vop3(ctx, instr->opcode, instr->opcode, instr.get(), swap, "120", operands, + neg, abs, &opsel, &clamp, &omod, &inbetween_neg, NULL, NULL, + &precise) && + (!inbetween_neg || + (minmax != aco_opcode::num_opcodes && ctx.program->gfx_level >= GFX11))) { + ctx.uses[instr->operands[swap].tempId()]--; + if (inbetween_neg) { + neg[0] = !neg[0]; + neg[1] = !neg[1]; + create_vop3_for_op3(ctx, minmax, instr, operands, neg, abs, opsel, clamp, omod); + } else { + create_vop3_for_op3(ctx, op3src, instr, operands, neg, abs, opsel, clamp, omod); + } + return true; + } + } + + /* min(-max(a, b), c) -> min3(-a, -b, c) + * max(-min(a, b), c) -> max3(-a, -b, c) + * gfx11: min(max(a, b), c) -> maxmin(a, b, c) + * gfx11: max(min(a, b), c) -> minmax(a, b, c) + */ for (unsigned swap = 0; swap < 2; swap++) { Operand operands[3]; bool neg[3], abs[3], clamp, precise; @@ -2637,11 +2666,16 @@ combine_minmax(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode opposite, a bool inbetween_neg; if (match_op3_for_vop3(ctx, instr->opcode, opposite, instr.get(), swap, "120", operands, neg, abs, &opsel, &clamp, &omod, &inbetween_neg, NULL, NULL, &precise) && - inbetween_neg) { + (inbetween_neg || + (minmax != aco_opcode::num_opcodes && ctx.program->gfx_level >= GFX11))) { ctx.uses[instr->operands[swap].tempId()]--; - neg[0] = !neg[0]; - neg[1] = !neg[1]; - create_vop3_for_op3(ctx, minmax3, instr, operands, neg, abs, opsel, clamp, omod); + if (inbetween_neg) { + neg[0] = !neg[0]; + neg[1] = 
!neg[1]; + create_vop3_for_op3(ctx, op3src, instr, operands, neg, abs, opsel, clamp, omod); + } else { + create_vop3_for_op3(ctx, minmax, instr, operands, neg, abs, opsel, clamp, omod); + } return true; } } @@ -2959,7 +2993,7 @@ combine_add_bcnt(opt_ctx& ctx, aco_ptr<Instruction>& instr) bool get_minmax_info(aco_opcode op, aco_opcode* min, aco_opcode* max, aco_opcode* min3, aco_opcode* max3, - aco_opcode* med3, bool* some_gfx9_only) + aco_opcode* med3, aco_opcode* minmax, bool* some_gfx9_only) { switch (op) { #define MINMAX(type, gfx9) \ @@ -2970,9 +3004,21 @@ get_minmax_info(aco_opcode op, aco_opcode* min, aco_opcode* max, aco_opcode* min *med3 = aco_opcode::v_med3_##type; \ *min3 = aco_opcode::v_min3_##type; \ *max3 = aco_opcode::v_max3_##type; \ + *minmax = op == *min ? aco_opcode::v_maxmin_##type : aco_opcode::v_minmax_##type; \ + *some_gfx9_only = gfx9; \ + return true; +#define MINMAX_INT16(type, gfx9) \ + case aco_opcode::v_min_##type: \ + case aco_opcode::v_max_##type: \ + *min = aco_opcode::v_min_##type; \ + *max = aco_opcode::v_max_##type; \ + *med3 = aco_opcode::v_med3_##type; \ + *min3 = aco_opcode::v_min3_##type; \ + *max3 = aco_opcode::v_max3_##type; \ + *minmax = aco_opcode::num_opcodes; \ *some_gfx9_only = gfx9; \ return true; -#define MINMAX_E64(type, gfx9) \ +#define MINMAX_INT16_E64(type, gfx9) \ case aco_opcode::v_min_##type##_e64: \ case aco_opcode::v_max_##type##_e64: \ *min = aco_opcode::v_min_##type##_e64; \ @@ -2980,17 +3026,19 @@ get_minmax_info(aco_opcode op, aco_opcode* min, aco_opcode* max, aco_opcode* min *med3 = aco_opcode::v_med3_##type; \ *min3 = aco_opcode::v_min3_##type; \ *max3 = aco_opcode::v_max3_##type; \ + *minmax = aco_opcode::num_opcodes; \ *some_gfx9_only = gfx9; \ return true; MINMAX(f32, false) MINMAX(u32, false) MINMAX(i32, false) MINMAX(f16, true) - MINMAX(u16, true) - MINMAX(i16, true) - MINMAX_E64(u16, true) - MINMAX_E64(i16, true) -#undef MINMAX_E64 + MINMAX_INT16(u16, true) + MINMAX_INT16(i16, true) + 
MINMAX_INT16_E64(u16, true) + MINMAX_INT16_E64(i16, true) +#undef MINMAX_INT16_E64 +#undef MINMAX_INT16 #undef MINMAX default: return false; } @@ -4315,12 +4363,13 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr) ctx.mad_infos.emplace_back(nullptr, 0); ctx.info[instr->definitions[0].tempId()].set_mad(instr.get(), ctx.mad_infos.size() - 1); } else { - aco_opcode min, max, min3, max3, med3; + aco_opcode min, max, min3, max3, med3, minmax; bool some_gfx9_only; - if (get_minmax_info(instr->opcode, &min, &max, &min3, &max3, &med3, &some_gfx9_only) && + if (get_minmax_info(instr->opcode, &min, &max, &min3, &max3, &med3, &minmax, + &some_gfx9_only) && (!some_gfx9_only || ctx.program->gfx_level >= GFX9)) { if (combine_minmax(ctx, instr, instr->opcode == min ? max : min, - instr->opcode == min ? min3 : max3)) { + instr->opcode == min ? min3 : max3, minmax)) { } else { combine_clamp(ctx, instr, min, max, med3); } diff --git a/src/amd/compiler/tests/helpers.cpp b/src/amd/compiler/tests/helpers.cpp index 211284bfb49..bfd9b723a77 100644 --- a/src/amd/compiler/tests/helpers.cpp +++ b/src/amd/compiler/tests/helpers.cpp @@ -345,6 +345,16 @@ Temp fsat(Temp src, Builder b) Operand::c32(0x3f800000u), src); } +Temp fmin(Temp src0, Temp src1, Builder b) +{ + return b.vop2(aco_opcode::v_min_f32, b.def(v1), src0, src1); +} + +Temp fmax(Temp src0, Temp src1, Builder b) +{ + return b.vop2(aco_opcode::v_max_f32, b.def(v1), src0, src1); +} + Temp ext_ushort(Temp src, unsigned idx, Builder b) { return b.pseudo(aco_opcode::p_extract, b.def(src.regClass()), src, Operand::c32(idx), diff --git a/src/amd/compiler/tests/helpers.h b/src/amd/compiler/tests/helpers.h index 8f1f272fb6a..d1123ef2d96 100644 --- a/src/amd/compiler/tests/helpers.h +++ b/src/amd/compiler/tests/helpers.h @@ -102,6 +102,8 @@ aco::Temp fadd(aco::Temp src0, aco::Temp src1, aco::Builder b=bld); aco::Temp fmul(aco::Temp src0, aco::Temp src1, aco::Builder b=bld); aco::Temp fma(aco::Temp src0, aco::Temp src1, 
aco::Temp src2, aco::Builder b=bld); aco::Temp fsat(aco::Temp src, aco::Builder b=bld); +aco::Temp fmin(aco::Temp src0, aco::Temp src1, aco::Builder b=bld); +aco::Temp fmax(aco::Temp src0, aco::Temp src1, aco::Builder b=bld); aco::Temp ext_ushort(aco::Temp src, unsigned idx, aco::Builder b=bld); aco::Temp ext_ubyte(aco::Temp src, unsigned idx, aco::Builder b=bld); void emit_divergent_if_else(aco::Program* prog, aco::Builder& b, aco::Operand cond, std::function<void()> then, diff --git a/src/amd/compiler/tests/test_optimizer.cpp b/src/amd/compiler/tests/test_optimizer.cpp index 7b6fd01b5ef..86d8c2f5f05 100644 --- a/src/amd/compiler/tests/test_optimizer.cpp +++ b/src/amd/compiler/tests/test_optimizer.cpp @@ -713,23 +713,64 @@ BEGIN_TEST(optimize.add3) END_TEST BEGIN_TEST(optimize.minmax) - for (unsigned i = GFX9; i <= GFX10; i++) { - //>> v1: %a = p_startpgm - if (!setup_cs("v1", (amd_gfx_level)i)) + for (unsigned i = GFX10_3; i <= GFX11; i++) { + //>> v1: %a, v1: %b, v1: %c = p_startpgm + if (!setup_cs("v1 v1 v1", (amd_gfx_level)i)) continue; - //! v1: %res0 = v_max3_f32 -0, %a, 0 + Temp a = inputs[0]; + Temp b = inputs[1]; + Temp c = inputs[2]; + + //! v1: %res0 = v_min3_f32 %a, %b, %c //! p_unit_test 0, %res0 - Temp xor0 = fneg(inputs[0]); - Temp min = bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand::zero(), xor0); - Temp xor1 = fneg(min); - writeout(0, bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::zero(), xor1)); + writeout(0, fmin(c, fmin(a, b))); - //! v1: %res1 = v_max3_f32 -0, -%a, 0 + //! v1: %res1 = v_max3_f32 %a, %b, %c //! p_unit_test 1, %res1 - min = bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand::zero(), Operand(inputs[0])); - xor1 = fneg(min); - writeout(1, bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::zero(), xor1)); + writeout(1, fmax(c, fmax(a, b))); + + //! v1: %res2 = v_min3_f32 -%a, -%b, %c + //! p_unit_test 2, %res2 + writeout(2, fmin(c, fneg(fmax(a, b)))); + + //! v1: %res3 = v_max3_f32 -%a, -%b, %c + //! 
p_unit_test 3, %res3 + writeout(3, fmax(c, fneg(fmin(a, b)))); + + //! v1: %res4 = v_max3_f32 -%a, %b, %c + //! p_unit_test 4, %res4 + writeout(4, fmax(c, fneg(fmin(a, fneg(b))))); + + //~gfx10_3! v1: %res5_tmp = v_max_f32 %a, %b + //~gfx10_3! v1: %res5 = v_min_f32 %c, %res5_tmp + //~gfx11! v1: %res5 = v_maxmin_f32 %a, %b, %c + //! p_unit_test 5, %res5 + writeout(5, fmin(c, fmax(a, b))); + + //~gfx10_3! v1: %res6_tmp = v_min_f32 %a, %b + //~gfx10_3! v1: %res6 = v_max_f32 %c, %res6_tmp + //~gfx11! v1: %res6 = v_minmax_f32 %a, %b, %c + //! p_unit_test 6, %res6 + writeout(6, fmax(c, fmin(a, b))); + + //~gfx10_3! v1: %res7_tmp = v_min_f32 %a, %b + //~gfx10_3! v1: %res7 = v_min_f32 %c, -%res7_tmp + //~gfx11! v1: %res7 = v_maxmin_f32 -%a, -%b, %c + //! p_unit_test 7, %res7 + writeout(7, fmin(c, fneg(fmin(a, b)))); + + //~gfx10_3! v1: %res8_tmp = v_max_f32 %a, %b + //~gfx10_3! v1: %res8 = v_max_f32 %c, -%res8_tmp + //~gfx11! v1: %res8 = v_minmax_f32 -%a, -%b, %c + //! p_unit_test 8, %res8 + writeout(8, fmax(c, fneg(fmax(a, b)))); + + //~gfx10_3! v1: %res9_tmp = v_max_f32 %a, -%b + //~gfx10_3! v1: %res9 = v_max_f32 %c, -%res9_tmp + //~gfx11! v1: %res9 = v_minmax_f32 -%a, %b, %c + //! p_unit_test 9, %res9 + writeout(9, fmax(c, fneg(fmax(a, fneg(b))))); finish_opt_test(); }
