Module: Mesa
Branch: main
Commit: 917cfd587c4a735816ab27884128c13396a526f3
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=917cfd587c4a735816ab27884128c13396a526f3

Author: Rhys Perry <[email protected]>
Date:   Wed Nov 16 17:42:20 2022 +0000

aco: use v_minmax/v_maxmin opcodes

fossil-db (gfx1100):
Totals from 29868 (22.12% of 135032) affected shaders:
MaxWaves: 741336 -> 741344 (+0.00%)
Instrs: 34624902 -> 34539766 (-0.25%); split: -0.25%, +0.00%
CodeSize: 187196804 -> 187192100 (-0.00%); split: -0.01%, +0.01%
VGPRs: 1816860 -> 1816788 (-0.00%); split: -0.01%, +0.01%
Latency: 502597202 -> 502245627 (-0.07%); split: -0.08%, +0.01%
InvThroughput: 84813176 -> 84586122 (-0.27%); split: -0.28%, +0.01%
VClause: 633826 -> 633749 (-0.01%); split: -0.02%, +0.01%
SClause: 1317738 -> 1317047 (-0.05%); split: -0.06%, +0.01%
Copies: 2130610 -> 2130954 (+0.02%); split: -0.03%, +0.05%
Branches: 766093 -> 765969 (-0.02%); split: -0.02%, +0.00%
PreSGPRs: 1630250 -> 1630034 (-0.01%); split: -0.02%, +0.00%
PreVGPRs: 1590777 -> 1590664 (-0.01%); split: -0.01%, +0.00%

Signed-off-by: Rhys Perry <[email protected]>
Reviewed-by: Daniel Schürmann <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19933>

---

 src/amd/compiler/aco_optimizer.cpp        | 87 ++++++++++++++++++++++++-------
 src/amd/compiler/tests/helpers.cpp        | 10 ++++
 src/amd/compiler/tests/helpers.h          |  2 +
 src/amd/compiler/tests/test_optimizer.cpp | 65 ++++++++++++++++++-----
 4 files changed, 133 insertions(+), 31 deletions(-)

diff --git a/src/amd/compiler/aco_optimizer.cpp b/src/amd/compiler/aco_optimizer.cpp
index 1902acee531..4b8d77a0806 100644
--- a/src/amd/compiler/aco_optimizer.cpp
+++ b/src/amd/compiler/aco_optimizer.cpp
@@ -2622,14 +2622,43 @@ combine_add_or_then_and_lshl(opt_ctx& ctx, aco_ptr<Instruction>& instr)
 }
 
 bool
-combine_minmax(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode opposite, aco_opcode minmax3)
+combine_minmax(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode opposite, aco_opcode op3src,
+               aco_opcode minmax)
 {
    /* TODO: this can handle SDWA min/max instructions by using opsel */
-   if (combine_three_valu_op(ctx, instr, instr->opcode, minmax3, "120", 1 | 2))
-      return true;
 
-   /* min(-max(a, b), c) -> min3(-a, -b, c) *
-    * max(-min(a, b), c) -> max3(-a, -b, c) */
+   /* min(min(a, b), c) -> min3(a, b, c)
+    * max(max(a, b), c) -> max3(a, b, c)
+    * gfx11: min(-min(a, b), c) -> maxmin(-a, -b, c)
+    * gfx11: max(-max(a, b), c) -> minmax(-a, -b, c)
+    */
+   for (unsigned swap = 0; swap < 2; swap++) {
+      Operand operands[3];
+      bool neg[3], abs[3], clamp, precise;
+      uint8_t opsel = 0, omod = 0;
+      bool inbetween_neg;
+      if (match_op3_for_vop3(ctx, instr->opcode, instr->opcode, instr.get(), swap, "120", operands,
+                             neg, abs, &opsel, &clamp, &omod, &inbetween_neg, NULL, NULL,
+                             &precise) &&
+          (!inbetween_neg ||
+           (minmax != aco_opcode::num_opcodes && ctx.program->gfx_level >= GFX11))) {
+         ctx.uses[instr->operands[swap].tempId()]--;
+         if (inbetween_neg) {
+            neg[0] = !neg[0];
+            neg[1] = !neg[1];
+            create_vop3_for_op3(ctx, minmax, instr, operands, neg, abs, opsel, clamp, omod);
+         } else {
+            create_vop3_for_op3(ctx, op3src, instr, operands, neg, abs, opsel, clamp, omod);
+         }
+         return true;
+      }
+   }
+
+   /* min(-max(a, b), c) -> min3(-a, -b, c)
+    * max(-min(a, b), c) -> max3(-a, -b, c)
+    * gfx11: min(max(a, b), c) -> maxmin(a, b, c)
+    * gfx11: max(min(a, b), c) -> minmax(a, b, c)
+    */
    for (unsigned swap = 0; swap < 2; swap++) {
       Operand operands[3];
       bool neg[3], abs[3], clamp, precise;
@@ -2637,11 +2666,16 @@ combine_minmax(opt_ctx& ctx, aco_ptr<Instruction>& instr, aco_opcode opposite, a
       bool inbetween_neg;
       if (match_op3_for_vop3(ctx, instr->opcode, opposite, instr.get(), swap, "120", operands, neg,
                              abs, &opsel, &clamp, &omod, &inbetween_neg, NULL, NULL, &precise) &&
-          inbetween_neg) {
+          (inbetween_neg ||
+           (minmax != aco_opcode::num_opcodes && ctx.program->gfx_level >= GFX11))) {
          ctx.uses[instr->operands[swap].tempId()]--;
-         neg[0] = !neg[0];
-         neg[1] = !neg[1];
-         create_vop3_for_op3(ctx, minmax3, instr, operands, neg, abs, opsel, clamp, omod);
+         if (inbetween_neg) {
+            neg[0] = !neg[0];
+            neg[1] = !neg[1];
+            create_vop3_for_op3(ctx, op3src, instr, operands, neg, abs, opsel, clamp, omod);
+         } else {
+            create_vop3_for_op3(ctx, minmax, instr, operands, neg, abs, opsel, clamp, omod);
+         }
          return true;
       }
    }
@@ -2959,7 +2993,7 @@ combine_add_bcnt(opt_ctx& ctx, aco_ptr<Instruction>& instr)
 
 bool
 get_minmax_info(aco_opcode op, aco_opcode* min, aco_opcode* max, aco_opcode* min3, aco_opcode* max3,
-                aco_opcode* med3, bool* some_gfx9_only)
+                aco_opcode* med3, aco_opcode* minmax, bool* some_gfx9_only)
 {
    switch (op) {
 #define MINMAX(type, gfx9)                                                     
                    \
@@ -2970,9 +3004,21 @@ get_minmax_info(aco_opcode op, aco_opcode* min, aco_opcode* max, aco_opcode* min
       *med3 = aco_opcode::v_med3_##type;                                       
                    \
       *min3 = aco_opcode::v_min3_##type;                                       
                    \
       *max3 = aco_opcode::v_max3_##type;                                       
                    \
+      *minmax = op == *min ? aco_opcode::v_maxmin_##type : 
aco_opcode::v_minmax_##type;            \
+      *some_gfx9_only = gfx9;                                                  
                    \
+      return true;
+#define MINMAX_INT16(type, gfx9)                                               
                    \
+   case aco_opcode::v_min_##type:                                              
                    \
+   case aco_opcode::v_max_##type:                                              
                    \
+      *min = aco_opcode::v_min_##type;                                         
                    \
+      *max = aco_opcode::v_max_##type;                                         
                    \
+      *med3 = aco_opcode::v_med3_##type;                                       
                    \
+      *min3 = aco_opcode::v_min3_##type;                                       
                    \
+      *max3 = aco_opcode::v_max3_##type;                                       
                    \
+      *minmax = aco_opcode::num_opcodes;                                       
                    \
       *some_gfx9_only = gfx9;                                                  
                    \
       return true;
-#define MINMAX_E64(type, gfx9)                                                 
                    \
+#define MINMAX_INT16_E64(type, gfx9)                                           
                    \
    case aco_opcode::v_min_##type##_e64:                                        
                    \
    case aco_opcode::v_max_##type##_e64:                                        
                    \
       *min = aco_opcode::v_min_##type##_e64;                                   
                    \
@@ -2980,17 +3026,19 @@ get_minmax_info(aco_opcode op, aco_opcode* min, aco_opcode* max, aco_opcode* min
       *med3 = aco_opcode::v_med3_##type;                                       
                    \
       *min3 = aco_opcode::v_min3_##type;                                       
                    \
       *max3 = aco_opcode::v_max3_##type;                                       
                    \
+      *minmax = aco_opcode::num_opcodes;                                       
                    \
       *some_gfx9_only = gfx9;                                                  
                    \
       return true;
       MINMAX(f32, false)
       MINMAX(u32, false)
       MINMAX(i32, false)
       MINMAX(f16, true)
-      MINMAX(u16, true)
-      MINMAX(i16, true)
-      MINMAX_E64(u16, true)
-      MINMAX_E64(i16, true)
-#undef MINMAX_E64
+      MINMAX_INT16(u16, true)
+      MINMAX_INT16(i16, true)
+      MINMAX_INT16_E64(u16, true)
+      MINMAX_INT16_E64(i16, true)
+#undef MINMAX_INT16_E64
+#undef MINMAX_INT16
 #undef MINMAX
    default: return false;
    }
@@ -4315,12 +4363,13 @@ combine_instruction(opt_ctx& ctx, aco_ptr<Instruction>& instr)
       ctx.mad_infos.emplace_back(nullptr, 0);
       ctx.info[instr->definitions[0].tempId()].set_mad(instr.get(), ctx.mad_infos.size() - 1);
    } else {
-      aco_opcode min, max, min3, max3, med3;
+      aco_opcode min, max, min3, max3, med3, minmax;
       bool some_gfx9_only;
-      if (get_minmax_info(instr->opcode, &min, &max, &min3, &max3, &med3, &some_gfx9_only) &&
+      if (get_minmax_info(instr->opcode, &min, &max, &min3, &max3, &med3, &minmax,
+                          &some_gfx9_only) &&
           (!some_gfx9_only || ctx.program->gfx_level >= GFX9)) {
          if (combine_minmax(ctx, instr, instr->opcode == min ? max : min,
-                            instr->opcode == min ? min3 : max3)) {
+                            instr->opcode == min ? min3 : max3, minmax)) {
          } else {
             combine_clamp(ctx, instr, min, max, med3);
          }
diff --git a/src/amd/compiler/tests/helpers.cpp b/src/amd/compiler/tests/helpers.cpp
index 211284bfb49..bfd9b723a77 100644
--- a/src/amd/compiler/tests/helpers.cpp
+++ b/src/amd/compiler/tests/helpers.cpp
@@ -345,6 +345,16 @@ Temp fsat(Temp src, Builder b)
                     Operand::c32(0x3f800000u), src);
 }
 
+Temp fmin(Temp src0, Temp src1, Builder b)
+{
+   return b.vop2(aco_opcode::v_min_f32, b.def(v1), src0, src1);
+}
+
+Temp fmax(Temp src0, Temp src1, Builder b)
+{
+   return b.vop2(aco_opcode::v_max_f32, b.def(v1), src0, src1);
+}
+
 Temp ext_ushort(Temp src, unsigned idx, Builder b)
 {
    return b.pseudo(aco_opcode::p_extract, b.def(src.regClass()), src, Operand::c32(idx),
diff --git a/src/amd/compiler/tests/helpers.h b/src/amd/compiler/tests/helpers.h
index 8f1f272fb6a..d1123ef2d96 100644
--- a/src/amd/compiler/tests/helpers.h
+++ b/src/amd/compiler/tests/helpers.h
@@ -102,6 +102,8 @@ aco::Temp fadd(aco::Temp src0, aco::Temp src1, aco::Builder b=bld);
 aco::Temp fmul(aco::Temp src0, aco::Temp src1, aco::Builder b=bld);
 aco::Temp fma(aco::Temp src0, aco::Temp src1, aco::Temp src2, aco::Builder b=bld);
 aco::Temp fsat(aco::Temp src, aco::Builder b=bld);
+aco::Temp fmin(aco::Temp src0, aco::Temp src1, aco::Builder b=bld);
+aco::Temp fmax(aco::Temp src0, aco::Temp src1, aco::Builder b=bld);
 aco::Temp ext_ushort(aco::Temp src, unsigned idx, aco::Builder b=bld);
 aco::Temp ext_ubyte(aco::Temp src, unsigned idx, aco::Builder b=bld);
 void emit_divergent_if_else(aco::Program* prog, aco::Builder& b, aco::Operand cond, std::function<void()> then,
diff --git a/src/amd/compiler/tests/test_optimizer.cpp b/src/amd/compiler/tests/test_optimizer.cpp
index 7b6fd01b5ef..86d8c2f5f05 100644
--- a/src/amd/compiler/tests/test_optimizer.cpp
+++ b/src/amd/compiler/tests/test_optimizer.cpp
@@ -713,23 +713,64 @@ BEGIN_TEST(optimize.add3)
 END_TEST
 
 BEGIN_TEST(optimize.minmax)
-   for (unsigned i = GFX9; i <= GFX10; i++) {
-      //>> v1: %a = p_startpgm
-      if (!setup_cs("v1", (amd_gfx_level)i))
+   for (unsigned i = GFX10_3; i <= GFX11; i++) {
+      //>> v1: %a, v1: %b, v1: %c = p_startpgm
+      if (!setup_cs("v1 v1 v1", (amd_gfx_level)i))
          continue;
 
-      //! v1: %res0 = v_max3_f32 -0, %a, 0
+      Temp a = inputs[0];
+      Temp b = inputs[1];
+      Temp c = inputs[2];
+
+      //! v1: %res0 = v_min3_f32 %a, %b, %c
       //! p_unit_test 0, %res0
-      Temp xor0 = fneg(inputs[0]);
-      Temp min = bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand::zero(), xor0);
-      Temp xor1 = fneg(min);
-      writeout(0, bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::zero(), xor1));
+      writeout(0, fmin(c, fmin(a, b)));
 
-      //! v1: %res1 = v_max3_f32 -0, -%a, 0
+      //! v1: %res1 = v_max3_f32 %a, %b, %c
       //! p_unit_test 1, %res1
-      min = bld.vop2(aco_opcode::v_min_f32, bld.def(v1), Operand::zero(), Operand(inputs[0]));
-      xor1 = fneg(min);
-      writeout(1, bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::zero(), xor1));
+      writeout(1, fmax(c, fmax(a, b)));
+
+      //! v1: %res2 = v_min3_f32 -%a, -%b, %c
+      //! p_unit_test 2, %res2
+      writeout(2, fmin(c, fneg(fmax(a, b))));
+
+      //! v1: %res3 = v_max3_f32 -%a, -%b, %c
+      //! p_unit_test 3, %res3
+      writeout(3, fmax(c, fneg(fmin(a, b))));
+
+      //! v1: %res4 = v_max3_f32 -%a, %b, %c
+      //! p_unit_test 4, %res4
+      writeout(4, fmax(c, fneg(fmin(a, fneg(b)))));
+
+      //~gfx10_3! v1: %res5_tmp = v_max_f32 %a, %b
+      //~gfx10_3! v1: %res5 = v_min_f32 %c, %res5_tmp
+      //~gfx11! v1: %res5 = v_maxmin_f32 %a, %b, %c
+      //! p_unit_test 5, %res5
+      writeout(5, fmin(c, fmax(a, b)));
+
+      //~gfx10_3! v1: %res6_tmp = v_min_f32 %a, %b
+      //~gfx10_3! v1: %res6 = v_max_f32 %c, %res6_tmp
+      //~gfx11! v1: %res6 = v_minmax_f32 %a, %b, %c
+      //! p_unit_test 6, %res6
+      writeout(6, fmax(c, fmin(a, b)));
+
+      //~gfx10_3! v1: %res7_tmp = v_min_f32 %a, %b
+      //~gfx10_3! v1: %res7 = v_min_f32 %c, -%res7_tmp
+      //~gfx11! v1: %res7 = v_maxmin_f32 -%a, -%b, %c
+      //! p_unit_test 7, %res7
+      writeout(7, fmin(c, fneg(fmin(a, b))));
+
+      //~gfx10_3! v1: %res8_tmp = v_max_f32 %a, %b
+      //~gfx10_3! v1: %res8 = v_max_f32 %c, -%res8_tmp
+      //~gfx11! v1: %res8 = v_minmax_f32 -%a, -%b, %c
+      //! p_unit_test 8, %res8
+      writeout(8, fmax(c, fneg(fmax(a, b))));
+
+      //~gfx10_3! v1: %res9_tmp = v_max_f32 %a, -%b
+      //~gfx10_3! v1: %res9 = v_max_f32 %c, -%res9_tmp
+      //~gfx11! v1: %res9 = v_minmax_f32 -%a, %b, %c
+      //! p_unit_test 9, %res9
+      writeout(9, fmax(c, fneg(fmax(a, fneg(b)))));
 
       finish_opt_test();
    }

Reply via email to