From: Dhruv Chawla <dhr...@nvidia.com> This patch folds the following patterns: - umax (a, add (a, b)) -> [sum, ovf] = adds (a, b); !ovf ? sum : a - umin (a, add (a, b)) -> [sum, ovf] = adds (a, b); !ovf ? a : sum - umax (a, sub (a, b)) -> [diff, ovf] = subs (a, b); ovf ? diff : a - umin (a, sub (a, b)) -> [diff, ovf] = subs (a, b); ovf ? a : diff
Where ovf is the overflow flag. adds and subs are generated by generating a parallel compare+plus/minus which maps to the pattern add<mode>3_compareC. sub<mode>3_compareC is also created to have an equivalent pattern for the subs instruction. In the case of subs, the overflow flag represents underflow. This patch is a respin of the patch posted at https://gcc.gnu.org/pipermail/gcc-patches/2025-May/685021.html as per the suggestion to turn it into a target-specific transform by Richard Biener. Bootstrapped and regtested on aarch64-unknown-linux-gnu. Signed-off-by: Dhruv Chawla <dhr...@nvidia.com> PR middle-end/116815 gcc/ChangeLog: * config/aarch64/aarch64.md (sub<mode>3_compareC): New pattern. (*aarch64_plus_within_<optab><mode>3_<ovf_commutate>): Likewise. (*aarch64_minus_within_<optab><mode>3): Likewise. * config/aarch64/iterators.md (ovf_add_cmp): New code attribute. (ovf_sub_cmp): Likewise. (ovf_commutate): New iterator. (ovf_comm_opp): New int attribute. gcc/testsuite/ChangeLog: * gcc.target/aarch64/pr116815-1.c: New test. * gcc.target/aarch64/pr116815-2.c: Likewise. * gcc.target/aarch64/pr116815-3.c: Likewise. * gcc.target/aarch64/pr116815-4.c: Likewise. * gcc.target/aarch64/pr116815-5.c: Likewise. * gcc.target/aarch64/pr116815-6.c: Likewise. --- gcc/config/aarch64/aarch64.md | 76 +++++++++++ gcc/config/aarch64/iterators.md | 9 ++ gcc/testsuite/gcc.target/aarch64/pr116815-1.c | 120 ++++++++++++++++++ gcc/testsuite/gcc.target/aarch64/pr116815-2.c | 93 ++++++++++++++ gcc/testsuite/gcc.target/aarch64/pr116815-3.c | 62 +++++++++ gcc/testsuite/gcc.target/aarch64/pr116815-4.c | 48 +++++++ gcc/testsuite/gcc.target/aarch64/pr116815-5.c | 44 +++++++ gcc/testsuite/gcc.target/aarch64/pr116815-6.c | 60 +++++++++ 8 files changed, 512 insertions(+) create mode 100644 gcc/testsuite/gcc.target/aarch64/pr116815-1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/pr116815-2.c create mode 100644 gcc/testsuite/gcc.target/aarch64/pr116815-3.c create mode 100644 gcc/testsuite/gcc.target/aarch64/pr116815-4.c create mode 100644 gcc/testsuite/gcc.target/aarch64/pr116815-5.c create mode 100644 gcc/testsuite/gcc.target/aarch64/pr116815-6.c diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 8e431a10cb1..a55fe5c85d8 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -3715,6 +3715,20 @@ [(set_attr "type" "alus_sreg")] ) +;; An equivalent to add<mode>3_compareC +(define_insn "sub<mode>3_compareC" + [(set (reg:CC_C CC_REGNUM) + (compare:CC_C + (minus:GPI + (match_operand:GPI 1 "register_operand" "r") + (match_operand:GPI 2 "register_operand" "r")) + (match_dup 1))) + (set (match_operand:GPI 0 "register_operand" "=r") + (minus:GPI (match_dup 1) (match_dup 2)))] + "" + "subs\t%<w>0, %<w>1, %<w>2" +) + (define_peephole2 [(set (match_operand:GPI 0 "aarch64_general_reg") (minus:GPI (match_operand:GPI 1 "aarch64_reg_or_zero") @@ -4455,6 +4469,68 @@ [(set_attr "type" "<su>div")] ) +;; umax (a, add (a, b)) => [sum, ovf] = adds (a, b); !ovf ? sum : a +;; umin (a, add (a, b)) => [sum, ovf] = adds (a, b); !ovf ? a : sum +;; ... along with the commutative version of add (a, b) i.e. add (b, a). +(define_insn_and_split "*aarch64_plus_within_<optab><mode>3_<ovf_commutate>" + [(set (match_operand:GPI 0 "register_operand" "=r") + (UMAXMIN:GPI + (plus:GPI (match_operand:GPI 1 "register_operand" "r") + (match_operand:GPI 2 "register_operand" "r")) + (match_dup ovf_commutate))) + (clobber (match_scratch:GPI 3))] + "!TARGET_CSSC" + "#" + "&& !reload_completed" + [(parallel + [(set (reg:CC_C CC_REGNUM) + (compare:CC_C (plus:GPI (match_dup ovf_commutate) + (match_dup <ovf_comm_opp>)) + (match_dup ovf_commutate))) + (set (match_dup 3) (plus:GPI (match_dup ovf_commutate) + (match_dup <ovf_comm_opp>)))]) + (set (match_dup 0) + (if_then_else:GPI (<ovf_add_cmp> (reg:CC_C CC_REGNUM) + (const_int 0)) + (match_dup 3) + (match_dup ovf_commutate)))] + { + if (GET_CODE (operands[3]) == SCRATCH) + operands[3] = gen_reg_rtx (<MODE>mode); + } +) + +;; In this case the ovf represents an underflow. +;; umax (a, sub (a, b)) => [diff, ovf] = subs (a, b); ovf ? diff : a +;; umin (a, sub (a, b)) => [diff, ovf] = subs (a, b); ovf ? a : diff +(define_insn_and_split "*aarch64_minus_within_<optab><mode>3" + [(set (match_operand:GPI 0 "register_operand" "=r") + (UMAXMIN:GPI + (minus:GPI (match_operand:GPI 1 "register_operand" "r") + (match_operand:GPI 2 "register_operand" "r")) + (match_dup 1))) + (clobber (match_scratch:GPI 3))] + "!TARGET_CSSC" + "#" + "&& !reload_completed" + [(parallel + [(set (reg:CC_C CC_REGNUM) + (compare:CC_C (minus:GPI (match_dup 1) (match_dup 2)) + (match_dup 1))) + (set (match_dup 3) (minus:GPI (match_dup 1) (match_dup 2)))]) + ;; There is an inverse mapping here from CC_C -> CC as this requires the + ;; inverse of the comparison from the above pattern. + (set (match_dup 0) + (if_then_else:GPI (<ovf_sub_cmp> (reg:CC CC_REGNUM) + (const_int 0)) + (match_dup 3) + (match_dup 1)))] + { + if (GET_CODE (operands[3]) == SCRATCH) + operands[3] = gen_reg_rtx (<MODE>mode); + } +) + ;; ------------------------------------------------------------------- ;; Comparison insns ;; ------------------------------------------------------------------- diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index c3771d9402b..a29243247bd 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -2804,6 +2804,8 @@ (define_code_iterator FMAXMIN [smax smin]) +(define_code_iterator UMAXMIN [umax umin]) + ;; Signed and unsigned max operations. (define_code_iterator USMAX [smax umax]) @@ -3092,6 +3094,9 @@ (define_code_attr maxminand [(smax "bic") (smin "and")]) +(define_code_attr ovf_add_cmp [(umax "geu") (umin "ltu")]) +(define_code_attr ovf_sub_cmp [(umax "ltu") (umin "geu")]) + ;; MLA/MLS attributes. (define_code_attr as [(ss_plus "a") (ss_minus "s")]) @@ -5007,3 +5012,7 @@ (UNSPEC_F2CVT "f2cvt") (UNSPEC_F1CVTLT "f1cvtlt") (UNSPEC_F2CVTLT "f2cvtlt")]) + +;; Operand numbers for commutative operations +(define_int_iterator ovf_commutate [1 2]) +(define_int_attr ovf_comm_opp [(1 "2") (2 "1")]) diff --git a/gcc/testsuite/gcc.target/aarch64/pr116815-1.c b/gcc/testsuite/gcc.target/aarch64/pr116815-1.c new file mode 100644 index 00000000000..f3bdb794318 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/pr116815-1.c @@ -0,0 +1,120 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +/* PR middle-end/116815 */ + +/* Single-use tests. */ + +static inline unsigned __attribute__ ((always_inline)) +max (unsigned a, unsigned b) +{ + return a > b ? a : b; +} + +static inline unsigned __attribute__ ((always_inline)) +min (unsigned a, unsigned b) +{ + return a < b ? a : b; +} + +#define OPERATION(op, type, N, exp1, exp2) \ + unsigned u##op##type##N (unsigned a, unsigned b) { return op (exp1, exp2); } + +/* +** umaxadd1: +** adds (w[0-9]+), w0, w1 +** csel w0, \1, w0, cc +** ret +*/ +OPERATION (max, add, 1, a, a + b) + +/* +** umaxadd2: +** adds (w[0-9]+), w0, w1 +** csel w0, \1, w0, cc +** ret +*/ +OPERATION (max, add, 2, a, b + a) + +/* +** umaxadd3: +** adds (w[0-9]+), w0, w1 +** csel w0, \1, w0, cc +** ret +*/ +OPERATION (max, add, 3, a + b, a) + +/* +** umaxadd4: +** adds (w[0-9]+), w0, w1 +** csel w0, \1, w0, cc +** ret +*/ +OPERATION (max, add, 4, b + a, a) + +/* +** uminadd1: +** adds (w[0-9]+), w0, w1 +** csel w0, \1, w0, cs +** ret +*/ +OPERATION (min, add, 1, a, a + b) + +/* +** uminadd2: +** adds (w[0-9]+), w0, w1 +** csel w0, \1, w0, cs +** ret +*/ +OPERATION (min, add, 2, a, b + a) + +/* +** uminadd3: +** adds (w[0-9]+), w0, w1 +** csel w0, \1, w0, cs +** ret +*/ +OPERATION (min, add, 3, a + b, a) + +/* +** uminadd4: +** adds (w[0-9]+), w0, w1 +** csel w0, \1, w0, cs +** ret +*/ +OPERATION (min, add, 4, b + a, a) + +/* sub requires the inverse of the comparison from add. */ + +/* +** umaxsub1: +** subs (w[0-9]+), w0, w1 +** csel w0, \1, w0, cc +** ret +*/ +OPERATION (max, sub, 1, a, a - b) + +/* +** umaxsub2: +** subs (w[0-9]+), w0, w1 +** csel w0, \1, w0, cc +** ret +*/ +OPERATION (max, sub, 2, a - b, a) + +/* +** uminsub1: +** subs (w[0-9]+), w0, w1 +** csel w0, \1, w0, cs +** ret +*/ +OPERATION (min, sub, 1, a, a - b) + +/* +** uminsub2: +** subs (w[0-9]+), w0, w1 +** csel w0, \1, w0, cs +** ret +*/ +OPERATION (min, sub, 2, a - b, a) diff --git a/gcc/testsuite/gcc.target/aarch64/pr116815-2.c b/gcc/testsuite/gcc.target/aarch64/pr116815-2.c new file mode 100644 index 00000000000..f2dd7b0bbb3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/pr116815-2.c @@ -0,0 +1,93 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +/* PR middle-end/116815 */ + +/* Negative tests. */ + +static inline int __attribute__ ((always_inline)) +smax (int a, int b) +{ + return a > b ? a : b; +} + +static inline int __attribute__ ((always_inline)) +smin (int a, int b) +{ + return a < b ? a : b; +} + +static inline unsigned __attribute__ ((always_inline)) +umax (unsigned a, unsigned b) +{ + return a > b ? a : b; +} + +static inline unsigned __attribute__ ((always_inline)) +umin (unsigned a, unsigned b) +{ + return a < b ? a : b; +} + +#define ASSUME(cond) if (!(cond)) __builtin_unreachable (); + +/* This transformation does not trigger on signed types. */ + +int +smax_add (int a, int b) +{ + ASSUME (b >= 0); + return smax (a, a + b); +} + +int +smin_add (int a, int b) +{ + ASSUME (b >= 0); + return smin (a, a + b); +} + +int +smax_sub (int a, int b) +{ + ASSUME (b >= 0); + return smax (a, a - b); +} + +int +smin_sub (int a, int b) +{ + ASSUME (b >= 0); + return smin (a, a - b); +} + +/* Invalid patterns. */ + +/* This can potentially be matched, but the RHS gets factored to + (a + b) * b. */ +unsigned +umax_factored (unsigned a, unsigned b) +{ + return umax (a * b, a * b + b * b); +} + +unsigned +umin_mult (unsigned a, unsigned b) +{ + return umin (a, a * b); +} + +unsigned +umax_sub (unsigned a, unsigned b) +{ + return umax (a, b - a); +} + +unsigned +umin_sub (unsigned a, unsigned b) +{ + return umin (a, b - a); +} + +/* { dg-final { scan-assembler-not "adds\\t" } } */ +/* { dg-final { scan-assembler-not "subs\\t" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/pr116815-3.c b/gcc/testsuite/gcc.target/aarch64/pr116815-3.c new file mode 100644 index 00000000000..f8050377116 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/pr116815-3.c @@ -0,0 +1,62 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +/* PR middle-end/116815 */ + +/* Multi-use tests. */ + +static inline unsigned __attribute__ ((always_inline)) +max (unsigned a, unsigned b) +{ + return a > b ? a : b; +} + +static inline unsigned __attribute__ ((always_inline)) +min (unsigned a, unsigned b) +{ + return a < b ? a : b; +} + +/* FIXME: This should only generate one adds. */ + +/* +** umax_add_umin_add: +** adds (w[0-9]+), w0, w1 +** csel \1, \1, w0, cc +** adds (w[0-9]+), w1, w0 +** csel w0, \2, w1, cs +** add w0, \1, \2 +** ret +*/ +unsigned +umax_add_umin_add (unsigned a, unsigned b) +{ + return max (a, a + b) + min (a + b, b); +} + +/* +** umin_add_umax_add: +** adds (w[0-9]+), w0, w1 +** csel \1, \1, w0, cs +** adds (w[0-9]+), w1, w0 +** csel \2, \2, w1, cc +** add w0, \1, \2 +** ret +*/ +unsigned +umin_add_umax_add (unsigned a, unsigned b) +{ + return min (a, b + a) + max (b + a, b); +} + +/* FIXME: This pattern does not get optimized. */ + +unsigned +multiple_paths (unsigned a, unsigned b) +{ + if (a > 5) + return max (a, a + b); + else + return min (a, a + b); +} diff --git a/gcc/testsuite/gcc.target/aarch64/pr116815-4.c b/gcc/testsuite/gcc.target/aarch64/pr116815-4.c new file mode 100644 index 00000000000..c1be921334b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/pr116815-4.c @@ -0,0 +1,48 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +/* PR middle-end/116815 */ + +/* Single-use tests with a use of the min-max in an if-condition. */ + +static inline unsigned __attribute__ ((always_inline)) +max (unsigned a, unsigned b) +{ + return a > b ? a : b; +} + +static inline unsigned __attribute__ ((always_inline)) +min (unsigned a, unsigned b) +{ + return a < b ? a : b; +} + +#define OPERATION(op, type, N, exp1, exp2) \ + unsigned u##op##type##N (unsigned a, unsigned b, unsigned c, unsigned d, \ + unsigned e) \ + { \ + unsigned result = op (exp1, exp2); \ + if (result == c || result == c * 2) \ + return d; \ + else \ + return e; \ + } + +OPERATION (max, add, 1, a, a + b) +OPERATION (max, add, 2, a, b + a) +OPERATION (max, add, 3, a + b, a) +OPERATION (max, add, 4, b + a, a) + +OPERATION (min, add, 1, a, a + b) +OPERATION (min, add, 2, a, b + a) +OPERATION (min, add, 3, a + b, a) +OPERATION (min, add, 4, b + a, a) + +OPERATION (max, sub, 1, a, a - b) +OPERATION (max, sub, 2, a - b, a) + +OPERATION (min, sub, 1, a, a - b) +OPERATION (min, sub, 2, a - b, a) + +/* { dg-final { scan-assembler-times "adds\\t" 8 } } */ +/* { dg-final { scan-assembler-times "subs\\t" 4 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/pr116815-5.c b/gcc/testsuite/gcc.target/aarch64/pr116815-5.c new file mode 100644 index 00000000000..015c868aec2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/pr116815-5.c @@ -0,0 +1,44 @@ +/* { dg-do compile } */ +/* { dg-options "-O2" } */ + +#pragma GCC target "+cssc" + +/* PR middle-end/116815 */ + +/* Make sure that umax/umin instructions are generated with CSSC. */ + +static inline unsigned __attribute__ ((always_inline)) +max (unsigned a, unsigned b) +{ + return a > b ? a : b; +} + +static inline unsigned __attribute__ ((always_inline)) +min (unsigned a, unsigned b) +{ + return a < b ? a : b; +} + +#define OPERATION(op, type, N, exp1, exp2) \ + unsigned u##op##type##N (unsigned a, unsigned b) { return op (exp1, exp2); } + +OPERATION (max, add, 1, a, a + b) +OPERATION (max, add, 2, a, b + a) +OPERATION (max, add, 3, a + b, a) +OPERATION (max, add, 4, b + a, a) + +OPERATION (min, add, 1, a, a + b) +OPERATION (min, add, 2, a, b + a) +OPERATION (min, add, 3, a + b, a) +OPERATION (min, add, 4, b + a, a) + +OPERATION (max, sub, 1, a, a - b) +OPERATION (max, sub, 2, a - b, a) + +OPERATION (min, sub, 1, a, a - b) +OPERATION (min, sub, 2, a - b, a) + +/* { dg-final { scan-assembler-times "umax\\t" 6 } } */ +/* { dg-final { scan-assembler-times "umin\\t" 6 } } */ +/* { dg-final { scan-assembler-not "adds\\t" } } */ +/* { dg-final { scan-assembler-not "subs\\t" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/pr116815-6.c b/gcc/testsuite/gcc.target/aarch64/pr116815-6.c new file mode 100644 index 00000000000..2b5407bcfc2 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/pr116815-6.c @@ -0,0 +1,60 @@ +/* { dg-do run } */ +/* { dg-options "-O2" } */ + +/* PR middle-end/116815 */ + +/* Verify that the transformation gives correct results */ + +[[gnu::always_inline]] +inline unsigned min (unsigned a, unsigned b) +{ + return (a < b) ? a : b; +} + +[[gnu::always_inline]] +inline unsigned max (unsigned a, unsigned b) +{ + return (a > b) ? a : b; +} + +[[gnu::noipa]] unsigned +umaxadd (unsigned a, unsigned b) +{ + return max (a + b, a); +} + +[[gnu::noipa]] unsigned +umaxsub (unsigned a, unsigned b) +{ + return max (a - b, a); +} + +[[gnu::noipa]] unsigned +uminadd (unsigned a, unsigned b) +{ + return min (a + b, a); +} + +[[gnu::noipa]] unsigned +uminsub (unsigned a, unsigned b) +{ + return min (a - b, a); +} + +int +main () +{ + /* Overflows to 0x30000000. */ + if (umaxadd (0x90000000, 0xa0000000) != 0x90000000) + __builtin_abort (); + + if (uminadd (0x90000000, 0xa0000000) != 0x30000000) + __builtin_abort (); + + /* Underflows to 0x60000000. */ + if (umaxsub (0x00000000, 0xa0000000) != 0x60000000) + __builtin_abort (); + + if (uminsub (0x00000000, 0xa0000000) != 0x00000000) + __builtin_abort (); +} -- 2.44.0