sub(a, b)) [PR116815]

Andrew Pinski Wed, 23 Jul 2025 22:51:28 -0700

On Wed, Jul 23, 2025 at 10:16 PM <dhr...@nvidia.com> wrote:
>
> From: Dhruv Chawla <dhr...@nvidia.com>
>
> This patch folds the following patterns:
> - max (a, add (a, b)) -> [sum, ovf] = adds (a, b); !ovf ? sum : a
> - min (a, add (a, b)) -> [sum, ovf] = adds (a, b); !ovf ? a : sum
> - max (a, sub (a, b)) -> [sum, ovf] = subs (a, b); !ovf ? a : sum
> - min (a, sub (a, b)) -> [sum, ovf] = subs (a, b); !ovf ? sum : a
>
> Where ovf is the overflow flag. adds and subs are generated by
> generating a parallel compare+plus/minus which maps to the pattern
> add<mode>3_compareC. sub<mode>3_compareC is also created to have an
> equivalent pattern for the subs instruction.
>
> This patch is a respin of the patch posted at
> https://gcc.gnu.org/pipermail/gcc-patches/2025-May/685021.html as per
> the suggestion to turn it into a target-specific transform by Richard
> Biener.
>
> Bootstrapped and regtested on aarch64-unknown-linux-gnu.
>
> Signed-off-by: Dhruv Chawla <dhr...@nvidia.com>
>
>         PR middle-end/116815
>
> gcc/ChangeLog:
>
>         * config/aarch64/aarch64.md (sub<mode>3_compareC): New pattern.
>         (*aarch64_plus_within_<optab><mode>3_<ovf_commutate>): Likewise.
>         (*aarch64_minus_within_<optab><mode>3): Likewise.
>         * config/aarch64/iterators.md (ovf_add_cmp): New code attribute.
>         (ovf_sub_cmp): Likewise.
>         (ovf_commutate): New iterator.
>         (ovf_comm_opp): New int attribute.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/aarch64/pr116815-1.c: New test.
>         * gcc.target/aarch64/pr116815-2.c: Likewise.
>         * gcc.target/aarch64/pr116815-3.c: Likewise.
>         * gcc.target/aarch64/pr116815-4.c: Likewise.
>         * gcc.target/aarch64/pr116815-5.c: Likewise.
> ---
>  gcc/config/aarch64/aarch64.md                 |  73 +++++++++++
>  gcc/config/aarch64/iterators.md               |   9 ++
>  gcc/testsuite/gcc.target/aarch64/pr116815-1.c | 119 ++++++++++++++++++
>  gcc/testsuite/gcc.target/aarch64/pr116815-2.c |  94 ++++++++++++++
>  gcc/testsuite/gcc.target/aarch64/pr116815-3.c |  63 ++++++++++
>  gcc/testsuite/gcc.target/aarch64/pr116815-4.c |  49 ++++++++
>  gcc/testsuite/gcc.target/aarch64/pr116815-5.c |  45 +++++++
>  7 files changed, 452 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/pr116815-1.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/pr116815-2.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/pr116815-3.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/pr116815-4.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/pr116815-5.c
>
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index a4ae6859da0..c9f88c40473 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -3741,6 +3741,20 @@
>    [(set_attr "type" "alus_sreg")]
>  )
>
> +;; An equivalent to add<mode>3_compareC
> +(define_insn "sub<mode>3_compareC"
> +  [(set (reg:CC_C CC_REGNUM)
> +       (compare:CC_C
> +         (minus:GPI
> +           (match_operand:GPI 1 "register_operand" "r")
> +           (match_operand:GPI 2 "register_operand" "r"))
> +         (match_dup 1)))
> +   (set (match_operand:GPI 0 "register_operand" "=r")
> +       (minus:GPI (match_dup 1) (match_dup 2)))]
> +  ""
> +  "subs\t%<w>0, %<w>1, %<w>2"
> +)
> +
>  (define_peephole2
>    [(set (match_operand:GPI 0 "aarch64_general_reg")
>         (minus:GPI (match_operand:GPI 1 "aarch64_reg_or_zero")
> @@ -4481,6 +4495,65 @@
>    [(set_attr "type" "<su>div")]
>  )
>
> +;; umax (a, add (a, b)) => [sum, ovf] = adds (a, b); !ovf ? sum : a
> +;; umin (a, add (a, b)) => [sum, ovf] = adds (a, b); !ovf ? a : sum
> +;; ... along with the commutative version of add (a, b) i.e. add (b, a)
> +(define_insn_and_split "*aarch64_plus_within_<optab><mode>3_<ovf_commutate>"
> +  [(set (match_operand:GPI 0 "register_operand")
> +       (UMAXMIN:GPI
> +         (plus:GPI (match_operand:GPI 1 "register_operand")
> +                   (match_operand:GPI 2 "register_operand"))
> +         (match_dup <ovf_commutate>)))
> +   (clobber (match_scratch:GPI 3))]
> +  "!TARGET_CSSC"
> +  "#"
> +  "&& !reload_completed"


Since this is a define_insn_and_split, I think there should be
constraints and not just predicates on it. I don't think you can
depend on it being split before RA (though I could be wrong) or
matching post RA.

> +  [(parallel
> +      [(set (reg:CC_C CC_REGNUM)
> +           (compare:CC_C (plus:GPI (match_dup ovf_commutate)

I am thinking you are missing "<>" around ovf_commutate here and other
places below. Or did I misunderstand how this works?

> +                                   (match_dup <ovf_comm_opp>))
> +                         (match_dup ovf_commutate)))
> +       (set (match_dup 3) (plus:GPI (match_dup ovf_commutate)
> +                                   (match_dup <ovf_comm_opp>)))])
> +   (set (match_dup 0)
> +       (if_then_else:GPI (<ovf_add_cmp> (reg:CC CC_REGNUM)
> +                                           (const_int 0))
> +                         (match_dup 3)
> +                         (match_dup ovf_commutate)))]
> +  {
> +    if (GET_CODE (operands[3]) == SCRATCH)
> +      operands[3] = gen_reg_rtx (<MODE>mode);
> +  }
> +)
> +
> +;; umax (a, sub (a, b)) => [sum, ovf] = subs (a, b); !ovf ? a : sum
> +;; umin (a, sub (a, b)) => [sum, ovf] = subs (a, b); !ovf ? sum : a
> +(define_insn_and_split "*aarch64_minus_within_<optab><mode>3"
> +  [(set (match_operand:GPI 0 "register_operand")
> +       (UMAXMIN:GPI
> +         (minus:GPI (match_operand:GPI 1 "register_operand")
> +                    (match_operand:GPI 2 "register_operand"))
> +         (match_dup 1)))
> +   (clobber (match_scratch:GPI 3))]
> +  "!TARGET_CSSC"
> +  "#"
> +  "&& !reload_completed"
> +  [(parallel
> +      [(set (reg:CC_C CC_REGNUM)
> +           (compare:CC_C (minus:GPI (match_dup 1) (match_dup 2))
> +                         (match_dup 1)))
> +       (set (match_dup 3) (minus:GPI (match_dup 1) (match_dup 2)))])
> +   (set (match_dup 0)
> +           (if_then_else:GPI (<ovf_sub_cmp> (reg:CC CC_REGNUM)
> +                                               (const_int 0))
> +                             (match_dup 3)
> +                             (match_dup 1)))]
> +  {
> +    if (GET_CODE (operands[3]) == SCRATCH)
> +      operands[3] = gen_reg_rtx (<MODE>mode);
> +  }
> +)
Same comments as above dealing with pre/post RA.

Thanks,
Andrew

> +
>  ;; -------------------------------------------------------------------
>  ;; Comparison insns
>  ;; -------------------------------------------------------------------
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index 795c4ac7a57..430c49cc6c9 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -2795,6 +2795,8 @@
>
>  (define_code_iterator FMAXMIN [smax smin])
>
> +(define_code_iterator UMAXMIN [umax umin])
> +
>  ;; Signed and unsigned max operations.
>  (define_code_iterator USMAX [smax umax])
>
> @@ -3087,6 +3089,13 @@
>
>  (define_code_attr maxminand [(smax "bic") (smin "and")])
>
> +(define_code_attr ovf_add_cmp [(umax "geu") (umin "ltu")])
> +(define_code_attr ovf_sub_cmp [(umax "ltu") (umin "geu")])
> +
> +;; Operand numbers for commutative operations
> +(define_int_iterator ovf_commutate [1 2])
> +(define_int_attr ovf_comm_opp [(1 "2") (2 "1")])
> +
>  ;; MLA/MLS attributes.
>  (define_code_attr as [(ss_plus "a") (ss_minus "s")])
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr116815-1.c 
> b/gcc/testsuite/gcc.target/aarch64/pr116815-1.c
> new file mode 100644
> index 00000000000..375c5f7c8b3
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr116815-1.c
> @@ -0,0 +1,119 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
> +
> +/* PR middle-end/116815 */
> +
> +/* Single-use tests.  */
> +
> +static inline unsigned
> +max (unsigned a, unsigned b)
> +{
> +  return a > b ? a : b;
> +}
> +
> +static inline unsigned
> +min (unsigned a, unsigned b)
> +{
> +  return a < b ? a : b;
> +}

Just a small comment here about the testcase, I would mark these as
always_inline; yes the inliner heuristics should inline these always
but this testcase is not testing that.

> +
> +#define OPERATION(op, type, N, exp1, exp2)                                   
>   \
> +  unsigned u##op##type##N (unsigned a, unsigned b) { return op (exp1, exp2); 
> }
> +
> +/*
> +** umaxadd1:
> +**     adds    (w[0-9]+), w0, w1
> +**     csel    w0, \1, w0, cs
> +**     ret
> +*/
> +OPERATION (max, add, 1, a, a + b)
> +
> +/*
> +** umaxadd2:
> +**     adds    (w[0-9]+), w0, w1
> +**     csel    w0, \1, w0, cs
> +**     ret
> +*/
> +OPERATION (max, add, 2, a, b + a)
> +
> +/*
> +** umaxadd3:
> +**     adds    (w[0-9]+), w0, w1
> +**     csel    w0, \1, w0, cs
> +**     ret
> +*/
> +OPERATION (max, add, 3, a + b, a)
> +
> +/*
> +** umaxadd4:
> +**     adds    (w[0-9]+), w0, w1
> +**     csel    w0, \1, w0, cs
> +**     ret
> +*/
> +OPERATION (max, add, 4, b + a, a)
> +
> +/*
> +** uminadd1:
> +**     adds    (w[0-9]+), w0, w1
> +**     csel    w0, \1, w0, cc
> +**     ret
> +*/
> +OPERATION (min, add, 1, a, a + b)
> +
> +/*
> +** uminadd2:
> +**     adds    (w[0-9]+), w0, w1
> +**     csel    w0, \1, w0, cc
> +**     ret
> +*/
> +OPERATION (min, add, 2, a, b + a)
> +
> +/*
> +** uminadd3:
> +**     adds    (w[0-9]+), w0, w1
> +**     csel    w0, \1, w0, cc
> +**     ret
> +*/
> +OPERATION (min, add, 3, a + b, a)
> +
> +/*
> +** uminadd4:
> +**     adds    (w[0-9]+), w0, w1
> +**     csel    w0, \1, w0, cc
> +**     ret
> +*/
> +OPERATION (min, add, 4, b + a, a)
> +
> +/*
> +** umaxsub1:
> +**     subs    (w[0-9]+), w0, w1
> +**     csel    w0, \1, w0, cc
> +**     ret
> +*/
> +OPERATION (max, sub, 1, a, a - b)
> +
> +/*
> +** umaxsub2:
> +**     subs    (w[0-9]+), w0, w1
> +**     csel    w0, \1, w0, cc
> +**     ret
> +*/
> +OPERATION (max, sub, 2, a - b, a)
> +
> +/*
> +** uminsub1:
> +**     subs    (w[0-9]+), w0, w1
> +**     csel    w0, \1, w0, cs
> +**     ret
> +*/
> +OPERATION (min, sub, 1, a, a - b)
> +
> +/*
> +** uminsub2:
> +**     subs    (w[0-9]+), w0, w1
> +**     csel    w0, \1, w0, cs
> +**     ret
> +*/
> +OPERATION (min, sub, 2, a - b, a)
> +
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr116815-2.c 
> b/gcc/testsuite/gcc.target/aarch64/pr116815-2.c
> new file mode 100644
> index 00000000000..c5e8df2aa34
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr116815-2.c
> @@ -0,0 +1,94 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +/* PR middle-end/116815 */
> +
> +/* Negative tests.  */
> +
> +static inline int
> +smax (int a, int b)
> +{
> +  return a > b ? a : b;
> +}
> +
> +static inline int
> +smin (int a, int b)
> +{
> +  return a < b ? a : b;
> +}
> +
> +static inline unsigned
> +umax (unsigned a, unsigned b)
> +{
> +  return a > b ? a : b;
> +}
> +
> +static inline unsigned
> +umin (unsigned a, unsigned b)
> +{
> +  return a < b ? a : b;
> +}
> +
> +#define ASSUME(cond) if (!(cond)) __builtin_unreachable ();
> +
> +/* This transformation does not trigger on signed types.  */
> +
> +int
> +smax_add (int a, int b)
> +{
> +  ASSUME (b >= 0);
> +  return smax (a, a + b);
> +}
> +
> +int
> +smin_add (int a, int b)
> +{
> +  ASSUME (b >= 0);
> +  return smin (a, a + b);
> +}
> +
> +int
> +smax_sub (int a, int b)
> +{
> +  ASSUME (b >= 0);
> +  return smax (a, a - b);
> +}
> +
> +int
> +smin_sub (int a, int b)
> +{
> +  ASSUME (b >= 0);
> +  return smin (a, a - b);
> +}
> +
> +/* Invalid patterns.  */
> +
> +/* This can potentially be matched, but the RHS gets factored to
> +   (a + b) * b.  */
> +unsigned
> +umax_factored (unsigned a, unsigned b)
> +{
> +  return umax (a * b, a * b + b * b);
> +}
> +
> +unsigned
> +umin_mult (unsigned a, unsigned b)
> +{
> +  return umin (a, a * b);
> +}
> +
> +unsigned
> +umax_sub (unsigned a, unsigned b)
> +{
> +  return umax (a, b - a);
> +}
> +
> +unsigned
> +umin_sub (unsigned a, unsigned b)
> +{
> +  return umin (a, b - a);
> +}
> +
> +/* { dg-final { scan-assembler-not "adds\\t" } } */
> +/* { dg-final { scan-assembler-not "subs\\t" } } */
> +
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr116815-3.c 
> b/gcc/testsuite/gcc.target/aarch64/pr116815-3.c
> new file mode 100644
> index 00000000000..3095a50ac91
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr116815-3.c
> @@ -0,0 +1,63 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
> +
> +/* PR middle-end/116815 */
> +
> +/* Multi-use tests.  */
> +
> +static inline unsigned
> +max (unsigned a, unsigned b)
> +{
> +  return a > b ? a : b;
> +}
> +
> +static inline unsigned
> +min (unsigned a, unsigned b)
> +{
> +  return a < b ? a : b;
> +}
> +
> +/* FIXME: This should only generate one adds.  */
> +
> +/*
> +** umax_add_umin_add:
> +**     adds    (w[0-9]+), w0, w1
> +**     csel    \1, \1, w0, cs
> +**     adds    (w[0-9]+), w1, w0
> +**     csel    w0, \2, w1, cc
> +**     add     w0, \1, \2
> +**     ret
> +*/
> +unsigned
> +umax_add_umin_add (unsigned a, unsigned b)
> +{
> +  return max (a, a + b) + min (a + b, b);
> +}
> +
> +/*
> +** umin_add_umax_add:
> +**     adds    (w[0-9]+), w0, w1
> +**     csel    \1, \1, w0, cc
> +**     adds    (w[0-9]+), w1, w0
> +**     csel    \2, \2, w1, cs
> +**     add     w0, \1, \2
> +**     ret
> +*/
> +unsigned
> +umin_add_umax_add (unsigned a, unsigned b)
> +{
> +  return min (a, b + a) + max (b + a, b);
> +}
> +
> +/* FIXME: This pattern does not get optimized.  */
> +
> +unsigned
> +multiple_paths (unsigned a, unsigned b)
> +{
> +  if (a > 5)
> +    return max (a, a + b);
> +  else
> +    return min (a, a + b);
> +}
> +
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr116815-4.c 
> b/gcc/testsuite/gcc.target/aarch64/pr116815-4.c
> new file mode 100644
> index 00000000000..1d5ca1c1d92
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr116815-4.c
> @@ -0,0 +1,49 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +/* PR middle-end/116815 */
> +
> +/* Single-use tests with a use of the min-max in an if-condition.  */
> +
> +static inline unsigned
> +max (unsigned a, unsigned b)
> +{
> +  return a > b ? a : b;
> +}
> +
> +static inline unsigned
> +min (unsigned a, unsigned b)
> +{
> +  return a < b ? a : b;
> +}
> +
> +#define OPERATION(op, type, N, exp1, exp2)                                   
>   \
> +  unsigned u##op##type##N (unsigned a, unsigned b, unsigned c, unsigned d,   
>   \
> +                          unsigned e)                                        
>  \
> +  {                                                                          
>   \
> +    unsigned result = op (exp1, exp2);                                       
>   \
> +    if (result == c || result == c * 2)                                      
>   \
> +      return d;                                                              
>   \
> +    else                                                                     
>   \
> +      return e;                                                              
>   \
> +  }
> +
> +OPERATION (max, add, 1, a, a + b)
> +OPERATION (max, add, 2, a, b + a)
> +OPERATION (max, add, 3, a + b, a)
> +OPERATION (max, add, 4, b + a, a)
> +
> +OPERATION (min, add, 1, a, a + b)
> +OPERATION (min, add, 2, a, b + a)
> +OPERATION (min, add, 3, a + b, a)
> +OPERATION (min, add, 4, b + a, a)
> +
> +OPERATION (max, sub, 1, a, a - b)
> +OPERATION (max, sub, 2, a - b, a)
> +
> +OPERATION (min, sub, 1, a, a - b)
> +OPERATION (min, sub, 2, a - b, a)
> +
> +/* { dg-final { scan-assembler-times "adds\\t" 8 } } */
> +/* { dg-final { scan-assembler-times "subs\\t" 4 } } */
> +
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr116815-5.c 
> b/gcc/testsuite/gcc.target/aarch64/pr116815-5.c
> new file mode 100644
> index 00000000000..0500056b6b5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr116815-5.c
> @@ -0,0 +1,45 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +#pragma GCC target "+cssc"
> +
> +/* PR middle-end/116815 */
> +
> +/* Make sure that umax/umin instructions are generated with CSSC.  */
> +
> +static inline unsigned
> +max (unsigned a, unsigned b)
> +{
> +  return a > b ? a : b;
> +}
> +
> +static inline unsigned
> +min (unsigned a, unsigned b)
> +{
> +  return a < b ? a : b;
> +}
> +
> +#define OPERATION(op, type, N, exp1, exp2)                                   
>   \
> +  unsigned u##op##type##N (unsigned a, unsigned b) { return op (exp1, exp2); 
> }
> +
> +OPERATION (max, add, 1, a, a + b)
> +OPERATION (max, add, 2, a, b + a)
> +OPERATION (max, add, 3, a + b, a)
> +OPERATION (max, add, 4, b + a, a)
> +
> +OPERATION (min, add, 1, a, a + b)
> +OPERATION (min, add, 2, a, b + a)
> +OPERATION (min, add, 3, a + b, a)
> +OPERATION (min, add, 4, b + a, a)
> +
> +OPERATION (max, sub, 1, a, a - b)
> +OPERATION (max, sub, 2, a - b, a)
> +
> +OPERATION (min, sub, 1, a, a - b)
> +OPERATION (min, sub, 2, a - b, a)
> +
> +/* { dg-final { scan-assembler-times "umax\\t" 6 } } */
> +/* { dg-final { scan-assembler-times "umin\\t" 6 } } */
> +/* { dg-final { scan-assembler-not "adds\\t" } } */
> +/* { dg-final { scan-assembler-not "subs\\t" } } */
> +
> --
> 2.44.0
>

Re: [PATCH] [aarch64] Make better use of overflowing operations in max/min(a, add/sub(a, b)) [PR116815]

Reply via email to