On Tue, 27 Jan 2026, Tamar Christina wrote:

> > > +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
> > > diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc
> > > index
> > 02e194ae06f34957194c4e4f2eb4fdb3ef72d2f5..aa12221a2b2b584fa10fe3
> > 78e16115128408ee3e 100644
> > > --- a/gcc/tree-ssa-math-opts.cc
> > > +++ b/gcc/tree-ssa-math-opts.cc
> > > @@ -3120,6 +3120,30 @@ convert_mult_to_fma_1 (tree mul_result, tree
> > op1, tree op2)
> > >        if (is_gimple_debug (use_stmt))
> > >         continue;
> > >
> > > +      /* If the use is a type convert, look further into it if the 
> > > operations
> > > +        are the same under two's complement.  */
> > > +      tree lhs_type;
> > > +      if (gimple_assign_cast_p (use_stmt)
> > > +         && (lhs_type = TREE_TYPE (gimple_get_lhs (use_stmt)))
> > > +         && TREE_CODE (lhs_type) == TREE_CODE (TREE_TYPE (op1))
> > 
> > strict equality is going to be brittle; what are you trying to protect
> > against with this?
> > 
> 
> It was capturing some conversions like (int)bool.  But..
> 
> > > +         && (TYPE_UNSIGNED (lhs_type)
> > > +             || (ANY_INTEGRAL_TYPE_P (lhs_type)
> > > +                 && !TYPE_OVERFLOW_WRAPS (lhs_type)))
> > > +         && (element_precision (lhs_type)
> > > +               == element_precision (gimple_assign_rhs1 (use_stmt))))
> > 
> > I think that you want to simplify this to tree_nop_conversion_p and make
> > sure to perform the FMA in a wrapping type if you looked through one - that
> > would also allow the reverse sign conversion case.
> 
> I hadn't found tree_nop_conversion_p before and that's indeed much cleaner.
> 
> > 
> > > +       {
> > > +         tree cast_lhs = gimple_get_lhs (use_stmt);
> > > +         gimple *tmp_use;
> > > +         use_operand_p tmp_use_p;
> > > +         if (single_imm_use (cast_lhs, &tmp_use_p, &tmp_use))
> > > +           {
> > > +             use_stmt = tmp_use;
> > > +             result = cast_lhs;
> > > +             gsi_remove (&gsi, true);
> > 
> > release_defs missing?
> > 
> > > +             gsi = gsi_for_stmt (use_stmt);
> > > +           }
> > > +       }
> > > +
> > >        if (is_gimple_assign (use_stmt)
> > >           && gimple_assign_rhs_code (use_stmt) == NEGATE_EXPR)
> > >         {
> > > @@ -3156,6 +3180,11 @@ convert_mult_to_fma_1 (tree mul_result, tree
> > op1, tree op2)
> > >        if (negate_p)
> > >         mulop1 = gimple_build (&seq, NEGATE_EXPR, type, mulop1);
> > >
> > > +      /* Ensure all the operands are of the same type Use the type of the
> > > +        addend as that's the statement being replaced.  */
> > > +      op2 = gimple_convert (&seq, TREE_TYPE (addop), op2);
> > > +      mulop1 = gimple_convert (&seq, TREE_TYPE (addop), mulop1);
> > > +
> > 
> > In your code example I see back-and-forth conversion because of the use of
> > gimple_convert with a 'seq' - if we'd use the 'gsi' overloads that would be
> > avoided by also match-and-simplifying with other stmts in the IL.
> > 
> 
> Ack. I had expected the fold_stmt call on the final FMA to take care of it,
> but changed to the gsi variant.
> 
> Bootstrapped and regtested on aarch64-none-linux-gnu,
> arm-none-linux-gnueabihf, x86_64-pc-linux-gnu
> -m32, -m64 with no issues.
> 
> Ok for master?

OK.

Thanks,
Richard.

> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
>       PR tree-optimization/122749
>       * tree-ssa-math-opts.cc (convert_mult_to_fma_1, convert_mult_to_fma):
>       Unwrap converts around addend.
> 
> gcc/testsuite/ChangeLog:
> 
>       PR tree-optimization/122749
>       * gcc.target/aarch64/pr122749_1.c: New test.
>       * gcc.target/aarch64/pr122749_2.c: New test.
>       * gcc.target/aarch64/pr122749_3.c: New test.
>       * gcc.target/aarch64/pr122749_4.c: New test.
>       * gcc.target/aarch64/pr122749_5.c: New test.
>       * gcc.target/aarch64/pr122749_6.c: New test.
>       * gcc.target/aarch64/pr122749_8.c: New test.
>       * gcc.target/aarch64/pr122749_9.c: New test.
>       * gcc.target/aarch64/sve/pr122749_1.c: New test.
>       * gcc.target/aarch64/sve/pr122749_11.c: New test.
>       * gcc.target/aarch64/sve/pr122749_12.c: New test.
>       * gcc.target/aarch64/sve/pr122749_13.c: New test.
>       * gcc.target/aarch64/sve/pr122749_14.c: New test.
>       * gcc.target/aarch64/sve/pr122749_2.c: New test.
>       * gcc.target/aarch64/sve/pr122749_3.c: New test.
>       * gcc.target/aarch64/sve/pr122749_4.c: New test.
>       * gcc.target/aarch64/sve/pr122749_5.c: New test.
>       * gcc.target/aarch64/sve/pr122749_6.c: New test.
>       * gcc.target/aarch64/sve/pr122749_8.c: New test.
>       * gcc.target/aarch64/sve/pr122749_9.c: New test.
> 
> -- inline copy of patch --
> 
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr122749_1.c 
> b/gcc/testsuite/gcc.target/aarch64/pr122749_1.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..25311fce4e3a79b389cbb750231c1277ccaf0611
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr122749_1.c
> @@ -0,0 +1,48 @@
> +/* { dg-do run { target arm_v8_neon_hw } } */
> +/* { dg-additional-options "-Ofast -std=gnu99 --param 
> vect-epilogues-nomask=0 -fdump-tree-vect-details -fdump-tree-widening_mul" } 
> */
> +
> +#include <limits.h>
> +#include <stdint.h>
> +
> +typedef int8_t elem_t;
> +
> +__attribute__ ((noipa))
> +elem_t
> +foo2 (elem_t *buf, int len)
> +{
> +  elem_t x = 0;
> +
> +  for (int i = 0; i < len; i++)
> +    x += (elem_t) i * buf[i];
> +
> +  return x;
> +}
> +
> +static elem_t
> +reference (elem_t *buf, int len)
> +{
> +  elem_t x = 0;
> +
> +#pragma GCC novector
> +  for (int i = 0; i < len; i++)
> +    x += (elem_t) i * buf[i];
> +
> +  return x;
> +}
> +
> +int
> +main (void)
> +{
> +  elem_t buf[] = { 1, -2, INT8_MAX, INT8_MIN, 5, -7, 3, -4 };
> +  int len = sizeof (buf) / sizeof (buf[0]);
> +  elem_t want = reference (buf, len);
> +  elem_t got = foo2 (buf, len);
> +
> +  if (want != got)
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\.FMA" 1 "widening_mul" { xfail *-*-* 
> } } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr122749_2.c 
> b/gcc/testsuite/gcc.target/aarch64/pr122749_2.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..f4a70a611176893e9fa55d8bc1826805ed5d966d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr122749_2.c
> @@ -0,0 +1,48 @@
> +/* { dg-do run { target arm_v8_neon_hw } } */
> +/* { dg-additional-options "-Ofast -std=gnu99 --param 
> vect-epilogues-nomask=0 -fdump-tree-vect-details -fdump-tree-widening_mul" } 
> */
> +
> +#include <limits.h>
> +#include <stdint.h>
> +
> +typedef int16_t elem_t;
> +
> +__attribute__ ((noipa))
> +elem_t
> +foo2 (elem_t *buf, int len)
> +{
> +  elem_t x = 0;
> +
> +  for (int i = 0; i < len; i++)
> +    x += (elem_t) i * buf[i];
> +
> +  return x;
> +}
> +
> +static elem_t
> +reference (elem_t *buf, int len)
> +{
> +  elem_t x = 0;
> +
> +#pragma GCC novector
> +  for (int i = 0; i < len; i++)
> +    x += (elem_t) i * buf[i];
> +
> +  return x;
> +}
> +
> +int
> +main (void)
> +{
> +  elem_t buf[] = { 1, -2, INT16_MAX, INT16_MIN, 5, -7, 3, -4 };
> +  int len = sizeof (buf) / sizeof (buf[0]);
> +  elem_t want = reference (buf, len);
> +  elem_t got = foo2 (buf, len);
> +
> +  if (want != got)
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\.FMA" 1 "widening_mul" { xfail *-*-* 
> } } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr122749_3.c 
> b/gcc/testsuite/gcc.target/aarch64/pr122749_3.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..61bcd30be2b47f482e8b3f0a024b2a1d51c4fda7
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr122749_3.c
> @@ -0,0 +1,48 @@
> +/* { dg-do run { target arm_v8_neon_hw } } */
> +/* { dg-additional-options "-Ofast -std=gnu99 --param 
> vect-epilogues-nomask=0 -fdump-tree-vect-details -fdump-tree-widening_mul" } 
> */
> +
> +#include <limits.h>
> +#include <stdint.h>
> +
> +typedef int32_t elem_t;
> +
> +__attribute__ ((noipa))
> +elem_t
> +foo2 (elem_t *buf, int len)
> +{
> +  elem_t x = 0;
> +
> +  for (int i = 0; i < len; i++)
> +    x += (elem_t) i * buf[i];
> +
> +  return x;
> +}
> +
> +static elem_t
> +reference (elem_t *buf, int len)
> +{
> +  elem_t x = 0;
> +
> +#pragma GCC novector
> +  for (int i = 0; i < len; i++)
> +    x += (elem_t) i * buf[i];
> +
> +  return x;
> +}
> +
> +int
> +main (void)
> +{
> +  elem_t buf[] = { 1, -2, INT32_MAX, INT32_MIN, 5, -7, 3, -4 };
> +  int len = sizeof (buf) / sizeof (buf[0]);
> +  elem_t want = reference (buf, len);
> +  elem_t got = foo2 (buf, len);
> +
> +  if (want != got)
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\.FMA" 1 "widening_mul" { xfail *-*-* 
> } } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr122749_4.c 
> b/gcc/testsuite/gcc.target/aarch64/pr122749_4.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..6089716b0ca7498f9b8089f1b72d2968b1c2ee76
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr122749_4.c
> @@ -0,0 +1,45 @@
> +/* { dg-do run { target arm_v8_neon_hw } } */
> +/* { dg-additional-options "-Ofast -std=gnu99 --param 
> vect-epilogues-nomask=0 -fdump-tree-vect-details -fdump-tree-widening_mul" } 
> */
> +
> +typedef float elem_t;
> +
> +__attribute__ ((noipa))
> +elem_t
> +foo2 (elem_t *buf, int len)
> +{
> +  elem_t x = 0;
> +
> +  for (int i = 0; i < len; i++)
> +    x += (elem_t) i * buf[i];
> +
> +  return x;
> +}
> +
> +static elem_t
> +reference (elem_t *buf, int len)
> +{
> +  elem_t x = 0;
> +
> +#pragma GCC novector
> +  for (int i = 0; i < len; i++)
> +    x += (elem_t) i * buf[i];
> +
> +  return x;
> +}
> +
> +int
> +main (void)
> +{
> +  elem_t buf[] = { 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f };
> +  int len = sizeof (buf) / sizeof (buf[0]);
> +  elem_t want = reference (buf, len);
> +  elem_t got = foo2 (buf, len);
> +
> +  if (want != got)
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\\.FMA" 4 "widening_mul" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr122749_5.c 
> b/gcc/testsuite/gcc.target/aarch64/pr122749_5.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..562dc5be861762272ea8d23b8304e1abb439e20f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr122749_5.c
> @@ -0,0 +1,45 @@
> +/* { dg-do run { target arm_v8_neon_hw } } */
> +/* { dg-additional-options "-Ofast -std=gnu99 --param 
> vect-epilogues-nomask=0 -fdump-tree-vect-details -fdump-tree-widening_mul" } 
> */
> +
> +typedef double elem_t;
> +
> +__attribute__ ((noipa))
> +elem_t
> +foo2 (elem_t *buf, int len)
> +{
> +  elem_t x = 0;
> +
> +  for (int i = 0; i < len; i++)
> +    x += (elem_t) i * buf[i];
> +
> +  return x;
> +}
> +
> +static elem_t
> +reference (elem_t *buf, int len)
> +{
> +  elem_t x = 0;
> +
> +#pragma GCC novector
> +  for (int i = 0; i < len; i++)
> +    x += (elem_t) i * buf[i];
> +
> +  return x;
> +}
> +
> +int
> +main (void)
> +{
> +  elem_t buf[] = { 1.0, 2.0, 1.0, 2.0, 1.0, 2.0 };
> +  int len = sizeof (buf) / sizeof (buf[0]);
> +  elem_t want = reference (buf, len);
> +  elem_t got = foo2 (buf, len);
> +
> +  if (want != got)
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\\.FMA" 2 "widening_mul" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr122749_6.c 
> b/gcc/testsuite/gcc.target/aarch64/pr122749_6.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..3e51c5e22a18a9a3acd2416c3ba72496c9621adf
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr122749_6.c
> @@ -0,0 +1,45 @@
> +/* { dg-do run { target arm_v8_neon_hw } } */
> +/* { dg-additional-options "-Ofast -std=gnu99 --param 
> vect-epilogues-nomask=0 -fwrapv -fdump-tree-vect-details 
> -fdump-tree-widening_mul" } */
> +
> +typedef float elem_t;
> +
> +__attribute__ ((noipa))
> +elem_t
> +foo2 (elem_t *buf, int len)
> +{
> +  elem_t x = 0;
> +
> +  for (int i = 0; i < len; i++)
> +    x += (elem_t) i * buf[i];
> +
> +  return x;
> +}
> +
> +static elem_t
> +reference (elem_t *buf, int len)
> +{
> +  elem_t x = 0;
> +
> +#pragma GCC novector
> +  for (int i = 0; i < len; i++)
> +    x += (elem_t) i * buf[i];
> +
> +  return x;
> +}
> +
> +int
> +main (void)
> +{
> +  elem_t buf[] = { 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f };
> +  int len = sizeof (buf) / sizeof (buf[0]);
> +  elem_t want = reference (buf, len);
> +  elem_t got = foo2 (buf, len);
> +
> +  if (want != got)
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\\.FMA" 4 "widening_mul" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr122749_8.c 
> b/gcc/testsuite/gcc.target/aarch64/pr122749_8.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..6aa729c13d1616273d579077253d3fcdf55cc555
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr122749_8.c
> @@ -0,0 +1,48 @@
> +/* { dg-do run { target arm_v8_neon_hw } } */
> +/* { dg-additional-options "-Ofast -std=gnu99 --param 
> vect-epilogues-nomask=0 -fdump-tree-vect-details -fdump-tree-widening_mul" } 
> */
> +
> +#include <limits.h>
> +#include <stdint.h>
> +
> +typedef uint8_t elem_t;
> +
> +__attribute__ ((noipa))
> +elem_t
> +foo2 (elem_t *buf, int len)
> +{
> +  elem_t x = 0;
> +
> +  for (int i = 0; i < len; i++)
> +    x += (elem_t) i * buf[i];
> +
> +  return x;
> +}
> +
> +static elem_t
> +reference (elem_t *buf, int len)
> +{
> +  elem_t x = 0;
> +
> +#pragma GCC novector
> +  for (int i = 0; i < len; i++)
> +    x += (elem_t) i * buf[i];
> +
> +  return x;
> +}
> +
> +int
> +main (void)
> +{
> +  elem_t buf[] = { 1, 2, UINT8_MAX, 7, 0, UINT8_MAX, 5, 9 };
> +  int len = sizeof (buf) / sizeof (buf[0]);
> +  elem_t want = reference (buf, len);
> +  elem_t got = foo2 (buf, len);
> +
> +  if (want != got)
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\.FMA" 1 "widening_mul" { xfail *-*-* 
> } } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr122749_9.c 
> b/gcc/testsuite/gcc.target/aarch64/pr122749_9.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..d987a9936afb2cb4ba19e62736fa4ed171669e25
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr122749_9.c
> @@ -0,0 +1,48 @@
> +/* { dg-do run { target arm_v8_neon_hw } } */
> +/* { dg-additional-options "-Ofast -std=gnu99 --param 
> vect-epilogues-nomask=0 -fdump-tree-vect-details -fdump-tree-widening_mul" } 
> */
> +
> +#include <limits.h>
> +#include <stdint.h>
> +
> +typedef uint16_t elem_t;
> +
> +__attribute__ ((noipa))
> +elem_t
> +foo2 (elem_t *buf, int len)
> +{
> +  elem_t x = 0;
> +
> +  for (int i = 0; i < len; i++)
> +    x += (elem_t) i * buf[i];
> +
> +  return x;
> +}
> +
> +static elem_t
> +reference (elem_t *buf, int len)
> +{
> +  elem_t x = 0;
> +
> +#pragma GCC novector
> +  for (int i = 0; i < len; i++)
> +    x += (elem_t) i * buf[i];
> +
> +  return x;
> +}
> +
> +int
> +main (void)
> +{
> +  elem_t buf[] = { 1, 2, UINT16_MAX, 7, 0, UINT16_MAX, 5, 9 };
> +  int len = sizeof (buf) / sizeof (buf[0]);
> +  elem_t want = reference (buf, len);
> +  elem_t got = foo2 (buf, len);
> +
> +  if (want != got)
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\.FMA" 1 "widening_mul" { xfail *-*-* 
> } } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_1.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_1.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..32a36461fbc7bb78048ae68c8dc0bdd81b11a2cd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_1.c
> @@ -0,0 +1,48 @@
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-additional-options "-Ofast -std=gnu99 -fdump-tree-vect-details 
> -fdump-tree-widening_mul" } */
> +
> +#include <limits.h>
> +#include <stdint.h>
> +
> +typedef int8_t elem_t;
> +
> +__attribute__ ((noipa))
> +elem_t
> +foo2 (elem_t *buf, int len)
> +{
> +  elem_t x = 0;
> +
> +  for (int i = 0; i < len; i++)
> +    x += (elem_t) i * buf[i];
> +
> +  return x;
> +}
> +
> +static elem_t
> +reference (elem_t *buf, int len)
> +{
> +  elem_t x = 0;
> +
> +#pragma GCC novector
> +  for (int i = 0; i < len; i++)
> +    x += (elem_t) i * buf[i];
> +
> +  return x;
> +}
> +
> +int
> +main (void)
> +{
> +  elem_t buf[] = { 1, -2, INT8_MAX, INT8_MIN, 5, -7, 3, -4 };
> +  int len = sizeof (buf) / sizeof (buf[0]);
> +  elem_t want = reference (buf, len);
> +  elem_t got = foo2 (buf, len);
> +
> +  if (want != got)
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_11.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_11.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..bd160dd0ebf515a3ff3ddd1969303aabf8c03aea
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_11.c
> @@ -0,0 +1,48 @@
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-additional-options "-Ofast -std=gnu99 -fdump-tree-vect-details 
> -fdump-tree-widening_mul" } */
> +
> +#include <limits.h>
> +#include <stdint.h>
> +
> +typedef uint8_t elem_t;
> +
> +__attribute__ ((noipa))
> +elem_t
> +foo2 (elem_t *buf, int len)
> +{
> +  elem_t x = 0;
> +
> +  for (int i = 0; i < len; i++)
> +    x += (elem_t) i * buf[i];
> +
> +  return x;
> +}
> +
> +static elem_t
> +reference (elem_t *buf, int len)
> +{
> +  elem_t x = 0;
> +
> +#pragma GCC novector
> +  for (int i = 0; i < len; i++)
> +    x += (elem_t) i * buf[i];
> +
> +  return x;
> +}
> +
> +int
> +main (void)
> +{
> +  elem_t buf[] = { 1, 2, UINT8_MAX, 7, 0, UINT8_MAX, 5, 9 };
> +  int len = sizeof (buf) / sizeof (buf[0]);
> +  elem_t want = reference (buf, len);
> +  elem_t got = foo2 (buf, len);
> +
> +  if (want != got)
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_12.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_12.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..8f0198ce42600b0fe92bf483123ad1cb71ff9f24
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_12.c
> @@ -0,0 +1,48 @@
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-additional-options "-Ofast -std=gnu99 -fdump-tree-vect-details 
> -fdump-tree-widening_mul" } */
> +
> +#include <limits.h>
> +#include <stdint.h>
> +
> +typedef uint16_t elem_t;
> +
> +__attribute__ ((noipa))
> +elem_t
> +foo2 (elem_t *buf, int len)
> +{
> +  elem_t x = 0;
> +
> +  for (int i = 0; i < len; i++)
> +    x += (elem_t) i * buf[i];
> +
> +  return x;
> +}
> +
> +static elem_t
> +reference (elem_t *buf, int len)
> +{
> +  elem_t x = 0;
> +
> +#pragma GCC novector
> +  for (int i = 0; i < len; i++)
> +    x += (elem_t) i * buf[i];
> +
> +  return x;
> +}
> +
> +int
> +main (void)
> +{
> +  elem_t buf[] = { 1, 2, UINT16_MAX, 7, 0, UINT16_MAX, 5, 9 };
> +  int len = sizeof (buf) / sizeof (buf[0]);
> +  elem_t want = reference (buf, len);
> +  elem_t got = foo2 (buf, len);
> +
> +  if (want != got)
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_13.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_13.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..218afde13984fc64755d3c4567a05a33b5485411
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_13.c
> @@ -0,0 +1,48 @@
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-additional-options "-Ofast -std=gnu99 -fdump-tree-vect-details 
> -fdump-tree-widening_mul" } */
> +
> +#include <limits.h>
> +#include <stdint.h>
> +
> +typedef uint32_t elem_t;
> +
> +__attribute__ ((noipa))
> +elem_t
> +foo2 (elem_t *buf, int len)
> +{
> +  elem_t x = 0;
> +
> +  for (int i = 0; i < len; i++)
> +    x += (elem_t) i * buf[i];
> +
> +  return x;
> +}
> +
> +static elem_t
> +reference (elem_t *buf, int len)
> +{
> +  elem_t x = 0;
> +
> +#pragma GCC novector
> +  for (int i = 0; i < len; i++)
> +    x += (elem_t) i * buf[i];
> +
> +  return x;
> +}
> +
> +int
> +main (void)
> +{
> +  elem_t buf[] = { 1, 2, UINT32_MAX, 7, 0, UINT32_MAX, 5, 9 };
> +  int len = sizeof (buf) / sizeof (buf[0]);
> +  elem_t want = reference (buf, len);
> +  elem_t got = foo2 (buf, len);
> +
> +  if (want != got)
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_14.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_14.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..1587628757e28f66dfd515e191ef04331c549434
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_14.c
> @@ -0,0 +1,48 @@
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-additional-options "-Ofast -std=gnu99 -fdump-tree-vect-details 
> -fdump-tree-widening_mul" } */
> +
> +#include <limits.h>
> +#include <stdint.h>
> +
> +typedef uint64_t elem_t;
> +
> +__attribute__ ((noipa))
> +elem_t
> +foo2 (elem_t *buf, int len)
> +{
> +  elem_t x = 0;
> +
> +  for (int i = 0; i < len; i++)
> +    x += (elem_t) i * buf[i];
> +
> +  return x;
> +}
> +
> +static elem_t
> +reference (elem_t *buf, int len)
> +{
> +  elem_t x = 0;
> +
> +#pragma GCC novector
> +  for (int i = 0; i < len; i++)
> +    x += (elem_t) i * buf[i];
> +
> +  return x;
> +}
> +
> +int
> +main (void)
> +{
> +  elem_t buf[] = { 1, 2, UINT64_MAX, 7, 0, UINT64_MAX, 5, 9 };
> +  int len = sizeof (buf) / sizeof (buf[0]);
> +  elem_t want = reference (buf, len);
> +  elem_t got = foo2 (buf, len);
> +
> +  if (want != got)
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_2.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_2.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..0f5918a9023521b06ac20ef922b025dc6a1e8f01
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_2.c
> @@ -0,0 +1,48 @@
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-additional-options "-Ofast -std=gnu99 -fdump-tree-vect-details 
> -fdump-tree-widening_mul" } */
> +
> +#include <limits.h>
> +#include <stdint.h>
> +
> +typedef int16_t elem_t;
> +
> +__attribute__ ((noipa))
> +elem_t
> +foo2 (elem_t *buf, int len)
> +{
> +  elem_t x = 0;
> +
> +  for (int i = 0; i < len; i++)
> +    x += (elem_t) i * buf[i];
> +
> +  return x;
> +}
> +
> +static elem_t
> +reference (elem_t *buf, int len)
> +{
> +  elem_t x = 0;
> +
> +#pragma GCC novector
> +  for (int i = 0; i < len; i++)
> +    x += (elem_t) i * buf[i];
> +
> +  return x;
> +}
> +
> +int
> +main (void)
> +{
> +  elem_t buf[] = { 1, -2, INT16_MAX, INT16_MIN, 5, -7, 3, -4 };
> +  int len = sizeof (buf) / sizeof (buf[0]);
> +  elem_t want = reference (buf, len);
> +  elem_t got = foo2 (buf, len);
> +
> +  if (want != got)
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_3.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_3.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..92548cb6ec4fdc4a3d133669fb914c5ab9a103ba
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_3.c
> @@ -0,0 +1,48 @@
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-additional-options "-Ofast -std=gnu99 -fdump-tree-vect-details 
> -fdump-tree-widening_mul" } */
> +
> +#include <limits.h>
> +#include <stdint.h>
> +
> +typedef int32_t elem_t;
> +
> +__attribute__ ((noipa))
> +elem_t
> +foo2 (elem_t *buf, int len)
> +{
> +  elem_t x = 0;
> +
> +  for (int i = 0; i < len; i++)
> +    x += (elem_t) i * buf[i];
> +
> +  return x;
> +}
> +
> +static elem_t
> +reference (elem_t *buf, int len)
> +{
> +  elem_t x = 0;
> +
> +#pragma GCC novector
> +  for (int i = 0; i < len; i++)
> +    x += (elem_t) i * buf[i];
> +
> +  return x;
> +}
> +
> +int
> +main (void)
> +{
> +  elem_t buf[] = { 1, -2, INT32_MAX, INT32_MIN, 5, -7, 3, -4 };
> +  int len = sizeof (buf) / sizeof (buf[0]);
> +  elem_t want = reference (buf, len);
> +  elem_t got = foo2 (buf, len);
> +
> +  if (want != got)
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_4.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_4.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..6085a18bab7f2ae0e5855a982e186f831705bf52
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_4.c
> @@ -0,0 +1,48 @@
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-additional-options "-Ofast -std=gnu99 -fdump-tree-vect-details 
> -fdump-tree-widening_mul" } */
> +
> +#include <limits.h>
> +#include <stdint.h>
> +
> +typedef int64_t elem_t;
> +
> +__attribute__ ((noipa))
> +elem_t
> +foo2 (elem_t *buf, int len)
> +{
> +  elem_t x = 0;
> +
> +  for (int i = 0; i < len; i++)
> +    x += (elem_t) i * buf[i];
> +
> +  return x;
> +}
> +
> +static elem_t
> +reference (elem_t *buf, int len)
> +{
> +  elem_t x = 0;
> +
> +#pragma GCC novector
> +  for (int i = 0; i < len; i++)
> +    x += (elem_t) i * buf[i];
> +
> +  return x;
> +}
> +
> +int
> +main (void)
> +{
> +  elem_t buf[] = { 1, -2, INT64_MAX, INT64_MIN, 5, -7, 3, -4 };
> +  int len = sizeof (buf) / sizeof (buf[0]);
> +  elem_t want = reference (buf, len);
> +  elem_t got = foo2 (buf, len);
> +
> +  if (want != got)
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_5.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_5.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..d61b91bb06dc0a035bd6adfabccc580eac7f78a6
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_5.c
> @@ -0,0 +1,45 @@
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-additional-options "-Ofast -std=gnu99 -fdump-tree-vect-details 
> -fdump-tree-widening_mul" } */
> +
> +typedef float elem_t;
> +
> +__attribute__ ((noipa))
> +elem_t
> +foo2 (elem_t *buf, int len)
> +{
> +  elem_t x = 0;
> +
> +  for (int i = 0; i < len; i++)
> +    x += (elem_t) i * buf[i];
> +
> +  return x;
> +}
> +
> +static elem_t
> +reference (elem_t *buf, int len)
> +{
> +  elem_t x = 0;
> +
> +#pragma GCC novector
> +  for (int i = 0; i < len; i++)
> +    x += (elem_t) i * buf[i];
> +
> +  return x;
> +}
> +
> +int
> +main (void)
> +{
> +  elem_t buf[] = { 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f };
> +  int len = sizeof (buf) / sizeof (buf[0]);
> +  elem_t want = reference (buf, len);
> +  elem_t got = foo2 (buf, len);
> +
> +  if (want != got)
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_6.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_6.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..7598f7a28bcf1745ce672c0bab22fec0fda37a3f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_6.c
> @@ -0,0 +1,45 @@
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-additional-options "-Ofast -std=gnu99 -fdump-tree-vect-details 
> -fdump-tree-widening_mul" } */
> +
> +typedef double elem_t;
> +
> +__attribute__ ((noipa))
> +elem_t
> +foo2 (elem_t *buf, int len)
> +{
> +  elem_t x = 0;
> +
> +  for (int i = 0; i < len; i++)
> +    x += (elem_t) i * buf[i];
> +
> +  return x;
> +}
> +
> +static elem_t
> +reference (elem_t *buf, int len)
> +{
> +  elem_t x = 0;
> +
> +#pragma GCC novector
> +  for (int i = 0; i < len; i++)
> +    x += (elem_t) i * buf[i];
> +
> +  return x;
> +}
> +
> +int
> +main (void)
> +{
> +  elem_t buf[] = { 1.0, 2.0, 1.0, 2.0, 1.0, 2.0 };
> +  int len = sizeof (buf) / sizeof (buf[0]);
> +  elem_t want = reference (buf, len);
> +  elem_t got = foo2 (buf, len);
> +
> +  if (want != got)
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_8.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_8.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..e1c337d44ead96d868d71f0ae54960f2189e499e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_8.c
> @@ -0,0 +1,45 @@
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-additional-options "-Ofast -std=gnu99 -fwrapv 
> -fdump-tree-vect-details -fdump-tree-widening_mul" } */
> +
> +typedef float elem_t;
> +
> +__attribute__ ((noipa))
> +elem_t
> +foo2 (elem_t *buf, int len)
> +{
> +  elem_t x = 0;
> +
> +  for (int i = 0; i < len; i++)
> +    x += (elem_t) i * buf[i];
> +
> +  return x;
> +}
> +
> +static elem_t
> +reference (elem_t *buf, int len)
> +{
> +  elem_t x = 0;
> +
> +#pragma GCC novector
> +  for (int i = 0; i < len; i++)
> +    x += (elem_t) i * buf[i];
> +
> +  return x;
> +}
> +
> +int
> +main (void)
> +{
> +  elem_t buf[] = { 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f };
> +  int len = sizeof (buf) / sizeof (buf[0]);
> +  elem_t want = reference (buf, len);
> +  elem_t got = foo2 (buf, len);
> +
> +  if (want != got)
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_9.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_9.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..13d962e2130f986910f1a94489e4014761e917b5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_9.c
> @@ -0,0 +1,45 @@
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-additional-options "-Ofast -std=gnu99 -fwrapv 
> -fdump-tree-vect-details -fdump-tree-widening_mul" } */
> +
> +typedef double elem_t;
> +
> +__attribute__ ((noipa))
> +elem_t
> +foo2 (elem_t *buf, int len)
> +{
> +  elem_t x = 0;
> +
> +  for (int i = 0; i < len; i++)
> +    x += (elem_t) i * buf[i];
> +
> +  return x;
> +}
> +
> +static elem_t
> +reference (elem_t *buf, int len)
> +{
> +  elem_t x = 0;
> +
> +#pragma GCC novector
> +  for (int i = 0; i < len; i++)
> +    x += (elem_t) i * buf[i];
> +
> +  return x;
> +}
> +
> +int
> +main (void)
> +{
> +  elem_t buf[] = { 1.0, 2.0, 1.0, 2.0, 1.0, 2.0 };
> +  int len = sizeof (buf) / sizeof (buf[0]);
> +  elem_t want = reference (buf, len);
> +  elem_t got = foo2 (buf, len);
> +
> +  if (want != got)
> +    __builtin_abort ();
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
> diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc
> index 
> 4c3fb0f4fc5313199357d19ab809a7d8d88ed2d6..4b50a96ad3aa19857c5b8436ee8d6d3080d3c9ed
>  100644
> --- a/gcc/tree-ssa-math-opts.cc
> +++ b/gcc/tree-ssa-math-opts.cc
> @@ -3120,6 +3120,26 @@ convert_mult_to_fma_1 (tree mul_result, tree op1, tree 
> op2)
>        if (is_gimple_debug (use_stmt))
>       continue;
>  
> +      /* If the use is a type convert, look further into it if the operations
> +      are the same under two's complement.  */
> +      tree lhs_type;
> +      if (gimple_assign_cast_p (use_stmt)
> +       && (lhs_type = TREE_TYPE (gimple_get_lhs (use_stmt)))
> +       && tree_nop_conversion_p (lhs_type, TREE_TYPE (op1)))
> +     {
> +       tree cast_lhs = gimple_get_lhs (use_stmt);
> +       gimple *tmp_use;
> +       use_operand_p tmp_use_p;
> +       if (single_imm_use (cast_lhs, &tmp_use_p, &tmp_use))
> +         {
> +           release_defs (use_stmt);
> +           use_stmt = tmp_use;
> +           result = cast_lhs;
> +           gsi_remove (&gsi, true);
> +           gsi = gsi_for_stmt (use_stmt);
> +         }
> +     }
> +
>        if (is_gimple_assign (use_stmt)
>         && gimple_assign_rhs_code (use_stmt) == NEGATE_EXPR)
>       {
> @@ -3159,6 +3179,13 @@ convert_mult_to_fma_1 (tree mul_result, tree op1, tree 
> op2)
>        if (seq)
>       gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
>  
> +      /* Ensure all the operands are of the same type.  Use the type of the
> +      addend as that's the statement being replaced.  */
> +      op2 = gimple_convert (&gsi, true, GSI_SAME_STMT,
> +                         UNKNOWN_LOCATION, TREE_TYPE (addop), op2);
> +      mulop1 = gimple_convert (&gsi, true, GSI_SAME_STMT,
> +                            UNKNOWN_LOCATION, TREE_TYPE (addop), mulop1);
> +
>        if (len)
>       fma_stmt
>         = gimple_build_call_internal (IFN_COND_LEN_FMA, 7, cond, mulop1, op2,
> @@ -3419,6 +3446,20 @@ convert_mult_to_fma (gimple *mul_stmt, tree op1, tree 
> op2,
>        if (is_gimple_debug (use_stmt))
>       continue;
>  
> +      /* If the use is a type convert, look further into it if the operations
> +      are the same under two's complement.  */
> +      tree lhs_type;
> +      if (gimple_assign_cast_p (use_stmt)
> +       && (lhs_type = TREE_TYPE (gimple_get_lhs (use_stmt)))
> +       && tree_nop_conversion_p (lhs_type, TREE_TYPE (op1)))
> +     {
> +       tree cast_lhs = gimple_get_lhs (use_stmt);
> +       gimple *tmp_use;
> +       use_operand_p tmp_use_p;
> +       if (single_imm_use (cast_lhs, &tmp_use_p, &tmp_use))
> +         use_stmt = tmp_use;
> +     }
> +
>        /* For now restrict this operations to single basic blocks.  In theory
>        we would want to support sinking the multiplication in
>        m = a*b;
> 

-- 
Richard Biener <[email protected]>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Jochen Jaser, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)

Reply via email to