On Mon, Jan 5, 2026 at 10:45 PM Tamar Christina <[email protected]> wrote:
>
> The following example
>
> int foo2 (char *buf, int len) {
> int x;
> for (int i =0; i < len; i++) {
> x += (int) i * buf[i];
> }
> return x;
> }
>
> compiled with -O3 -mcpu=neoverse-v2 used to generate a 4x unrolled MLA
> sequence
>
> mla z29.s, p7/m, z2.s, z0.s
> mla z27.s, p7/m, z4.s, z26.s
> mla z30.s, p7/m, z1.s, z0.s
> mla z28.s, p7/m, z23.s, z3.s
>
> but now generates MUL + ADD
>
> mul z2.s, z2.s, z1.s
> mul z4.s, z4.s, z26.s
> mul z1.s, z24.s, z1.s
> mul z3.s, z23.s, z3.s
> add z29.s, z2.s, z29.s
> add z30.s, z1.s, z30.s
> add z28.s, z3.s, z28.s
> add z0.s, z4.s, z0.s
>
> This is because, since the fix in r16-3328-g3182e95eda4, we now insert casts
> around the reduction addend. This causes convert_mult_to_fma to miss the
> mul + add sequence.
>
> This patch teaches it to look through the casts around the operands and to
> only accept the conversions if they are essentially just sign-changing
> operations. If the operation is being converted from unsigned to signed, it
> additionally requires that we're not using a type where the overflow wraps.
>
> Concretely, it converts:
>
> # vect_vec_iv_.13_49 = PHI <_50(5), { 0, 1, 2, ... }(4)>
> vect__3.8_38 = MEM <vector([4,4]) char> [(char *)_16];
> vect__4.12_45 = (vector([4,4]) int) vect__3.8_38;
> vect__5.14_54 = vect__4.12_45 * vect_vec_iv_.13_49;
> vect_x_12.17_62 = VIEW_CONVERT_EXPR<vector([4,4]) unsigned
> int>(vect__5.14_54);
> vect_x_12.17_63 = VIEW_CONVERT_EXPR<vector([4,4]) unsigned
> int>(vect_x_16.15_58);
> vect_x_12.17_64 = vect_x_12.17_62 + vect_x_12.17_63;
> vect_x_12.16_65 = VIEW_CONVERT_EXPR<vector([4,4]) int>(vect_x_12.17_64);
>
> into:
>
> # vect_vec_iv_.13_49 = PHI <_50(5), { 0, 1, 2, ... }(4)>
> vect__3.8_38 = MEM <vector([4,4]) charD.8> [(charD.8 *)_16];
> vect__4.12_45 = (vector([4,4]) intD.7) vect__3.8_38;
> vect_x_12.17_63 = VIEW_CONVERT_EXPR<vector([4,4]) unsigned
> int>(vect_x_16.15_58);
> _2 = (vector([4,4]) unsigned int) vect_vec_iv_.13_49;
> _1 = (vector([4,4]) unsigned int) vect__4.12_45;
> vect_x_12.17_64 = .FMA (_1, _2, vect_x_12.17_63);
> vect_x_12.16_65 = VIEW_CONVERT_EXPR<vector([4,4]) intD.7>(vect_x_12.17_64);
>
> thus restoring FMAs on reductions.
>
> Bootstrapped and regtested on aarch64-none-linux-gnu,
> arm-none-linux-gnueabihf, x86_64-pc-linux-gnu
> -m32, -m64 with no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> PR tree-optimization/122749
> * tree-ssa-math-opts.cc (convert_mult_to_fma_1, convert_mult_to_fma):
> Unwrap converts around addend.
>
> gcc/testsuite/ChangeLog:
>
> PR tree-optimization/122749
> * gcc.target/aarch64/pr122749_1.c: New test.
> * gcc.target/aarch64/pr122749_2.c: New test.
> * gcc.target/aarch64/pr122749_3.c: New test.
> * gcc.target/aarch64/pr122749_4.c: New test.
> * gcc.target/aarch64/pr122749_5.c: New test.
> * gcc.target/aarch64/pr122749_6.c: New test.
> * gcc.target/aarch64/pr122749_8.c: New test.
> * gcc.target/aarch64/pr122749_9.c: New test.
> * gcc.target/aarch64/sve/pr122749_1.c: New test.
> * gcc.target/aarch64/sve/pr122749_11.c: New test.
> * gcc.target/aarch64/sve/pr122749_12.c: New test.
> * gcc.target/aarch64/sve/pr122749_13.c: New test.
> * gcc.target/aarch64/sve/pr122749_14.c: New test.
> * gcc.target/aarch64/sve/pr122749_2.c: New test.
> * gcc.target/aarch64/sve/pr122749_3.c: New test.
> * gcc.target/aarch64/sve/pr122749_4.c: New test.
> * gcc.target/aarch64/sve/pr122749_5.c: New test.
> * gcc.target/aarch64/sve/pr122749_6.c: New test.
> * gcc.target/aarch64/sve/pr122749_8.c: New test.
> * gcc.target/aarch64/sve/pr122749_9.c: New test.
>
> ---
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr122749_1.c
> b/gcc/testsuite/gcc.target/aarch64/pr122749_1.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..25311fce4e3a79b389cbb750231c1277ccaf0611
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr122749_1.c
> @@ -0,0 +1,48 @@
> +/* { dg-do run { target arm_v8_neon_hw } } */
> +/* { dg-additional-options "-Ofast -std=gnu99 --param
> vect-epilogues-nomask=0 -fdump-tree-vect-details -fdump-tree-widening_mul" }
> */
> +
> +#include <limits.h>
> +#include <stdint.h>
> +
> +typedef int8_t elem_t;
> +
> +__attribute__ ((noipa))
> +elem_t
> +foo2 (elem_t *buf, int len)
> +{
> + elem_t x = 0;
> +
> + for (int i = 0; i < len; i++)
> + x += (elem_t) i * buf[i];
> +
> + return x;
> +}
> +
> +static elem_t
> +reference (elem_t *buf, int len)
> +{
> + elem_t x = 0;
> +
> +#pragma GCC novector
> + for (int i = 0; i < len; i++)
> + x += (elem_t) i * buf[i];
> +
> + return x;
> +}
> +
> +int
> +main (void)
> +{
> + elem_t buf[] = { 1, -2, INT8_MAX, INT8_MIN, 5, -7, 3, -4 };
> + int len = sizeof (buf) / sizeof (buf[0]);
> + elem_t want = reference (buf, len);
> + elem_t got = foo2 (buf, len);
> +
> + if (want != got)
> + __builtin_abort ();
> +
> + return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\.FMA" 1 "widening_mul" { xfail *-*-*
> } } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr122749_2.c
> b/gcc/testsuite/gcc.target/aarch64/pr122749_2.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..f4a70a611176893e9fa55d8bc1826805ed5d966d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr122749_2.c
> @@ -0,0 +1,48 @@
> +/* { dg-do run { target arm_v8_neon_hw } } */
> +/* { dg-additional-options "-Ofast -std=gnu99 --param
> vect-epilogues-nomask=0 -fdump-tree-vect-details -fdump-tree-widening_mul" }
> */
> +
> +#include <limits.h>
> +#include <stdint.h>
> +
> +typedef int16_t elem_t;
> +
> +__attribute__ ((noipa))
> +elem_t
> +foo2 (elem_t *buf, int len)
> +{
> + elem_t x = 0;
> +
> + for (int i = 0; i < len; i++)
> + x += (elem_t) i * buf[i];
> +
> + return x;
> +}
> +
> +static elem_t
> +reference (elem_t *buf, int len)
> +{
> + elem_t x = 0;
> +
> +#pragma GCC novector
> + for (int i = 0; i < len; i++)
> + x += (elem_t) i * buf[i];
> +
> + return x;
> +}
> +
> +int
> +main (void)
> +{
> + elem_t buf[] = { 1, -2, INT16_MAX, INT16_MIN, 5, -7, 3, -4 };
> + int len = sizeof (buf) / sizeof (buf[0]);
> + elem_t want = reference (buf, len);
> + elem_t got = foo2 (buf, len);
> +
> + if (want != got)
> + __builtin_abort ();
> +
> + return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\.FMA" 1 "widening_mul" { xfail *-*-*
> } } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr122749_3.c
> b/gcc/testsuite/gcc.target/aarch64/pr122749_3.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..61bcd30be2b47f482e8b3f0a024b2a1d51c4fda7
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr122749_3.c
> @@ -0,0 +1,48 @@
> +/* { dg-do run { target arm_v8_neon_hw } } */
> +/* { dg-additional-options "-Ofast -std=gnu99 --param
> vect-epilogues-nomask=0 -fdump-tree-vect-details -fdump-tree-widening_mul" }
> */
> +
> +#include <limits.h>
> +#include <stdint.h>
> +
> +typedef int32_t elem_t;
> +
> +__attribute__ ((noipa))
> +elem_t
> +foo2 (elem_t *buf, int len)
> +{
> + elem_t x = 0;
> +
> + for (int i = 0; i < len; i++)
> + x += (elem_t) i * buf[i];
> +
> + return x;
> +}
> +
> +static elem_t
> +reference (elem_t *buf, int len)
> +{
> + elem_t x = 0;
> +
> +#pragma GCC novector
> + for (int i = 0; i < len; i++)
> + x += (elem_t) i * buf[i];
> +
> + return x;
> +}
> +
> +int
> +main (void)
> +{
> + elem_t buf[] = { 1, -2, INT32_MAX, INT32_MIN, 5, -7, 3, -4 };
> + int len = sizeof (buf) / sizeof (buf[0]);
> + elem_t want = reference (buf, len);
> + elem_t got = foo2 (buf, len);
> +
> + if (want != got)
> + __builtin_abort ();
> +
> + return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\.FMA" 1 "widening_mul" { xfail *-*-*
> } } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr122749_4.c
> b/gcc/testsuite/gcc.target/aarch64/pr122749_4.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..6089716b0ca7498f9b8089f1b72d2968b1c2ee76
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr122749_4.c
> @@ -0,0 +1,45 @@
> +/* { dg-do run { target arm_v8_neon_hw } } */
> +/* { dg-additional-options "-Ofast -std=gnu99 --param
> vect-epilogues-nomask=0 -fdump-tree-vect-details -fdump-tree-widening_mul" }
> */
> +
> +typedef float elem_t;
> +
> +__attribute__ ((noipa))
> +elem_t
> +foo2 (elem_t *buf, int len)
> +{
> + elem_t x = 0;
> +
> + for (int i = 0; i < len; i++)
> + x += (elem_t) i * buf[i];
> +
> + return x;
> +}
> +
> +static elem_t
> +reference (elem_t *buf, int len)
> +{
> + elem_t x = 0;
> +
> +#pragma GCC novector
> + for (int i = 0; i < len; i++)
> + x += (elem_t) i * buf[i];
> +
> + return x;
> +}
> +
> +int
> +main (void)
> +{
> + elem_t buf[] = { 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f };
> + int len = sizeof (buf) / sizeof (buf[0]);
> + elem_t want = reference (buf, len);
> + elem_t got = foo2 (buf, len);
> +
> + if (want != got)
> + __builtin_abort ();
> +
> + return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\\.FMA" 4 "widening_mul" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr122749_5.c
> b/gcc/testsuite/gcc.target/aarch64/pr122749_5.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..562dc5be861762272ea8d23b8304e1abb439e20f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr122749_5.c
> @@ -0,0 +1,45 @@
> +/* { dg-do run { target arm_v8_neon_hw } } */
> +/* { dg-additional-options "-Ofast -std=gnu99 --param
> vect-epilogues-nomask=0 -fdump-tree-vect-details -fdump-tree-widening_mul" }
> */
> +
> +typedef double elem_t;
> +
> +__attribute__ ((noipa))
> +elem_t
> +foo2 (elem_t *buf, int len)
> +{
> + elem_t x = 0;
> +
> + for (int i = 0; i < len; i++)
> + x += (elem_t) i * buf[i];
> +
> + return x;
> +}
> +
> +static elem_t
> +reference (elem_t *buf, int len)
> +{
> + elem_t x = 0;
> +
> +#pragma GCC novector
> + for (int i = 0; i < len; i++)
> + x += (elem_t) i * buf[i];
> +
> + return x;
> +}
> +
> +int
> +main (void)
> +{
> + elem_t buf[] = { 1.0, 2.0, 1.0, 2.0, 1.0, 2.0 };
> + int len = sizeof (buf) / sizeof (buf[0]);
> + elem_t want = reference (buf, len);
> + elem_t got = foo2 (buf, len);
> +
> + if (want != got)
> + __builtin_abort ();
> +
> + return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\\.FMA" 2 "widening_mul" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr122749_6.c
> b/gcc/testsuite/gcc.target/aarch64/pr122749_6.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..3e51c5e22a18a9a3acd2416c3ba72496c9621adf
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr122749_6.c
> @@ -0,0 +1,45 @@
> +/* { dg-do run { target arm_v8_neon_hw } } */
> +/* { dg-additional-options "-Ofast -std=gnu99 --param
> vect-epilogues-nomask=0 -fwrapv -fdump-tree-vect-details
> -fdump-tree-widening_mul" } */
> +
> +typedef float elem_t;
> +
> +__attribute__ ((noipa))
> +elem_t
> +foo2 (elem_t *buf, int len)
> +{
> + elem_t x = 0;
> +
> + for (int i = 0; i < len; i++)
> + x += (elem_t) i * buf[i];
> +
> + return x;
> +}
> +
> +static elem_t
> +reference (elem_t *buf, int len)
> +{
> + elem_t x = 0;
> +
> +#pragma GCC novector
> + for (int i = 0; i < len; i++)
> + x += (elem_t) i * buf[i];
> +
> + return x;
> +}
> +
> +int
> +main (void)
> +{
> + elem_t buf[] = { 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f };
> + int len = sizeof (buf) / sizeof (buf[0]);
> + elem_t want = reference (buf, len);
> + elem_t got = foo2 (buf, len);
> +
> + if (want != got)
> + __builtin_abort ();
> +
> + return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\\.FMA" 4 "widening_mul" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr122749_8.c
> b/gcc/testsuite/gcc.target/aarch64/pr122749_8.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..6aa729c13d1616273d579077253d3fcdf55cc555
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr122749_8.c
> @@ -0,0 +1,48 @@
> +/* { dg-do run { target arm_v8_neon_hw } } */
> +/* { dg-additional-options "-Ofast -std=gnu99 --param
> vect-epilogues-nomask=0 -fdump-tree-vect-details -fdump-tree-widening_mul" }
> */
> +
> +#include <limits.h>
> +#include <stdint.h>
> +
> +typedef uint8_t elem_t;
> +
> +__attribute__ ((noipa))
> +elem_t
> +foo2 (elem_t *buf, int len)
> +{
> + elem_t x = 0;
> +
> + for (int i = 0; i < len; i++)
> + x += (elem_t) i * buf[i];
> +
> + return x;
> +}
> +
> +static elem_t
> +reference (elem_t *buf, int len)
> +{
> + elem_t x = 0;
> +
> +#pragma GCC novector
> + for (int i = 0; i < len; i++)
> + x += (elem_t) i * buf[i];
> +
> + return x;
> +}
> +
> +int
> +main (void)
> +{
> + elem_t buf[] = { 1, 2, UINT8_MAX, 7, 0, UINT8_MAX, 5, 9 };
> + int len = sizeof (buf) / sizeof (buf[0]);
> + elem_t want = reference (buf, len);
> + elem_t got = foo2 (buf, len);
> +
> + if (want != got)
> + __builtin_abort ();
> +
> + return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\.FMA" 1 "widening_mul" { xfail *-*-*
> } } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr122749_9.c
> b/gcc/testsuite/gcc.target/aarch64/pr122749_9.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..d987a9936afb2cb4ba19e62736fa4ed171669e25
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr122749_9.c
> @@ -0,0 +1,48 @@
> +/* { dg-do run { target arm_v8_neon_hw } } */
> +/* { dg-additional-options "-Ofast -std=gnu99 --param
> vect-epilogues-nomask=0 -fdump-tree-vect-details -fdump-tree-widening_mul" }
> */
> +
> +#include <limits.h>
> +#include <stdint.h>
> +
> +typedef uint16_t elem_t;
> +
> +__attribute__ ((noipa))
> +elem_t
> +foo2 (elem_t *buf, int len)
> +{
> + elem_t x = 0;
> +
> + for (int i = 0; i < len; i++)
> + x += (elem_t) i * buf[i];
> +
> + return x;
> +}
> +
> +static elem_t
> +reference (elem_t *buf, int len)
> +{
> + elem_t x = 0;
> +
> +#pragma GCC novector
> + for (int i = 0; i < len; i++)
> + x += (elem_t) i * buf[i];
> +
> + return x;
> +}
> +
> +int
> +main (void)
> +{
> + elem_t buf[] = { 1, 2, UINT16_MAX, 7, 0, UINT16_MAX, 5, 9 };
> + int len = sizeof (buf) / sizeof (buf[0]);
> + elem_t want = reference (buf, len);
> + elem_t got = foo2 (buf, len);
> +
> + if (want != got)
> + __builtin_abort ();
> +
> + return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\.FMA" 1 "widening_mul" { xfail *-*-*
> } } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_1.c
> b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_1.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..32a36461fbc7bb78048ae68c8dc0bdd81b11a2cd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_1.c
> @@ -0,0 +1,48 @@
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-additional-options "-Ofast -std=gnu99 -fdump-tree-vect-details
> -fdump-tree-widening_mul" } */
> +
> +#include <limits.h>
> +#include <stdint.h>
> +
> +typedef int8_t elem_t;
> +
> +__attribute__ ((noipa))
> +elem_t
> +foo2 (elem_t *buf, int len)
> +{
> + elem_t x = 0;
> +
> + for (int i = 0; i < len; i++)
> + x += (elem_t) i * buf[i];
> +
> + return x;
> +}
> +
> +static elem_t
> +reference (elem_t *buf, int len)
> +{
> + elem_t x = 0;
> +
> +#pragma GCC novector
> + for (int i = 0; i < len; i++)
> + x += (elem_t) i * buf[i];
> +
> + return x;
> +}
> +
> +int
> +main (void)
> +{
> + elem_t buf[] = { 1, -2, INT8_MAX, INT8_MIN, 5, -7, 3, -4 };
> + int len = sizeof (buf) / sizeof (buf[0]);
> + elem_t want = reference (buf, len);
> + elem_t got = foo2 (buf, len);
> +
> + if (want != got)
> + __builtin_abort ();
> +
> + return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_11.c
> b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_11.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..bd160dd0ebf515a3ff3ddd1969303aabf8c03aea
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_11.c
> @@ -0,0 +1,48 @@
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-additional-options "-Ofast -std=gnu99 -fdump-tree-vect-details
> -fdump-tree-widening_mul" } */
> +
> +#include <limits.h>
> +#include <stdint.h>
> +
> +typedef uint8_t elem_t;
> +
> +__attribute__ ((noipa))
> +elem_t
> +foo2 (elem_t *buf, int len)
> +{
> + elem_t x = 0;
> +
> + for (int i = 0; i < len; i++)
> + x += (elem_t) i * buf[i];
> +
> + return x;
> +}
> +
> +static elem_t
> +reference (elem_t *buf, int len)
> +{
> + elem_t x = 0;
> +
> +#pragma GCC novector
> + for (int i = 0; i < len; i++)
> + x += (elem_t) i * buf[i];
> +
> + return x;
> +}
> +
> +int
> +main (void)
> +{
> + elem_t buf[] = { 1, 2, UINT8_MAX, 7, 0, UINT8_MAX, 5, 9 };
> + int len = sizeof (buf) / sizeof (buf[0]);
> + elem_t want = reference (buf, len);
> + elem_t got = foo2 (buf, len);
> +
> + if (want != got)
> + __builtin_abort ();
> +
> + return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_12.c
> b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_12.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..8f0198ce42600b0fe92bf483123ad1cb71ff9f24
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_12.c
> @@ -0,0 +1,48 @@
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-additional-options "-Ofast -std=gnu99 -fdump-tree-vect-details
> -fdump-tree-widening_mul" } */
> +
> +#include <limits.h>
> +#include <stdint.h>
> +
> +typedef uint16_t elem_t;
> +
> +__attribute__ ((noipa))
> +elem_t
> +foo2 (elem_t *buf, int len)
> +{
> + elem_t x = 0;
> +
> + for (int i = 0; i < len; i++)
> + x += (elem_t) i * buf[i];
> +
> + return x;
> +}
> +
> +static elem_t
> +reference (elem_t *buf, int len)
> +{
> + elem_t x = 0;
> +
> +#pragma GCC novector
> + for (int i = 0; i < len; i++)
> + x += (elem_t) i * buf[i];
> +
> + return x;
> +}
> +
> +int
> +main (void)
> +{
> + elem_t buf[] = { 1, 2, UINT16_MAX, 7, 0, UINT16_MAX, 5, 9 };
> + int len = sizeof (buf) / sizeof (buf[0]);
> + elem_t want = reference (buf, len);
> + elem_t got = foo2 (buf, len);
> +
> + if (want != got)
> + __builtin_abort ();
> +
> + return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_13.c
> b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_13.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..218afde13984fc64755d3c4567a05a33b5485411
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_13.c
> @@ -0,0 +1,48 @@
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-additional-options "-Ofast -std=gnu99 -fdump-tree-vect-details
> -fdump-tree-widening_mul" } */
> +
> +#include <limits.h>
> +#include <stdint.h>
> +
> +typedef uint32_t elem_t;
> +
> +__attribute__ ((noipa))
> +elem_t
> +foo2 (elem_t *buf, int len)
> +{
> + elem_t x = 0;
> +
> + for (int i = 0; i < len; i++)
> + x += (elem_t) i * buf[i];
> +
> + return x;
> +}
> +
> +static elem_t
> +reference (elem_t *buf, int len)
> +{
> + elem_t x = 0;
> +
> +#pragma GCC novector
> + for (int i = 0; i < len; i++)
> + x += (elem_t) i * buf[i];
> +
> + return x;
> +}
> +
> +int
> +main (void)
> +{
> + elem_t buf[] = { 1, 2, UINT32_MAX, 7, 0, UINT32_MAX, 5, 9 };
> + int len = sizeof (buf) / sizeof (buf[0]);
> + elem_t want = reference (buf, len);
> + elem_t got = foo2 (buf, len);
> +
> + if (want != got)
> + __builtin_abort ();
> +
> + return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_14.c
> b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_14.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..1587628757e28f66dfd515e191ef04331c549434
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_14.c
> @@ -0,0 +1,48 @@
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-additional-options "-Ofast -std=gnu99 -fdump-tree-vect-details
> -fdump-tree-widening_mul" } */
> +
> +#include <limits.h>
> +#include <stdint.h>
> +
> +typedef uint64_t elem_t;
> +
> +__attribute__ ((noipa))
> +elem_t
> +foo2 (elem_t *buf, int len)
> +{
> + elem_t x = 0;
> +
> + for (int i = 0; i < len; i++)
> + x += (elem_t) i * buf[i];
> +
> + return x;
> +}
> +
> +static elem_t
> +reference (elem_t *buf, int len)
> +{
> + elem_t x = 0;
> +
> +#pragma GCC novector
> + for (int i = 0; i < len; i++)
> + x += (elem_t) i * buf[i];
> +
> + return x;
> +}
> +
> +int
> +main (void)
> +{
> + elem_t buf[] = { 1, 2, UINT64_MAX, 7, 0, UINT64_MAX, 5, 9 };
> + int len = sizeof (buf) / sizeof (buf[0]);
> + elem_t want = reference (buf, len);
> + elem_t got = foo2 (buf, len);
> +
> + if (want != got)
> + __builtin_abort ();
> +
> + return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_2.c
> b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_2.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..0f5918a9023521b06ac20ef922b025dc6a1e8f01
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_2.c
> @@ -0,0 +1,48 @@
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-additional-options "-Ofast -std=gnu99 -fdump-tree-vect-details
> -fdump-tree-widening_mul" } */
> +
> +#include <limits.h>
> +#include <stdint.h>
> +
> +typedef int16_t elem_t;
> +
> +__attribute__ ((noipa))
> +elem_t
> +foo2 (elem_t *buf, int len)
> +{
> + elem_t x = 0;
> +
> + for (int i = 0; i < len; i++)
> + x += (elem_t) i * buf[i];
> +
> + return x;
> +}
> +
> +static elem_t
> +reference (elem_t *buf, int len)
> +{
> + elem_t x = 0;
> +
> +#pragma GCC novector
> + for (int i = 0; i < len; i++)
> + x += (elem_t) i * buf[i];
> +
> + return x;
> +}
> +
> +int
> +main (void)
> +{
> + elem_t buf[] = { 1, -2, INT16_MAX, INT16_MIN, 5, -7, 3, -4 };
> + int len = sizeof (buf) / sizeof (buf[0]);
> + elem_t want = reference (buf, len);
> + elem_t got = foo2 (buf, len);
> +
> + if (want != got)
> + __builtin_abort ();
> +
> + return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_3.c
> b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_3.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..92548cb6ec4fdc4a3d133669fb914c5ab9a103ba
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_3.c
> @@ -0,0 +1,48 @@
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-additional-options "-Ofast -std=gnu99 -fdump-tree-vect-details
> -fdump-tree-widening_mul" } */
> +
> +#include <limits.h>
> +#include <stdint.h>
> +
> +typedef int32_t elem_t;
> +
> +__attribute__ ((noipa))
> +elem_t
> +foo2 (elem_t *buf, int len)
> +{
> + elem_t x = 0;
> +
> + for (int i = 0; i < len; i++)
> + x += (elem_t) i * buf[i];
> +
> + return x;
> +}
> +
> +static elem_t
> +reference (elem_t *buf, int len)
> +{
> + elem_t x = 0;
> +
> +#pragma GCC novector
> + for (int i = 0; i < len; i++)
> + x += (elem_t) i * buf[i];
> +
> + return x;
> +}
> +
> +int
> +main (void)
> +{
> + elem_t buf[] = { 1, -2, INT32_MAX, INT32_MIN, 5, -7, 3, -4 };
> + int len = sizeof (buf) / sizeof (buf[0]);
> + elem_t want = reference (buf, len);
> + elem_t got = foo2 (buf, len);
> +
> + if (want != got)
> + __builtin_abort ();
> +
> + return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_4.c
> b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_4.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..6085a18bab7f2ae0e5855a982e186f831705bf52
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_4.c
> @@ -0,0 +1,48 @@
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-additional-options "-Ofast -std=gnu99 -fdump-tree-vect-details
> -fdump-tree-widening_mul" } */
> +
> +#include <limits.h>
> +#include <stdint.h>
> +
> +typedef int64_t elem_t;
> +
> +__attribute__ ((noipa))
> +elem_t
> +foo2 (elem_t *buf, int len)
> +{
> + elem_t x = 0;
> +
> + for (int i = 0; i < len; i++)
> + x += (elem_t) i * buf[i];
> +
> + return x;
> +}
> +
> +static elem_t
> +reference (elem_t *buf, int len)
> +{
> + elem_t x = 0;
> +
> +#pragma GCC novector
> + for (int i = 0; i < len; i++)
> + x += (elem_t) i * buf[i];
> +
> + return x;
> +}
> +
> +int
> +main (void)
> +{
> + elem_t buf[] = { 1, -2, INT64_MAX, INT64_MIN, 5, -7, 3, -4 };
> + int len = sizeof (buf) / sizeof (buf[0]);
> + elem_t want = reference (buf, len);
> + elem_t got = foo2 (buf, len);
> +
> + if (want != got)
> + __builtin_abort ();
> +
> + return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_5.c
> b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_5.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..d61b91bb06dc0a035bd6adfabccc580eac7f78a6
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_5.c
> @@ -0,0 +1,45 @@
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-additional-options "-Ofast -std=gnu99 -fdump-tree-vect-details
> -fdump-tree-widening_mul" } */
> +
> +typedef float elem_t;
> +
> +__attribute__ ((noipa))
> +elem_t
> +foo2 (elem_t *buf, int len)
> +{
> + elem_t x = 0;
> +
> + for (int i = 0; i < len; i++)
> + x += (elem_t) i * buf[i];
> +
> + return x;
> +}
> +
> +static elem_t
> +reference (elem_t *buf, int len)
> +{
> + elem_t x = 0;
> +
> +#pragma GCC novector
> + for (int i = 0; i < len; i++)
> + x += (elem_t) i * buf[i];
> +
> + return x;
> +}
> +
> +int
> +main (void)
> +{
> + elem_t buf[] = { 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f };
> + int len = sizeof (buf) / sizeof (buf[0]);
> + elem_t want = reference (buf, len);
> + elem_t got = foo2 (buf, len);
> +
> + if (want != got)
> + __builtin_abort ();
> +
> + return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_6.c
> b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_6.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..7598f7a28bcf1745ce672c0bab22fec0fda37a3f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_6.c
> @@ -0,0 +1,45 @@
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-additional-options "-Ofast -std=gnu99 -fdump-tree-vect-details
> -fdump-tree-widening_mul" } */
> +
> +typedef double elem_t;
> +
> +__attribute__ ((noipa))
> +elem_t
> +foo2 (elem_t *buf, int len)
> +{
> + elem_t x = 0;
> +
> + for (int i = 0; i < len; i++)
> + x += (elem_t) i * buf[i];
> +
> + return x;
> +}
> +
> +static elem_t
> +reference (elem_t *buf, int len)
> +{
> + elem_t x = 0;
> +
> +#pragma GCC novector
> + for (int i = 0; i < len; i++)
> + x += (elem_t) i * buf[i];
> +
> + return x;
> +}
> +
> +int
> +main (void)
> +{
> + elem_t buf[] = { 1.0, 2.0, 1.0, 2.0, 1.0, 2.0 };
> + int len = sizeof (buf) / sizeof (buf[0]);
> + elem_t want = reference (buf, len);
> + elem_t got = foo2 (buf, len);
> +
> + if (want != got)
> + __builtin_abort ();
> +
> + return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_8.c
> b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_8.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..e1c337d44ead96d868d71f0ae54960f2189e499e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_8.c
> @@ -0,0 +1,45 @@
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-additional-options "-Ofast -std=gnu99 -fwrapv
> -fdump-tree-vect-details -fdump-tree-widening_mul" } */
> +
> +typedef float elem_t;
> +
> +__attribute__ ((noipa))
> +elem_t
> +foo2 (elem_t *buf, int len)
> +{
> + elem_t x = 0;
> +
> + for (int i = 0; i < len; i++)
> + x += (elem_t) i * buf[i];
> +
> + return x;
> +}
> +
> +static elem_t
> +reference (elem_t *buf, int len)
> +{
> + elem_t x = 0;
> +
> +#pragma GCC novector
> + for (int i = 0; i < len; i++)
> + x += (elem_t) i * buf[i];
> +
> + return x;
> +}
> +
> +int
> +main (void)
> +{
> + elem_t buf[] = { 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f };
> + int len = sizeof (buf) / sizeof (buf[0]);
> + elem_t want = reference (buf, len);
> + elem_t got = foo2 (buf, len);
> +
> + if (want != got)
> + __builtin_abort ();
> +
> + return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_9.c
> b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_9.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..13d962e2130f986910f1a94489e4014761e917b5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_9.c
> @@ -0,0 +1,45 @@
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-additional-options "-Ofast -std=gnu99 -fwrapv
> -fdump-tree-vect-details -fdump-tree-widening_mul" } */
> +
> +typedef double elem_t;
> +
> +__attribute__ ((noipa))
> +elem_t
> +foo2 (elem_t *buf, int len)
> +{
> + elem_t x = 0;
> +
> + for (int i = 0; i < len; i++)
> + x += (elem_t) i * buf[i];
> +
> + return x;
> +}
> +
> +static elem_t
> +reference (elem_t *buf, int len)
> +{
> + elem_t x = 0;
> +
> +#pragma GCC novector
> + for (int i = 0; i < len; i++)
> + x += (elem_t) i * buf[i];
> +
> + return x;
> +}
> +
> +int
> +main (void)
> +{
> + elem_t buf[] = { 1.0, 2.0, 1.0, 2.0, 1.0, 2.0 };
> + int len = sizeof (buf) / sizeof (buf[0]);
> + elem_t want = reference (buf, len);
> + elem_t got = foo2 (buf, len);
> +
> + if (want != got)
> + __builtin_abort ();
> +
> + return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
> diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc
> index
> 02e194ae06f34957194c4e4f2eb4fdb3ef72d2f5..aa12221a2b2b584fa10fe378e16115128408ee3e
> 100644
> --- a/gcc/tree-ssa-math-opts.cc
> +++ b/gcc/tree-ssa-math-opts.cc
> @@ -3120,6 +3120,30 @@ convert_mult_to_fma_1 (tree mul_result, tree op1, tree
> op2)
> if (is_gimple_debug (use_stmt))
> continue;
>
> + /* If the use is a type convert, look further into it if the operations
> + are the same under two's complement. */
> + tree lhs_type;
> + if (gimple_assign_cast_p (use_stmt)
> + && (lhs_type = TREE_TYPE (gimple_get_lhs (use_stmt)))
> + && TREE_CODE (lhs_type) == TREE_CODE (TREE_TYPE (op1))
Strict equality is going to be brittle; what are you trying to protect against
with this?
> + && (TYPE_UNSIGNED (lhs_type)
> + || (ANY_INTEGRAL_TYPE_P (lhs_type)
> + && !TYPE_OVERFLOW_WRAPS (lhs_type)))
> + && (element_precision (lhs_type)
> + == element_precision (gimple_assign_rhs1 (use_stmt))))
I think you want to simplify this to tree_nop_conversion_p and make sure to
perform the FMA in a wrapping type if you looked through a conversion - that
would also allow the reverse sign conversion case.
> + {
> + tree cast_lhs = gimple_get_lhs (use_stmt);
> + gimple *tmp_use;
> + use_operand_p tmp_use_p;
> + if (single_imm_use (cast_lhs, &tmp_use_p, &tmp_use))
> + {
> + use_stmt = tmp_use;
> + result = cast_lhs;
> + gsi_remove (&gsi, true);
release_defs missing?
> + gsi = gsi_for_stmt (use_stmt);
> + }
> + }
> +
> if (is_gimple_assign (use_stmt)
> && gimple_assign_rhs_code (use_stmt) == NEGATE_EXPR)
> {
> @@ -3156,6 +3180,11 @@ convert_mult_to_fma_1 (tree mul_result, tree op1, tree
> op2)
> if (negate_p)
> mulop1 = gimple_build (&seq, NEGATE_EXPR, type, mulop1);
>
> + /* Ensure all the operands are of the same type.  Use the type of the
> + addend as that's the statement being replaced. */
> + op2 = gimple_convert (&seq, TREE_TYPE (addop), op2);
> + mulop1 = gimple_convert (&seq, TREE_TYPE (addop), mulop1);
> +
In your code example I see back-and-forth conversions because of the use of
gimple_convert with a 'seq' - if we used the 'gsi' overloads, that would be
avoided, since they also match-and-simplify against other stmts in the IL.

Thanks,
Richard.
> if (seq)
> gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
>
> @@ -3419,6 +3448,25 @@ convert_mult_to_fma (gimple *mul_stmt, tree op1, tree
> op2,
> if (is_gimple_debug (use_stmt))
> continue;
>
> + /* If the use is a type convert, look further into it if the operations
> + are the same under two's complement. */
> + tree lhs_type;
> + if (gimple_assign_cast_p (use_stmt)
> + && (lhs_type = TREE_TYPE (gimple_get_lhs (use_stmt)))
> + && TREE_CODE (lhs_type) == TREE_CODE (TREE_TYPE (op1))
> + && (TYPE_UNSIGNED (lhs_type)
> + || (ANY_INTEGRAL_TYPE_P (lhs_type)
> + && !TYPE_OVERFLOW_WRAPS (lhs_type)))
> + && (element_precision (lhs_type)
> + == element_precision (gimple_assign_rhs1 (use_stmt))))
> + {
> + tree cast_lhs = gimple_get_lhs (use_stmt);
> + gimple *tmp_use;
> + use_operand_p tmp_use_p;
> + if (single_imm_use (cast_lhs, &tmp_use_p, &tmp_use))
> + use_stmt = tmp_use;
> + }
> +
> /* For now restrict this operations to single basic blocks. In theory
> we would want to support sinking the multiplication in
> m = a*b;
>
>
> --