The following example

/* Return the sum of i * buf[i] over the first LEN elements of BUF.
   Returns 0 when LEN is zero or negative, since the loop does not run.  */
int foo2 (char *buf, int len) {
    /* The accumulator must start at zero: reading an uninitialized
       automatic variable is undefined behavior.  */
    int x = 0;
    for (int i = 0; i < len; i++) {
        x += (int) i * buf[i];
    }
    return x;
}

compiled with -O3 -mcpu=neoverse-v2 used to generate a 4x unrolled MLA sequence

        mla     z29.s, p7/m, z2.s, z0.s
        mla     z27.s, p7/m, z4.s, z26.s
        mla     z30.s, p7/m, z1.s, z0.s
        mla     z28.s, p7/m, z23.s, z3.s

but now generates MUL + ADD

        mul     z2.s, z2.s, z1.s
        mul     z4.s, z4.s, z26.s
        mul     z1.s, z24.s, z1.s
        mul     z3.s, z23.s, z3.s
        add     z29.s, z2.s, z29.s
        add     z30.s, z1.s, z30.s
        add     z28.s, z3.s, z28.s
        add     z0.s, z4.s, z0.s

This is because, since the fix in r16-3328-g3182e95eda4, we now insert casts
around the reduction addend.  This causes convert_mult_to_fma to miss the
mul + add sequence.

This patch teaches it to look through the casts around the operands and to only
accept a conversion if it is essentially just a sign-changing operation.  If the
operation is being converted from unsigned to signed, it additionally requires
that we're not using a type where the overflow wraps.

Concretely, it converts:

  # vect_vec_iv_.13_49 = PHI <_50(5), { 0, 1, 2, ... }(4)>
  vect__3.8_38 = MEM <vector([4,4]) char> [(char *)_16];
  vect__4.12_45 = (vector([4,4]) int) vect__3.8_38;
  vect__5.14_54 = vect__4.12_45 * vect_vec_iv_.13_49;
  vect_x_12.17_62 = VIEW_CONVERT_EXPR<vector([4,4]) unsigned int>(vect__5.14_54);
  vect_x_12.17_63 = VIEW_CONVERT_EXPR<vector([4,4]) unsigned int>(vect_x_16.15_58);
  vect_x_12.17_64 = vect_x_12.17_62 + vect_x_12.17_63;
  vect_x_12.16_65 = VIEW_CONVERT_EXPR<vector([4,4]) int>(vect_x_12.17_64);

into:

  # vect_vec_iv_.13_49 = PHI <_50(5), { 0, 1, 2, ... }(4)>
  vect__3.8_38 = MEM <vector([4,4]) charD.8> [(charD.8 *)_16];
  vect__4.12_45 = (vector([4,4]) intD.7) vect__3.8_38;
  vect_x_12.17_63 = VIEW_CONVERT_EXPR<vector([4,4]) unsigned int>(vect_x_16.15_58);
  _2 = (vector([4,4]) unsigned int) vect_vec_iv_.13_49;
  _1 = (vector([4,4]) unsigned int) vect__4.12_45;
  vect_x_12.17_64 = .FMA (_1, _2, vect_x_12.17_63);
  vect_x_12.16_65 = VIEW_CONVERT_EXPR<vector([4,4]) intD.7>(vect_x_12.17_64);

thus restoring FMAs on reductions.

Bootstrapped and regtested on aarch64-none-linux-gnu,
arm-none-linux-gnueabihf and x86_64-pc-linux-gnu
(-m32 and -m64) with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

        PR tree-optimization/122749
        * tree-ssa-math-opts.cc (convert_mult_to_fma_1, convert_mult_to_fma):
        Unwrap converts around addend.

gcc/testsuite/ChangeLog:

        PR tree-optimization/122749
        * gcc.target/aarch64/pr122749_1.c: New test.
        * gcc.target/aarch64/pr122749_2.c: New test.
        * gcc.target/aarch64/pr122749_3.c: New test.
        * gcc.target/aarch64/pr122749_4.c: New test.
        * gcc.target/aarch64/pr122749_5.c: New test.
        * gcc.target/aarch64/pr122749_6.c: New test.
        * gcc.target/aarch64/pr122749_8.c: New test.
        * gcc.target/aarch64/pr122749_9.c: New test.
        * gcc.target/aarch64/sve/pr122749_1.c: New test.
        * gcc.target/aarch64/sve/pr122749_11.c: New test.
        * gcc.target/aarch64/sve/pr122749_12.c: New test.
        * gcc.target/aarch64/sve/pr122749_13.c: New test.
        * gcc.target/aarch64/sve/pr122749_14.c: New test.
        * gcc.target/aarch64/sve/pr122749_2.c: New test.
        * gcc.target/aarch64/sve/pr122749_3.c: New test.
        * gcc.target/aarch64/sve/pr122749_4.c: New test.
        * gcc.target/aarch64/sve/pr122749_5.c: New test.
        * gcc.target/aarch64/sve/pr122749_6.c: New test.
        * gcc.target/aarch64/sve/pr122749_8.c: New test.
        * gcc.target/aarch64/sve/pr122749_9.c: New test.

---
diff --git a/gcc/testsuite/gcc.target/aarch64/pr122749_1.c 
b/gcc/testsuite/gcc.target/aarch64/pr122749_1.c
new file mode 100644
index 
0000000000000000000000000000000000000000..25311fce4e3a79b389cbb750231c1277ccaf0611
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr122749_1.c
@@ -0,0 +1,48 @@
+/* { dg-do run { target arm_v8_neon_hw } } */
+/* { dg-additional-options "-Ofast -std=gnu99 --param vect-epilogues-nomask=0 
-fdump-tree-vect-details -fdump-tree-widening_mul" } */
+
+#include <limits.h>
+#include <stdint.h>
+
+typedef int8_t elem_t;
+
+__attribute__ ((noipa))
+elem_t
+foo2 (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+static elem_t
+reference (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+#pragma GCC novector
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+int
+main (void)
+{
+  elem_t buf[] = { 1, -2, INT8_MAX, INT8_MIN, 5, -7, 3, -4 };
+  int len = sizeof (buf) / sizeof (buf[0]);
+  elem_t want = reference (buf, len);
+  elem_t got = foo2 (buf, len);
+
+  if (want != got)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "\.FMA" 1 "widening_mul" { xfail *-*-* } 
} } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr122749_2.c 
b/gcc/testsuite/gcc.target/aarch64/pr122749_2.c
new file mode 100644
index 
0000000000000000000000000000000000000000..f4a70a611176893e9fa55d8bc1826805ed5d966d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr122749_2.c
@@ -0,0 +1,48 @@
+/* { dg-do run { target arm_v8_neon_hw } } */
+/* { dg-additional-options "-Ofast -std=gnu99 --param vect-epilogues-nomask=0 
-fdump-tree-vect-details -fdump-tree-widening_mul" } */
+
+#include <limits.h>
+#include <stdint.h>
+
+typedef int16_t elem_t;
+
+__attribute__ ((noipa))
+elem_t
+foo2 (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+static elem_t
+reference (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+#pragma GCC novector
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+int
+main (void)
+{
+  elem_t buf[] = { 1, -2, INT16_MAX, INT16_MIN, 5, -7, 3, -4 };
+  int len = sizeof (buf) / sizeof (buf[0]);
+  elem_t want = reference (buf, len);
+  elem_t got = foo2 (buf, len);
+
+  if (want != got)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "\.FMA" 1 "widening_mul" { xfail *-*-* } 
} } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr122749_3.c 
b/gcc/testsuite/gcc.target/aarch64/pr122749_3.c
new file mode 100644
index 
0000000000000000000000000000000000000000..61bcd30be2b47f482e8b3f0a024b2a1d51c4fda7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr122749_3.c
@@ -0,0 +1,48 @@
+/* { dg-do run { target arm_v8_neon_hw } } */
+/* { dg-additional-options "-Ofast -std=gnu99 --param vect-epilogues-nomask=0 
-fdump-tree-vect-details -fdump-tree-widening_mul" } */
+
+#include <limits.h>
+#include <stdint.h>
+
+typedef int32_t elem_t;
+
+__attribute__ ((noipa))
+elem_t
+foo2 (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+static elem_t
+reference (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+#pragma GCC novector
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+int
+main (void)
+{
+  elem_t buf[] = { 1, -2, INT32_MAX, INT32_MIN, 5, -7, 3, -4 };
+  int len = sizeof (buf) / sizeof (buf[0]);
+  elem_t want = reference (buf, len);
+  elem_t got = foo2 (buf, len);
+
+  if (want != got)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "\.FMA" 1 "widening_mul" { xfail *-*-* } 
} } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr122749_4.c 
b/gcc/testsuite/gcc.target/aarch64/pr122749_4.c
new file mode 100644
index 
0000000000000000000000000000000000000000..6089716b0ca7498f9b8089f1b72d2968b1c2ee76
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr122749_4.c
@@ -0,0 +1,45 @@
+/* { dg-do run { target arm_v8_neon_hw } } */
+/* { dg-additional-options "-Ofast -std=gnu99 --param vect-epilogues-nomask=0 
-fdump-tree-vect-details -fdump-tree-widening_mul" } */
+
+typedef float elem_t;
+
+__attribute__ ((noipa))
+elem_t
+foo2 (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+static elem_t
+reference (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+#pragma GCC novector
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+int
+main (void)
+{
+  elem_t buf[] = { 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f };
+  int len = sizeof (buf) / sizeof (buf[0]);
+  elem_t want = reference (buf, len);
+  elem_t got = foo2 (buf, len);
+
+  if (want != got)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "\\.FMA" 4 "widening_mul" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr122749_5.c 
b/gcc/testsuite/gcc.target/aarch64/pr122749_5.c
new file mode 100644
index 
0000000000000000000000000000000000000000..562dc5be861762272ea8d23b8304e1abb439e20f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr122749_5.c
@@ -0,0 +1,45 @@
+/* { dg-do run { target arm_v8_neon_hw } } */
+/* { dg-additional-options "-Ofast -std=gnu99 --param vect-epilogues-nomask=0 
-fdump-tree-vect-details -fdump-tree-widening_mul" } */
+
+typedef double elem_t;
+
+__attribute__ ((noipa))
+elem_t
+foo2 (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+static elem_t
+reference (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+#pragma GCC novector
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+int
+main (void)
+{
+  elem_t buf[] = { 1.0, 2.0, 1.0, 2.0, 1.0, 2.0 };
+  int len = sizeof (buf) / sizeof (buf[0]);
+  elem_t want = reference (buf, len);
+  elem_t got = foo2 (buf, len);
+
+  if (want != got)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "\\.FMA" 2 "widening_mul" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr122749_6.c 
b/gcc/testsuite/gcc.target/aarch64/pr122749_6.c
new file mode 100644
index 
0000000000000000000000000000000000000000..3e51c5e22a18a9a3acd2416c3ba72496c9621adf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr122749_6.c
@@ -0,0 +1,45 @@
+/* { dg-do run { target arm_v8_neon_hw } } */
+/* { dg-additional-options "-Ofast -std=gnu99 --param vect-epilogues-nomask=0 
-fwrapv -fdump-tree-vect-details -fdump-tree-widening_mul" } */
+
+typedef float elem_t;
+
+__attribute__ ((noipa))
+elem_t
+foo2 (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+static elem_t
+reference (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+#pragma GCC novector
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+int
+main (void)
+{
+  elem_t buf[] = { 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f };
+  int len = sizeof (buf) / sizeof (buf[0]);
+  elem_t want = reference (buf, len);
+  elem_t got = foo2 (buf, len);
+
+  if (want != got)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "\\.FMA" 4 "widening_mul" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr122749_8.c 
b/gcc/testsuite/gcc.target/aarch64/pr122749_8.c
new file mode 100644
index 
0000000000000000000000000000000000000000..6aa729c13d1616273d579077253d3fcdf55cc555
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr122749_8.c
@@ -0,0 +1,48 @@
+/* { dg-do run { target arm_v8_neon_hw } } */
+/* { dg-additional-options "-Ofast -std=gnu99 --param vect-epilogues-nomask=0 
-fdump-tree-vect-details -fdump-tree-widening_mul" } */
+
+#include <limits.h>
+#include <stdint.h>
+
+typedef uint8_t elem_t;
+
+__attribute__ ((noipa))
+elem_t
+foo2 (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+static elem_t
+reference (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+#pragma GCC novector
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+int
+main (void)
+{
+  elem_t buf[] = { 1, 2, UINT8_MAX, 7, 0, UINT8_MAX, 5, 9 };
+  int len = sizeof (buf) / sizeof (buf[0]);
+  elem_t want = reference (buf, len);
+  elem_t got = foo2 (buf, len);
+
+  if (want != got)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "\.FMA" 1 "widening_mul" { xfail *-*-* } 
} } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr122749_9.c 
b/gcc/testsuite/gcc.target/aarch64/pr122749_9.c
new file mode 100644
index 
0000000000000000000000000000000000000000..d987a9936afb2cb4ba19e62736fa4ed171669e25
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr122749_9.c
@@ -0,0 +1,48 @@
+/* { dg-do run { target arm_v8_neon_hw } } */
+/* { dg-additional-options "-Ofast -std=gnu99 --param vect-epilogues-nomask=0 
-fdump-tree-vect-details -fdump-tree-widening_mul" } */
+
+#include <limits.h>
+#include <stdint.h>
+
+typedef uint16_t elem_t;
+
+__attribute__ ((noipa))
+elem_t
+foo2 (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+static elem_t
+reference (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+#pragma GCC novector
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+int
+main (void)
+{
+  elem_t buf[] = { 1, 2, UINT16_MAX, 7, 0, UINT16_MAX, 5, 9 };
+  int len = sizeof (buf) / sizeof (buf[0]);
+  elem_t want = reference (buf, len);
+  elem_t got = foo2 (buf, len);
+
+  if (want != got)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "\.FMA" 1 "widening_mul" { xfail *-*-* } 
} } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_1.c
new file mode 100644
index 
0000000000000000000000000000000000000000..32a36461fbc7bb78048ae68c8dc0bdd81b11a2cd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_1.c
@@ -0,0 +1,48 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-additional-options "-Ofast -std=gnu99 -fdump-tree-vect-details 
-fdump-tree-widening_mul" } */
+
+#include <limits.h>
+#include <stdint.h>
+
+typedef int8_t elem_t;
+
+__attribute__ ((noipa))
+elem_t
+foo2 (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+static elem_t
+reference (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+#pragma GCC novector
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+int
+main (void)
+{
+  elem_t buf[] = { 1, -2, INT8_MAX, INT8_MIN, 5, -7, 3, -4 };
+  int len = sizeof (buf) / sizeof (buf[0]);
+  elem_t want = reference (buf, len);
+  elem_t got = foo2 (buf, len);
+
+  if (want != got)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_11.c 
b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_11.c
new file mode 100644
index 
0000000000000000000000000000000000000000..bd160dd0ebf515a3ff3ddd1969303aabf8c03aea
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_11.c
@@ -0,0 +1,48 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-additional-options "-Ofast -std=gnu99 -fdump-tree-vect-details 
-fdump-tree-widening_mul" } */
+
+#include <limits.h>
+#include <stdint.h>
+
+typedef uint8_t elem_t;
+
+__attribute__ ((noipa))
+elem_t
+foo2 (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+static elem_t
+reference (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+#pragma GCC novector
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+int
+main (void)
+{
+  elem_t buf[] = { 1, 2, UINT8_MAX, 7, 0, UINT8_MAX, 5, 9 };
+  int len = sizeof (buf) / sizeof (buf[0]);
+  elem_t want = reference (buf, len);
+  elem_t got = foo2 (buf, len);
+
+  if (want != got)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_12.c 
b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_12.c
new file mode 100644
index 
0000000000000000000000000000000000000000..8f0198ce42600b0fe92bf483123ad1cb71ff9f24
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_12.c
@@ -0,0 +1,48 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-additional-options "-Ofast -std=gnu99 -fdump-tree-vect-details 
-fdump-tree-widening_mul" } */
+
+#include <limits.h>
+#include <stdint.h>
+
+typedef uint16_t elem_t;
+
+__attribute__ ((noipa))
+elem_t
+foo2 (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+static elem_t
+reference (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+#pragma GCC novector
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+int
+main (void)
+{
+  elem_t buf[] = { 1, 2, UINT16_MAX, 7, 0, UINT16_MAX, 5, 9 };
+  int len = sizeof (buf) / sizeof (buf[0]);
+  elem_t want = reference (buf, len);
+  elem_t got = foo2 (buf, len);
+
+  if (want != got)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_13.c 
b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_13.c
new file mode 100644
index 
0000000000000000000000000000000000000000..218afde13984fc64755d3c4567a05a33b5485411
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_13.c
@@ -0,0 +1,48 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-additional-options "-Ofast -std=gnu99 -fdump-tree-vect-details 
-fdump-tree-widening_mul" } */
+
+#include <limits.h>
+#include <stdint.h>
+
+typedef uint32_t elem_t;
+
+__attribute__ ((noipa))
+elem_t
+foo2 (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+static elem_t
+reference (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+#pragma GCC novector
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+int
+main (void)
+{
+  elem_t buf[] = { 1, 2, UINT32_MAX, 7, 0, UINT32_MAX, 5, 9 };
+  int len = sizeof (buf) / sizeof (buf[0]);
+  elem_t want = reference (buf, len);
+  elem_t got = foo2 (buf, len);
+
+  if (want != got)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_14.c 
b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_14.c
new file mode 100644
index 
0000000000000000000000000000000000000000..1587628757e28f66dfd515e191ef04331c549434
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_14.c
@@ -0,0 +1,48 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-additional-options "-Ofast -std=gnu99 -fdump-tree-vect-details 
-fdump-tree-widening_mul" } */
+
+#include <limits.h>
+#include <stdint.h>
+
+typedef uint64_t elem_t;
+
+__attribute__ ((noipa))
+elem_t
+foo2 (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+static elem_t
+reference (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+#pragma GCC novector
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+int
+main (void)
+{
+  elem_t buf[] = { 1, 2, UINT64_MAX, 7, 0, UINT64_MAX, 5, 9 };
+  int len = sizeof (buf) / sizeof (buf[0]);
+  elem_t want = reference (buf, len);
+  elem_t got = foo2 (buf, len);
+
+  if (want != got)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_2.c 
b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_2.c
new file mode 100644
index 
0000000000000000000000000000000000000000..0f5918a9023521b06ac20ef922b025dc6a1e8f01
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_2.c
@@ -0,0 +1,48 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-additional-options "-Ofast -std=gnu99 -fdump-tree-vect-details 
-fdump-tree-widening_mul" } */
+
+#include <limits.h>
+#include <stdint.h>
+
+typedef int16_t elem_t;
+
+__attribute__ ((noipa))
+elem_t
+foo2 (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+static elem_t
+reference (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+#pragma GCC novector
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+int
+main (void)
+{
+  elem_t buf[] = { 1, -2, INT16_MAX, INT16_MIN, 5, -7, 3, -4 };
+  int len = sizeof (buf) / sizeof (buf[0]);
+  elem_t want = reference (buf, len);
+  elem_t got = foo2 (buf, len);
+
+  if (want != got)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_3.c 
b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_3.c
new file mode 100644
index 
0000000000000000000000000000000000000000..92548cb6ec4fdc4a3d133669fb914c5ab9a103ba
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_3.c
@@ -0,0 +1,48 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-additional-options "-Ofast -std=gnu99 -fdump-tree-vect-details 
-fdump-tree-widening_mul" } */
+
+#include <limits.h>
+#include <stdint.h>
+
+typedef int32_t elem_t;
+
+__attribute__ ((noipa))
+elem_t
+foo2 (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+static elem_t
+reference (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+#pragma GCC novector
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+int
+main (void)
+{
+  elem_t buf[] = { 1, -2, INT32_MAX, INT32_MIN, 5, -7, 3, -4 };
+  int len = sizeof (buf) / sizeof (buf[0]);
+  elem_t want = reference (buf, len);
+  elem_t got = foo2 (buf, len);
+
+  if (want != got)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_4.c 
b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_4.c
new file mode 100644
index 
0000000000000000000000000000000000000000..6085a18bab7f2ae0e5855a982e186f831705bf52
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_4.c
@@ -0,0 +1,48 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-additional-options "-Ofast -std=gnu99 -fdump-tree-vect-details 
-fdump-tree-widening_mul" } */
+
+#include <limits.h>
+#include <stdint.h>
+
+typedef int64_t elem_t;
+
+__attribute__ ((noipa))
+elem_t
+foo2 (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+static elem_t
+reference (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+#pragma GCC novector
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+int
+main (void)
+{
+  elem_t buf[] = { 1, -2, INT64_MAX, INT64_MIN, 5, -7, 3, -4 };
+  int len = sizeof (buf) / sizeof (buf[0]);
+  elem_t want = reference (buf, len);
+  elem_t got = foo2 (buf, len);
+
+  if (want != got)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_5.c 
b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_5.c
new file mode 100644
index 
0000000000000000000000000000000000000000..d61b91bb06dc0a035bd6adfabccc580eac7f78a6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_5.c
@@ -0,0 +1,45 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-additional-options "-Ofast -std=gnu99 -fdump-tree-vect-details 
-fdump-tree-widening_mul" } */
+
+typedef float elem_t;
+
+__attribute__ ((noipa))
+elem_t
+foo2 (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+static elem_t
+reference (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+#pragma GCC novector
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+int
+main (void)
+{
+  elem_t buf[] = { 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f };
+  int len = sizeof (buf) / sizeof (buf[0]);
+  elem_t want = reference (buf, len);
+  elem_t got = foo2 (buf, len);
+
+  if (want != got)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_6.c 
b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_6.c
new file mode 100644
index 
0000000000000000000000000000000000000000..7598f7a28bcf1745ce672c0bab22fec0fda37a3f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_6.c
@@ -0,0 +1,45 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-additional-options "-Ofast -std=gnu99 -fdump-tree-vect-details 
-fdump-tree-widening_mul" } */
+
+typedef double elem_t;
+
+__attribute__ ((noipa))
+elem_t
+foo2 (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+static elem_t
+reference (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+#pragma GCC novector
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+int
+main (void)
+{
+  elem_t buf[] = { 1.0, 2.0, 1.0, 2.0, 1.0, 2.0 };
+  int len = sizeof (buf) / sizeof (buf[0]);
+  elem_t want = reference (buf, len);
+  elem_t got = foo2 (buf, len);
+
+  if (want != got)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_8.c 
b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_8.c
new file mode 100644
index 
0000000000000000000000000000000000000000..e1c337d44ead96d868d71f0ae54960f2189e499e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_8.c
@@ -0,0 +1,45 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-additional-options "-Ofast -std=gnu99 -fwrapv -fdump-tree-vect-details 
-fdump-tree-widening_mul" } */
+
+typedef float elem_t;
+
+__attribute__ ((noipa))
+elem_t
+foo2 (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+static elem_t
+reference (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+#pragma GCC novector
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+int
+main (void)
+{
+  elem_t buf[] = { 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f };
+  int len = sizeof (buf) / sizeof (buf[0]);
+  elem_t want = reference (buf, len);
+  elem_t got = foo2 (buf, len);
+
+  if (want != got)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_9.c 
b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_9.c
new file mode 100644
index 
0000000000000000000000000000000000000000..13d962e2130f986910f1a94489e4014761e917b5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_9.c
@@ -0,0 +1,45 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-additional-options "-Ofast -std=gnu99 -fwrapv -fdump-tree-vect-details 
-fdump-tree-widening_mul" } */
+
+typedef double elem_t;
+
+__attribute__ ((noipa))
+elem_t
+foo2 (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+static elem_t
+reference (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+#pragma GCC novector
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+int
+main (void)
+{
+  elem_t buf[] = { 1.0, 2.0, 1.0, 2.0, 1.0, 2.0 };
+  int len = sizeof (buf) / sizeof (buf[0]);
+  elem_t want = reference (buf, len);
+  elem_t got = foo2 (buf, len);
+
+  if (want != got)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc
index 
02e194ae06f34957194c4e4f2eb4fdb3ef72d2f5..aa12221a2b2b584fa10fe378e16115128408ee3e
 100644
--- a/gcc/tree-ssa-math-opts.cc
+++ b/gcc/tree-ssa-math-opts.cc
@@ -3120,6 +3120,30 @@ convert_mult_to_fma_1 (tree mul_result, tree op1, tree 
op2)
       if (is_gimple_debug (use_stmt))
        continue;
 
+      /* If the use is a type convert, look further into it if the operations
+        are the same under two's complement.  */
+      tree lhs_type;
+      if (gimple_assign_cast_p (use_stmt)
+         && (lhs_type = TREE_TYPE (gimple_get_lhs (use_stmt)))
+         && TREE_CODE (lhs_type) == TREE_CODE (TREE_TYPE (op1))
+         && (TYPE_UNSIGNED (lhs_type)
+             || (ANY_INTEGRAL_TYPE_P (lhs_type)
+                 && !TYPE_OVERFLOW_WRAPS (lhs_type)))
+         && (element_precision (lhs_type)
+               == element_precision (gimple_assign_rhs1 (use_stmt))))
+       {
+         tree cast_lhs = gimple_get_lhs (use_stmt);
+         gimple *tmp_use;
+         use_operand_p tmp_use_p;
+         if (single_imm_use (cast_lhs, &tmp_use_p, &tmp_use))
+           {
+             use_stmt = tmp_use;
+             result = cast_lhs;
+             gsi_remove (&gsi, true);
+             gsi = gsi_for_stmt (use_stmt);
+           }
+       }
+
       if (is_gimple_assign (use_stmt)
          && gimple_assign_rhs_code (use_stmt) == NEGATE_EXPR)
        {
@@ -3156,6 +3180,11 @@ convert_mult_to_fma_1 (tree mul_result, tree op1, tree 
op2)
       if (negate_p)
        mulop1 = gimple_build (&seq, NEGATE_EXPR, type, mulop1);
 
+      /* Ensure all the operands are of the same type.  Use the type of the
+        addend as that's the statement being replaced.  */
+      op2 = gimple_convert (&seq, TREE_TYPE (addop), op2);
+      mulop1 = gimple_convert (&seq, TREE_TYPE (addop), mulop1);
+
       if (seq)
        gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
 
@@ -3419,6 +3448,25 @@ convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2,
       if (is_gimple_debug (use_stmt))
        continue;
 
+      /* If the use is a type convert, look further into it if the operations
+        are the same under two's complement.  */
+      tree lhs_type;
+      if (gimple_assign_cast_p (use_stmt)
+         && (lhs_type = TREE_TYPE (gimple_get_lhs (use_stmt)))
+         && TREE_CODE (lhs_type) == TREE_CODE (TREE_TYPE (op1))
+         && (TYPE_UNSIGNED (lhs_type)
+             || (ANY_INTEGRAL_TYPE_P (lhs_type)
+                 && !TYPE_OVERFLOW_WRAPS (lhs_type)))
+         && (element_precision (lhs_type)
+               == element_precision (gimple_assign_rhs1 (use_stmt))))
+       {
+         tree cast_lhs = gimple_get_lhs (use_stmt);
+         gimple *tmp_use;
+         use_operand_p tmp_use_p;
+         if (single_imm_use (cast_lhs, &tmp_use_p, &tmp_use))
+           use_stmt = tmp_use;
+       }
+
       /* For now restrict this operations to single basic blocks.  In theory
         we would want to support sinking the multiplication in
         m = a*b;


-- 
diff --git a/gcc/testsuite/gcc.target/aarch64/pr122749_1.c b/gcc/testsuite/gcc.target/aarch64/pr122749_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..25311fce4e3a79b389cbb750231c1277ccaf0611
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr122749_1.c
@@ -0,0 +1,48 @@
+/* { dg-do run { target arm_v8_neon_hw } } */
+/* { dg-additional-options "-Ofast -std=gnu99 --param vect-epilogues-nomask=0 -fdump-tree-vect-details -fdump-tree-widening_mul" } */
+
+#include <limits.h>
+#include <stdint.h>
+
+typedef int8_t elem_t;
+
+__attribute__ ((noipa))
+elem_t
+foo2 (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+static elem_t
+reference (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+#pragma GCC novector
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+int
+main (void)
+{
+  elem_t buf[] = { 1, -2, INT8_MAX, INT8_MIN, 5, -7, 3, -4 };
+  int len = sizeof (buf) / sizeof (buf[0]);
+  elem_t want = reference (buf, len);
+  elem_t got = foo2 (buf, len);
+
+  if (want != got)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "\\.FMA" 1 "widening_mul" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr122749_2.c b/gcc/testsuite/gcc.target/aarch64/pr122749_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..f4a70a611176893e9fa55d8bc1826805ed5d966d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr122749_2.c
@@ -0,0 +1,48 @@
+/* { dg-do run { target arm_v8_neon_hw } } */
+/* { dg-additional-options "-Ofast -std=gnu99 --param vect-epilogues-nomask=0 -fdump-tree-vect-details -fdump-tree-widening_mul" } */
+
+#include <limits.h>
+#include <stdint.h>
+
+typedef int16_t elem_t;
+
+__attribute__ ((noipa))
+elem_t
+foo2 (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+static elem_t
+reference (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+#pragma GCC novector
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+int
+main (void)
+{
+  elem_t buf[] = { 1, -2, INT16_MAX, INT16_MIN, 5, -7, 3, -4 };
+  int len = sizeof (buf) / sizeof (buf[0]);
+  elem_t want = reference (buf, len);
+  elem_t got = foo2 (buf, len);
+
+  if (want != got)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "\\.FMA" 1 "widening_mul" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr122749_3.c b/gcc/testsuite/gcc.target/aarch64/pr122749_3.c
new file mode 100644
index 0000000000000000000000000000000000000000..61bcd30be2b47f482e8b3f0a024b2a1d51c4fda7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr122749_3.c
@@ -0,0 +1,48 @@
+/* { dg-do run { target arm_v8_neon_hw } } */
+/* { dg-additional-options "-Ofast -std=gnu99 --param vect-epilogues-nomask=0 -fdump-tree-vect-details -fdump-tree-widening_mul" } */
+
+#include <limits.h>
+#include <stdint.h>
+
+typedef int32_t elem_t;
+
+__attribute__ ((noipa))
+elem_t
+foo2 (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+static elem_t
+reference (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+#pragma GCC novector
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+int
+main (void)
+{
+  elem_t buf[] = { 1, -2, INT32_MAX, INT32_MIN, 5, -7, 3, -4 };
+  int len = sizeof (buf) / sizeof (buf[0]);
+  elem_t want = reference (buf, len);
+  elem_t got = foo2 (buf, len);
+
+  if (want != got)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "\\.FMA" 1 "widening_mul" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr122749_4.c b/gcc/testsuite/gcc.target/aarch64/pr122749_4.c
new file mode 100644
index 0000000000000000000000000000000000000000..6089716b0ca7498f9b8089f1b72d2968b1c2ee76
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr122749_4.c
@@ -0,0 +1,45 @@
+/* { dg-do run { target arm_v8_neon_hw } } */
+/* { dg-additional-options "-Ofast -std=gnu99 --param vect-epilogues-nomask=0 -fdump-tree-vect-details -fdump-tree-widening_mul" } */
+
+typedef float elem_t;
+
+__attribute__ ((noipa))
+elem_t
+foo2 (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+static elem_t
+reference (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+#pragma GCC novector
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+int
+main (void)
+{
+  elem_t buf[] = { 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f };
+  int len = sizeof (buf) / sizeof (buf[0]);
+  elem_t want = reference (buf, len);
+  elem_t got = foo2 (buf, len);
+
+  if (want != got)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "\\.FMA" 4 "widening_mul" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr122749_5.c b/gcc/testsuite/gcc.target/aarch64/pr122749_5.c
new file mode 100644
index 0000000000000000000000000000000000000000..562dc5be861762272ea8d23b8304e1abb439e20f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr122749_5.c
@@ -0,0 +1,45 @@
+/* { dg-do run { target arm_v8_neon_hw } } */
+/* { dg-additional-options "-Ofast -std=gnu99 --param vect-epilogues-nomask=0 -fdump-tree-vect-details -fdump-tree-widening_mul" } */
+
+typedef double elem_t;
+
+__attribute__ ((noipa))
+elem_t
+foo2 (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+static elem_t
+reference (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+#pragma GCC novector
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+int
+main (void)
+{
+  elem_t buf[] = { 1.0, 2.0, 1.0, 2.0, 1.0, 2.0 };
+  int len = sizeof (buf) / sizeof (buf[0]);
+  elem_t want = reference (buf, len);
+  elem_t got = foo2 (buf, len);
+
+  if (want != got)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "\\.FMA" 2 "widening_mul" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr122749_6.c b/gcc/testsuite/gcc.target/aarch64/pr122749_6.c
new file mode 100644
index 0000000000000000000000000000000000000000..3e51c5e22a18a9a3acd2416c3ba72496c9621adf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr122749_6.c
@@ -0,0 +1,45 @@
+/* { dg-do run { target arm_v8_neon_hw } } */
+/* { dg-additional-options "-Ofast -std=gnu99 --param vect-epilogues-nomask=0 -fwrapv -fdump-tree-vect-details -fdump-tree-widening_mul" } */
+
+typedef float elem_t;
+
+__attribute__ ((noipa))
+elem_t
+foo2 (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+static elem_t
+reference (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+#pragma GCC novector
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+int
+main (void)
+{
+  elem_t buf[] = { 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f };
+  int len = sizeof (buf) / sizeof (buf[0]);
+  elem_t want = reference (buf, len);
+  elem_t got = foo2 (buf, len);
+
+  if (want != got)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "\\.FMA" 4 "widening_mul" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr122749_8.c b/gcc/testsuite/gcc.target/aarch64/pr122749_8.c
new file mode 100644
index 0000000000000000000000000000000000000000..6aa729c13d1616273d579077253d3fcdf55cc555
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr122749_8.c
@@ -0,0 +1,48 @@
+/* { dg-do run { target arm_v8_neon_hw } } */
+/* { dg-additional-options "-Ofast -std=gnu99 --param vect-epilogues-nomask=0 -fdump-tree-vect-details -fdump-tree-widening_mul" } */
+
+#include <limits.h>
+#include <stdint.h>
+
+typedef uint8_t elem_t;
+
+__attribute__ ((noipa))
+elem_t
+foo2 (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+static elem_t
+reference (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+#pragma GCC novector
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+int
+main (void)
+{
+  elem_t buf[] = { 1, 2, UINT8_MAX, 7, 0, UINT8_MAX, 5, 9 };
+  int len = sizeof (buf) / sizeof (buf[0]);
+  elem_t want = reference (buf, len);
+  elem_t got = foo2 (buf, len);
+
+  if (want != got)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "\\.FMA" 1 "widening_mul" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr122749_9.c b/gcc/testsuite/gcc.target/aarch64/pr122749_9.c
new file mode 100644
index 0000000000000000000000000000000000000000..d987a9936afb2cb4ba19e62736fa4ed171669e25
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr122749_9.c
@@ -0,0 +1,48 @@
+/* { dg-do run { target arm_v8_neon_hw } } */
+/* { dg-additional-options "-Ofast -std=gnu99 --param vect-epilogues-nomask=0 -fdump-tree-vect-details -fdump-tree-widening_mul" } */
+
+#include <limits.h>
+#include <stdint.h>
+
+typedef uint16_t elem_t;
+
+__attribute__ ((noipa))
+elem_t
+foo2 (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+static elem_t
+reference (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+#pragma GCC novector
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+int
+main (void)
+{
+  elem_t buf[] = { 1, 2, UINT16_MAX, 7, 0, UINT16_MAX, 5, 9 };
+  int len = sizeof (buf) / sizeof (buf[0]);
+  elem_t want = reference (buf, len);
+  elem_t got = foo2 (buf, len);
+
+  if (want != got)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "\\.FMA" 1 "widening_mul" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_1.c b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..32a36461fbc7bb78048ae68c8dc0bdd81b11a2cd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_1.c
@@ -0,0 +1,48 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-additional-options "-Ofast -std=gnu99 -fdump-tree-vect-details -fdump-tree-widening_mul" } */
+
+#include <limits.h>
+#include <stdint.h>
+
+typedef int8_t elem_t;
+
+__attribute__ ((noipa))
+elem_t
+foo2 (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+static elem_t
+reference (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+#pragma GCC novector
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+int
+main (void)
+{
+  elem_t buf[] = { 1, -2, INT8_MAX, INT8_MIN, 5, -7, 3, -4 };
+  int len = sizeof (buf) / sizeof (buf[0]);
+  elem_t want = reference (buf, len);
+  elem_t got = foo2 (buf, len);
+
+  if (want != got)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_11.c b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_11.c
new file mode 100644
index 0000000000000000000000000000000000000000..bd160dd0ebf515a3ff3ddd1969303aabf8c03aea
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_11.c
@@ -0,0 +1,48 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-additional-options "-Ofast -std=gnu99 -fdump-tree-vect-details -fdump-tree-widening_mul" } */
+
+#include <limits.h>
+#include <stdint.h>
+
+typedef uint8_t elem_t;
+
+__attribute__ ((noipa))
+elem_t
+foo2 (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+static elem_t
+reference (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+#pragma GCC novector
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+int
+main (void)
+{
+  elem_t buf[] = { 1, 2, UINT8_MAX, 7, 0, UINT8_MAX, 5, 9 };
+  int len = sizeof (buf) / sizeof (buf[0]);
+  elem_t want = reference (buf, len);
+  elem_t got = foo2 (buf, len);
+
+  if (want != got)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_12.c b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_12.c
new file mode 100644
index 0000000000000000000000000000000000000000..8f0198ce42600b0fe92bf483123ad1cb71ff9f24
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_12.c
@@ -0,0 +1,48 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-additional-options "-Ofast -std=gnu99 -fdump-tree-vect-details -fdump-tree-widening_mul" } */
+
+#include <limits.h>
+#include <stdint.h>
+
+typedef uint16_t elem_t;
+
+__attribute__ ((noipa))
+elem_t
+foo2 (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+static elem_t
+reference (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+#pragma GCC novector
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+int
+main (void)
+{
+  elem_t buf[] = { 1, 2, UINT16_MAX, 7, 0, UINT16_MAX, 5, 9 };
+  int len = sizeof (buf) / sizeof (buf[0]);
+  elem_t want = reference (buf, len);
+  elem_t got = foo2 (buf, len);
+
+  if (want != got)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_13.c b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_13.c
new file mode 100644
index 0000000000000000000000000000000000000000..218afde13984fc64755d3c4567a05a33b5485411
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_13.c
@@ -0,0 +1,48 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-additional-options "-Ofast -std=gnu99 -fdump-tree-vect-details -fdump-tree-widening_mul" } */
+
+#include <limits.h>
+#include <stdint.h>
+
+typedef uint32_t elem_t;
+
+__attribute__ ((noipa))
+elem_t
+foo2 (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+static elem_t
+reference (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+#pragma GCC novector
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+int
+main (void)
+{
+  elem_t buf[] = { 1, 2, UINT32_MAX, 7, 0, UINT32_MAX, 5, 9 };
+  int len = sizeof (buf) / sizeof (buf[0]);
+  elem_t want = reference (buf, len);
+  elem_t got = foo2 (buf, len);
+
+  if (want != got)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_14.c b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_14.c
new file mode 100644
index 0000000000000000000000000000000000000000..1587628757e28f66dfd515e191ef04331c549434
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_14.c
@@ -0,0 +1,48 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-additional-options "-Ofast -std=gnu99 -fdump-tree-vect-details -fdump-tree-widening_mul" } */
+
+#include <limits.h>
+#include <stdint.h>
+
+typedef uint64_t elem_t;
+
+__attribute__ ((noipa))
+elem_t
+foo2 (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+static elem_t
+reference (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+#pragma GCC novector
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+int
+main (void)
+{
+  elem_t buf[] = { 1, 2, UINT64_MAX, 7, 0, UINT64_MAX, 5, 9 };
+  int len = sizeof (buf) / sizeof (buf[0]);
+  elem_t want = reference (buf, len);
+  elem_t got = foo2 (buf, len);
+
+  if (want != got)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_2.c b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..0f5918a9023521b06ac20ef922b025dc6a1e8f01
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_2.c
@@ -0,0 +1,48 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-additional-options "-Ofast -std=gnu99 -fdump-tree-vect-details -fdump-tree-widening_mul" } */
+
+#include <limits.h>
+#include <stdint.h>
+
+typedef int16_t elem_t;
+
+__attribute__ ((noipa))
+elem_t
+foo2 (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+static elem_t
+reference (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+#pragma GCC novector
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+int
+main (void)
+{
+  elem_t buf[] = { 1, -2, INT16_MAX, INT16_MIN, 5, -7, 3, -4 };
+  int len = sizeof (buf) / sizeof (buf[0]);
+  elem_t want = reference (buf, len);
+  elem_t got = foo2 (buf, len);
+
+  if (want != got)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_3.c b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_3.c
new file mode 100644
index 0000000000000000000000000000000000000000..92548cb6ec4fdc4a3d133669fb914c5ab9a103ba
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_3.c
@@ -0,0 +1,48 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-additional-options "-Ofast -std=gnu99 -fdump-tree-vect-details -fdump-tree-widening_mul" } */
+
+#include <limits.h>
+#include <stdint.h>
+
+typedef int32_t elem_t;
+
+__attribute__ ((noipa))
+elem_t
+foo2 (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+static elem_t
+reference (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+#pragma GCC novector
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+int
+main (void)
+{
+  elem_t buf[] = { 1, -2, INT32_MAX, INT32_MIN, 5, -7, 3, -4 };
+  int len = sizeof (buf) / sizeof (buf[0]);
+  elem_t want = reference (buf, len);
+  elem_t got = foo2 (buf, len);
+
+  if (want != got)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_4.c b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_4.c
new file mode 100644
index 0000000000000000000000000000000000000000..6085a18bab7f2ae0e5855a982e186f831705bf52
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_4.c
@@ -0,0 +1,48 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-additional-options "-Ofast -std=gnu99 -fdump-tree-vect-details -fdump-tree-widening_mul" } */
+
+#include <limits.h>
+#include <stdint.h>
+
+typedef int64_t elem_t;
+
+__attribute__ ((noipa))
+elem_t
+foo2 (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+static elem_t
+reference (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+#pragma GCC novector
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+int
+main (void)
+{
+  elem_t buf[] = { 1, -2, INT64_MAX, INT64_MIN, 5, -7, 3, -4 };
+  int len = sizeof (buf) / sizeof (buf[0]);
+  elem_t want = reference (buf, len);
+  elem_t got = foo2 (buf, len);
+
+  if (want != got)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_5.c b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_5.c
new file mode 100644
index 0000000000000000000000000000000000000000..d61b91bb06dc0a035bd6adfabccc580eac7f78a6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_5.c
@@ -0,0 +1,45 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-additional-options "-Ofast -std=gnu99 -fdump-tree-vect-details -fdump-tree-widening_mul" } */
+
+typedef float elem_t;
+
+__attribute__ ((noipa))
+elem_t
+foo2 (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+static elem_t
+reference (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+#pragma GCC novector
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+int
+main (void)
+{
+  elem_t buf[] = { 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f };
+  int len = sizeof (buf) / sizeof (buf[0]);
+  elem_t want = reference (buf, len);
+  elem_t got = foo2 (buf, len);
+
+  if (want != got)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_6.c b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_6.c
new file mode 100644
index 0000000000000000000000000000000000000000..7598f7a28bcf1745ce672c0bab22fec0fda37a3f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_6.c
@@ -0,0 +1,45 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-additional-options "-Ofast -std=gnu99 -fdump-tree-vect-details -fdump-tree-widening_mul" } */
+
+typedef double elem_t;
+
+__attribute__ ((noipa))
+elem_t
+foo2 (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+static elem_t
+reference (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+#pragma GCC novector
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+int
+main (void)
+{
+  elem_t buf[] = { 1.0, 2.0, 1.0, 2.0, 1.0, 2.0 };
+  int len = sizeof (buf) / sizeof (buf[0]);
+  elem_t want = reference (buf, len);
+  elem_t got = foo2 (buf, len);
+
+  if (want != got)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_8.c b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_8.c
new file mode 100644
index 0000000000000000000000000000000000000000..e1c337d44ead96d868d71f0ae54960f2189e499e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_8.c
@@ -0,0 +1,45 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-additional-options "-Ofast -std=gnu99 -fwrapv -fdump-tree-vect-details -fdump-tree-widening_mul" } */
+
+typedef float elem_t;
+
+__attribute__ ((noipa))
+elem_t
+foo2 (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+static elem_t
+reference (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+#pragma GCC novector
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+int
+main (void)
+{
+  elem_t buf[] = { 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f };
+  int len = sizeof (buf) / sizeof (buf[0]);
+  elem_t want = reference (buf, len);
+  elem_t got = foo2 (buf, len);
+
+  if (want != got)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr122749_9.c b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_9.c
new file mode 100644
index 0000000000000000000000000000000000000000..13d962e2130f986910f1a94489e4014761e917b5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr122749_9.c
@@ -0,0 +1,45 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-additional-options "-Ofast -std=gnu99 -fwrapv -fdump-tree-vect-details -fdump-tree-widening_mul" } */
+
+typedef double elem_t;
+
+__attribute__ ((noipa))
+elem_t
+foo2 (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+static elem_t
+reference (elem_t *buf, int len)
+{
+  elem_t x = 0;
+
+#pragma GCC novector
+  for (int i = 0; i < len; i++)
+    x += (elem_t) i * buf[i];
+
+  return x;
+}
+
+int
+main (void)
+{
+  elem_t buf[] = { 1.0, 2.0, 1.0, 2.0, 1.0, 2.0 };
+  int len = sizeof (buf) / sizeof (buf[0]);
+  elem_t want = reference (buf, len);
+  elem_t got = foo2 (buf, len);
+
+  if (want != got)
+    __builtin_abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "\\.COND_FMA" 1 "widening_mul" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loop" 1 "vect" } } */
diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc
index 02e194ae06f34957194c4e4f2eb4fdb3ef72d2f5..aa12221a2b2b584fa10fe378e16115128408ee3e 100644
--- a/gcc/tree-ssa-math-opts.cc
+++ b/gcc/tree-ssa-math-opts.cc
@@ -3120,6 +3120,30 @@ convert_mult_to_fma_1 (tree mul_result, tree op1, tree op2)
       if (is_gimple_debug (use_stmt))
 	continue;
 
+      /* If the use is a type convert, look further into it if the operations
+	 are the same under two's complement.  */
+      tree lhs_type;
+      if (gimple_assign_cast_p (use_stmt)
+	  && (lhs_type = TREE_TYPE (gimple_get_lhs (use_stmt)))
+	  && TREE_CODE (lhs_type) == TREE_CODE (TREE_TYPE (op1))
+	  && (TYPE_UNSIGNED (lhs_type)
+	      || (ANY_INTEGRAL_TYPE_P (lhs_type)
+		  && !TYPE_OVERFLOW_WRAPS (lhs_type)))
+	  && (element_precision (lhs_type)
+		== element_precision (gimple_assign_rhs1 (use_stmt))))
+	{
+	  tree cast_lhs = gimple_get_lhs (use_stmt);
+	  gimple *tmp_use;
+	  use_operand_p tmp_use_p;
+	  if (single_imm_use (cast_lhs, &tmp_use_p, &tmp_use))
+	    {
+	      use_stmt = tmp_use;
+	      result = cast_lhs;
+	      gsi_remove (&gsi, true);
+	      gsi = gsi_for_stmt (use_stmt);
+	    }
+	}
+
       if (is_gimple_assign (use_stmt)
 	  && gimple_assign_rhs_code (use_stmt) == NEGATE_EXPR)
 	{
@@ -3156,6 +3180,11 @@ convert_mult_to_fma_1 (tree mul_result, tree op1, tree op2)
       if (negate_p)
 	mulop1 = gimple_build (&seq, NEGATE_EXPR, type, mulop1);
 
+      /* Ensure all the operands are of the same type.  Use the type of the
+	 addend as that's the statement being replaced.  */
+      op2 = gimple_convert (&seq, TREE_TYPE (addop), op2);
+      mulop1 = gimple_convert (&seq, TREE_TYPE (addop), mulop1);
+
       if (seq)
 	gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
 
@@ -3419,6 +3448,25 @@ convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2,
       if (is_gimple_debug (use_stmt))
 	continue;
 
+      /* If the use is a type convert, look further into it if the operations
+	 are the same under two's complement.  */
+      tree lhs_type;
+      if (gimple_assign_cast_p (use_stmt)
+	  && (lhs_type = TREE_TYPE (gimple_get_lhs (use_stmt)))
+	  && TREE_CODE (lhs_type) == TREE_CODE (TREE_TYPE (op1))
+	  && (TYPE_UNSIGNED (lhs_type)
+	      || (ANY_INTEGRAL_TYPE_P (lhs_type)
+		  && !TYPE_OVERFLOW_WRAPS (lhs_type)))
+	  && (element_precision (lhs_type)
+		== element_precision (gimple_assign_rhs1 (use_stmt))))
+	{
+	  tree cast_lhs = gimple_get_lhs (use_stmt);
+	  gimple *tmp_use;
+	  use_operand_p tmp_use_p;
+	  if (single_imm_use (cast_lhs, &tmp_use_p, &tmp_use))
+	    use_stmt = tmp_use;
+	}
+
       /* For now restrict this operations to single basic blocks.  In theory
 	 we would want to support sinking the multiplication in
 	 m = a*b;

Reply via email to