The following adds the ability to vectorize an FMA reduction pair
as an SLP reduction (we cannot yet handle ternary association in
reduction vectorization).

Bootstrapped and tested on x86_64-unknown-linux-gnu.

        PR tree-optimization/109892
        * tree-vect-loop.cc (reduction_fn_for_scalar_code): Handle fma.

        * gcc.dg/vect/vect-reduc-fma-1.c: New testcase.
        * gcc.dg/vect/vect-reduc-fma-2.c: Likewise.
---
 gcc/testsuite/gcc.dg/vect/vect-reduc-fma-1.c | 15 +++++++++++++++
 gcc/testsuite/gcc.dg/vect/vect-reduc-fma-2.c | 20 ++++++++++++++++++++
 gcc/tree-vect-loop.cc                        |  4 ++++
 3 files changed, 39 insertions(+)
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-fma-1.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-fma-2.c

diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-1.c
new file mode 100644
index 00000000000..e958b43e23b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-1.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-mfma" { target { x86_64-*-* i?86-*-* } } } */
+
+double f(double x[], long n)
+{
+    double r0 = 0, r1 = 0;
+    for (; n; x += 2, n--) {
+        r0 = __builtin_fma(x[0], x[0], r0);
+        r1 = __builtin_fma(x[1], x[1], r1);
+    }
+    return r0 + r1;
+}
+
+/* We should vectorize this as SLP reduction.  */
+/* { dg-final { scan-tree-dump "loop vectorized using 16 byte vectors and unroll factor 1" "vect" { target { x86_64-*-* i?86-*-* } } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-2.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-2.c
new file mode 100644
index 00000000000..ea1ca9720e5
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-2.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-ffp-contract=on" } */
+/* { dg-additional-options "-mfma" { target { x86_64-*-* i?86-*-* } } } */
+
+static double muladd(double x, double y, double z)
+{
+    return x * y + z;
+}
+double g(double x[], long n)
+{
+    double r0 = 0, r1 = 0;
+    for (; n; x += 2, n--) {
+        r0 = muladd(x[0], x[0], r0);
+        r1 = muladd(x[1], x[1], r1);
+    }
+    return r0 + r1;
+}
+
+/* We should vectorize this as SLP reduction.  */
+/* { dg-final { scan-tree-dump "loop vectorized using 16 byte vectors and unroll factor 1" "vect" { target { x86_64-*-* i?86-*-* } } } } */
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index a3f95433a5b..1e6e9cede18 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -3906,6 +3906,10 @@ reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
        *reduc_fn = IFN_REDUC_FMIN;
        return true;
 
+      CASE_CFN_FMA:
+       *reduc_fn = IFN_LAST;
+       return true;
+
       default:
        return false;
       }
-- 
2.43.0

Reply via email to