The following adds the ability to vectorize an FMA reduction pair as an SLP reduction (we cannot yet handle ternary association in reduction vectorization).
Bootstrapped and tested on x86_64-unknown-linux-gnu. PR tree-optimization/109892 * tree-vect-loop.cc (reduction_fn_for_scalar_code): Handle fma. * gcc.dg/vect/vect-reduc-fma-1.c: New testcase. * gcc.dg/vect/vect-reduc-fma-2.c: Likewise. --- gcc/testsuite/gcc.dg/vect/vect-reduc-fma-1.c | 15 +++++++++++++++ gcc/testsuite/gcc.dg/vect/vect-reduc-fma-2.c | 20 ++++++++++++++++++++ gcc/tree-vect-loop.cc | 4 ++++ 3 files changed, 39 insertions(+) create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-fma-1.c create mode 100644 gcc/testsuite/gcc.dg/vect/vect-reduc-fma-2.c diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-1.c new file mode 100644 index 00000000000..e958b43e23b --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-1.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-mfma" { target { x86_64-*-* i?86-*-* } } } */ + +double f(double x[], long n) +{ + double r0 = 0, r1 = 0; + for (; n; x += 2, n--) { + r0 = __builtin_fma(x[0], x[0], r0); + r1 = __builtin_fma(x[1], x[1], r1); + } + return r0 + r1; +} + +/* We should vectorize this as SLP reduction. */ +/* { dg-final { scan-tree-dump "loop vectorized using 16 byte vectors and unroll factor 1" "vect" { target { x86_64-*-* i?86-*-* } } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-2.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-2.c new file mode 100644 index 00000000000..ea1ca9720e5 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-reduc-fma-2.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-ffp-contract=on" } */ +/* { dg-additional-options "-mfma" { target { x86_64-*-* i?86-*-* } } } */ + +static double muladd(double x, double y, double z) +{ + return x * y + z; +} +double g(double x[], long n) +{ + double r0 = 0, r1 = 0; + for (; n; x += 2, n--) { + r0 = muladd(x[0], x[0], r0); + r1 = muladd(x[1], x[1], r1); + } + return r0 + r1; +} + +/* We should vectorize this as SLP reduction. 
*/ +/* { dg-final { scan-tree-dump "loop vectorized using 16 byte vectors and unroll factor 1" "vect" { target { x86_64-*-* i?86-*-* } } } } */ diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc index a3f95433a5b..1e6e9cede18 100644 --- a/gcc/tree-vect-loop.cc +++ b/gcc/tree-vect-loop.cc @@ -3906,6 +3906,10 @@ reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn) *reduc_fn = IFN_REDUC_FMIN; return true; + CASE_CFN_FMA: + *reduc_fn = IFN_LAST; + return true; + default: return false; } -- 2.43.0