Re: Use canonicalize_math_after_vectorization_p for FMA folds

2018-05-24 Thread Richard Biener
On Thu, May 24, 2018 at 10:07 AM Richard Sandiford <
richard.sandif...@linaro.org> wrote:

> The folds in r260348 kicked in before vectorisation, which hurts
> for two reasons:

> (1) the current suboptimal handling of nothrow meant that we could
>  drop the flag early and so prevent if-conversion

> (2) some architectures provide more scalar forms than vector forms
>  (true for Advanced SIMD)

> (1) is a bug in itself that needs to be fixed eventually, but delaying
> the folds is still needed for (2).

> Tested on aarch64-linux-gnu (with and without SVE), aarch64_be-elf
> and x86_64-linux-gnu.  OK to install?

OK.

Richard.

> (Patch is mostly just reindent.)

> Richard


> 2018-05-24  Richard Sandiford  

> gcc/
>  * match.pd: Delay FMA folds until after vectorization.

> gcc/testsuite/
>  * gcc.dg/vect/vect-fma-1.c: New test.

> Index: gcc/match.pd
> ===================================================================
> --- gcc/match.pd	2018-05-18 09:26:37.735714314 +0100
> +++ gcc/match.pd	2018-05-24 09:05:10.432158893 +0100
> @@ -4703,59 +4703,60 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>wi::to_wide (@ipos) + isize))
>   (BIT_FIELD_REF @0 @rsize @rpos)

> -(for fmas (FMA)
> +(if (canonicalize_math_after_vectorization_p ())
> + (for fmas (FMA)
> +  (simplify
> +   (fmas:c (negate @0) @1 @2)
> +   (IFN_FNMA @0 @1 @2))
> +  (simplify
> +   (fmas @0 @1 (negate @2))
> +   (IFN_FMS @0 @1 @2))
> +  (simplify
> +   (fmas:c (negate @0) @1 (negate @2))
> +   (IFN_FNMS @0 @1 @2))
> +  (simplify
> +   (negate (fmas@3 @0 @1 @2))
> +   (if (single_use (@3))
> +    (IFN_FNMS @0 @1 @2))))
> +
> + (simplify
> +  (IFN_FMS:c (negate @0) @1 @2)
> +  (IFN_FNMS @0 @1 @2))
>(simplify
> -  (fmas:c (negate @0) @1 @2)
> +  (IFN_FMS @0 @1 (negate @2))
> +  (IFN_FMA @0 @1 @2))
> + (simplify
> +  (IFN_FMS:c (negate @0) @1 (negate @2))
> (IFN_FNMA @0 @1 @2))
>(simplify
> -  (fmas @0 @1 (negate @2))
> -  (IFN_FMS @0 @1 @2))
> +  (negate (IFN_FMS@3 @0 @1 @2))
> +   (if (single_use (@3))
> +    (IFN_FNMA @0 @1 @2)))
> +
> + (simplify
> +  (IFN_FNMA:c (negate @0) @1 @2)
> +  (IFN_FMA @0 @1 @2))
>(simplify
> -  (fmas:c (negate @0) @1 (negate @2))
> +  (IFN_FNMA @0 @1 (negate @2))
> (IFN_FNMS @0 @1 @2))
>(simplify
> -  (negate (fmas@3 @0 @1 @2))
> +  (IFN_FNMA:c (negate @0) @1 (negate @2))
> +  (IFN_FMS @0 @1 @2))
> + (simplify
> +  (negate (IFN_FNMA@3 @0 @1 @2))
> (if (single_use (@3))
> -   (IFN_FNMS @0 @1 @2))))
> +   (IFN_FMS @0 @1 @2)))

> -(simplify
> - (IFN_FMS:c (negate @0) @1 @2)
> - (IFN_FNMS @0 @1 @2))
> -(simplify
> - (IFN_FMS @0 @1 (negate @2))
> - (IFN_FMA @0 @1 @2))
> -(simplify
> - (IFN_FMS:c (negate @0) @1 (negate @2))
> - (IFN_FNMA @0 @1 @2))
> -(simplify
> - (negate (IFN_FMS@3 @0 @1 @2))
> + (simplify
> +  (IFN_FNMS:c (negate @0) @1 @2)
> +  (IFN_FMS @0 @1 @2))
> + (simplify
> +  (IFN_FNMS @0 @1 (negate @2))
> +  (IFN_FNMA @0 @1 @2))
> + (simplify
> +  (IFN_FNMS:c (negate @0) @1 (negate @2))
> +  (IFN_FMA @0 @1 @2))
> + (simplify
> +  (negate (IFN_FNMS@3 @0 @1 @2))
> (if (single_use (@3))
> -   (IFN_FNMA @0 @1 @2)))
> -
> -(simplify
> - (IFN_FNMA:c (negate @0) @1 @2)
> - (IFN_FMA @0 @1 @2))
> -(simplify
> - (IFN_FNMA @0 @1 (negate @2))
> - (IFN_FNMS @0 @1 @2))
> -(simplify
> - (IFN_FNMA:c (negate @0) @1 (negate @2))
> - (IFN_FMS @0 @1 @2))
> -(simplify
> - (negate (IFN_FNMA@3 @0 @1 @2))
> - (if (single_use (@3))
> -  (IFN_FMS @0 @1 @2)))
> -
> -(simplify
> - (IFN_FNMS:c (negate @0) @1 @2)
> - (IFN_FMS @0 @1 @2))
> -(simplify
> - (IFN_FNMS @0 @1 (negate @2))
> - (IFN_FNMA @0 @1 @2))
> -(simplify
> - (IFN_FNMS:c (negate @0) @1 (negate @2))
> - (IFN_FMA @0 @1 @2))
> -(simplify
> - (negate (IFN_FNMS@3 @0 @1 @2))
> - (if (single_use (@3))
> -  (IFN_FMA @0 @1 @2)))
> +   (IFN_FMA @0 @1 @2))))
> Index: gcc/testsuite/gcc.dg/vect/vect-fma-1.c
> ===================================================================
> --- /dev/null   2018-04-20 16:19:46.369131350 +0100
> +++ gcc/testsuite/gcc.dg/vect/vect-fma-1.c  2018-05-24 09:05:10.432158893 +0100
> @@ -0,0 +1,58 @@
> +/* { dg-require-effective-target scalar_all_fma } */
> +
> +#include "tree-vect.h"
> +
> +#define N (VECTOR_BITS * 11 / 64 + 3)
> +
> +#define DEF(INV)   \
> +  void __attribute__ ((noipa)) \
> +  f_##INV (double *restrict a, double *restrict b, \
> +  double *restrict c, double *restrict d)  \
> +  {\
> +for (int i = 0; i < N; ++i)\
> +  {\
> +   double mb = (INV & 1 ? -b[i] : b[i]);   \
> +   double mc = c[i];   \
> +   double md = (INV & 2 ? -d[i] : d[i]);   \
> +   double fma = __builtin_fma (mb, mc, md);\
> +   a[i] = (INV & 4 ? -fma : 

Use canonicalize_math_after_vectorization_p for FMA folds

2018-05-24 Thread Richard Sandiford
The folds in r260348 kicked in before vectorisation, which hurts
for two reasons:

(1) the current suboptimal handling of nothrow meant that we could
drop the flag early and so prevent if-conversion

(2) some architectures provide more scalar forms than vector forms
(true for Advanced SIMD)

(1) is a bug in itself that needs to be fixed eventually, but delaying
the folds is still needed for (2).

Tested on aarch64-linux-gnu (with and without SVE), aarch64_be-elf
and x86_64-linux-gnu.  OK to install?

(Patch is mostly just reindent.)

Richard


2018-05-24  Richard Sandiford  

gcc/
* match.pd: Delay FMA folds until after vectorization.

gcc/testsuite/
* gcc.dg/vect/vect-fma-1.c: New test.

Index: gcc/match.pd
===================================================================
--- gcc/match.pd	2018-05-18 09:26:37.735714314 +0100
+++ gcc/match.pd	2018-05-24 09:05:10.432158893 +0100
@@ -4703,59 +4703,60 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
  wi::to_wide (@ipos) + isize))
 (BIT_FIELD_REF @0 @rsize @rpos)
 
-(for fmas (FMA)
+(if (canonicalize_math_after_vectorization_p ())
+ (for fmas (FMA)
+  (simplify
+   (fmas:c (negate @0) @1 @2)
+   (IFN_FNMA @0 @1 @2))
+  (simplify
+   (fmas @0 @1 (negate @2))
+   (IFN_FMS @0 @1 @2))
+  (simplify
+   (fmas:c (negate @0) @1 (negate @2))
+   (IFN_FNMS @0 @1 @2))
+  (simplify
+   (negate (fmas@3 @0 @1 @2))
+   (if (single_use (@3))
+    (IFN_FNMS @0 @1 @2))))
+
+ (simplify
+  (IFN_FMS:c (negate @0) @1 @2)
+  (IFN_FNMS @0 @1 @2))
  (simplify
-  (fmas:c (negate @0) @1 @2)
+  (IFN_FMS @0 @1 (negate @2))
+  (IFN_FMA @0 @1 @2))
+ (simplify
+  (IFN_FMS:c (negate @0) @1 (negate @2))
   (IFN_FNMA @0 @1 @2))
  (simplify
-  (fmas @0 @1 (negate @2))
-  (IFN_FMS @0 @1 @2))
+  (negate (IFN_FMS@3 @0 @1 @2))
+   (if (single_use (@3))
+    (IFN_FNMA @0 @1 @2)))
+
+ (simplify
+  (IFN_FNMA:c (negate @0) @1 @2)
+  (IFN_FMA @0 @1 @2))
  (simplify
-  (fmas:c (negate @0) @1 (negate @2))
+  (IFN_FNMA @0 @1 (negate @2))
   (IFN_FNMS @0 @1 @2))
  (simplify
-  (negate (fmas@3 @0 @1 @2))
+  (IFN_FNMA:c (negate @0) @1 (negate @2))
+  (IFN_FMS @0 @1 @2))
+ (simplify
+  (negate (IFN_FNMA@3 @0 @1 @2))
   (if (single_use (@3))
-   (IFN_FNMS @0 @1 @2))))
+   (IFN_FMS @0 @1 @2)))
 
-(simplify
- (IFN_FMS:c (negate @0) @1 @2)
- (IFN_FNMS @0 @1 @2))
-(simplify
- (IFN_FMS @0 @1 (negate @2))
- (IFN_FMA @0 @1 @2))
-(simplify
- (IFN_FMS:c (negate @0) @1 (negate @2))
- (IFN_FNMA @0 @1 @2))
-(simplify
- (negate (IFN_FMS@3 @0 @1 @2))
+ (simplify
+  (IFN_FNMS:c (negate @0) @1 @2)
+  (IFN_FMS @0 @1 @2))
+ (simplify
+  (IFN_FNMS @0 @1 (negate @2))
+  (IFN_FNMA @0 @1 @2))
+ (simplify
+  (IFN_FNMS:c (negate @0) @1 (negate @2))
+  (IFN_FMA @0 @1 @2))
+ (simplify
+  (negate (IFN_FNMS@3 @0 @1 @2))
   (if (single_use (@3))
-   (IFN_FNMA @0 @1 @2)))
-
-(simplify
- (IFN_FNMA:c (negate @0) @1 @2)
- (IFN_FMA @0 @1 @2))
-(simplify
- (IFN_FNMA @0 @1 (negate @2))
- (IFN_FNMS @0 @1 @2))
-(simplify
- (IFN_FNMA:c (negate @0) @1 (negate @2))
- (IFN_FMS @0 @1 @2))
-(simplify
- (negate (IFN_FNMA@3 @0 @1 @2))
- (if (single_use (@3))
-  (IFN_FMS @0 @1 @2)))
-
-(simplify
- (IFN_FNMS:c (negate @0) @1 @2)
- (IFN_FMS @0 @1 @2))
-(simplify
- (IFN_FNMS @0 @1 (negate @2))
- (IFN_FNMA @0 @1 @2))
-(simplify
- (IFN_FNMS:c (negate @0) @1 (negate @2))
- (IFN_FMA @0 @1 @2))
-(simplify
- (negate (IFN_FNMS@3 @0 @1 @2))
- (if (single_use (@3))
-  (IFN_FMA @0 @1 @2)))
+   (IFN_FMA @0 @1 @2))))
Index: gcc/testsuite/gcc.dg/vect/vect-fma-1.c
===================================================================
--- /dev/null   2018-04-20 16:19:46.369131350 +0100
+++ gcc/testsuite/gcc.dg/vect/vect-fma-1.c  2018-05-24 09:05:10.432158893 +0100
@@ -0,0 +1,58 @@
+/* { dg-require-effective-target scalar_all_fma } */
+
+#include "tree-vect.h"
+
+#define N (VECTOR_BITS * 11 / 64 + 3)
+
+#define DEF(INV)   \
+  void __attribute__ ((noipa)) \
+  f_##INV (double *restrict a, double *restrict b, \
+  double *restrict c, double *restrict d)  \
+  {\
+for (int i = 0; i < N; ++i)\
+  {\
+   double mb = (INV & 1 ? -b[i] : b[i]);   \
+   double mc = c[i];   \
+   double md = (INV & 2 ? -d[i] : d[i]);   \
+   double fma = __builtin_fma (mb, mc, md);\
+   a[i] = (INV & 4 ? -fma : fma);  \
+  }\
+  }
+
+#define TEST(INV)  \
+  {\
+f_##INV (a, b, c, d);  \
+for (int i = 0; i < N; ++i)\
+  {\
+   double