Re: [PATCH] aarch64: Add fcsel to cmov integer and csel to float cmov [PR98477]

2024-06-10 Thread Richard Sandiford
Andrew Pinski  writes:
> This patch adds an alternative to the integer cmov and one to floating
> point cmov so we avoid in some more moving
>
>   PR target/98477
>
> gcc/ChangeLog:
>
>   * config/aarch64/aarch64.md (*cmov_insn[GPI]): Add 'w'
>   alternative.
>   (*cmov_insn[GPF]): Add 'r' alternative.
>   * config/aarch64/iterators.md (wv): New mode attr.
>
> gcc/testsuite/ChangeLog:
>
>   * gcc.target/aarch64/csel_1.c: New test.
>   * gcc.target/aarch64/fcsel_2.c: New test.

This seems a bit dangerous while PR114766 remains unresolved.
The problem (AIUI) is that adding r and w alternatives to the
same pattern means that, when computing the cost of each register
class, r and w are equally cheap for each operand in isolation,
without the interdependencies being modelled.  (This is because
class preferences are calculated on a per-register basis.)

E.g. if:

- insn I1 is op0 = fn(op1, op2)
- I1 provides r and w alternatives
- other uses of op0 and op1 prefer r
- other uses of op2 prefer w

then I1 will not influence the costs of op0, op1 or op2, and so
I1 effectively will provide a zero-cost cross-over between r and w.

There again, we already have other patterns with this problem.  And I
think we do need to make it work somehow.  It's just a question of
whether we should pause adding more "r or w" alternatives until the
problem is fixed, or whether we should treat adding more alternatives
and fixing the RA problem as parallel work.

> Signed-off-by: Andrew Pinski 
> ---
>  gcc/config/aarch64/aarch64.md  | 13 +++
>  gcc/config/aarch64/iterators.md|  4 
>  gcc/testsuite/gcc.target/aarch64/csel_1.c  | 27 ++
>  gcc/testsuite/gcc.target/aarch64/fcsel_2.c | 20 
>  4 files changed, 59 insertions(+), 5 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/csel_1.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/fcsel_2.c
>
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 2bdd443e71d..a6cedd0f1b8 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -4404,6 +4404,7 @@ (define_insn "*cmov_insn"
>   [ r, Ui1 , rZ  ; csel] csinc\t%0, %4, zr, %M1
>   [ r, UsM , UsM ; mov_imm ] mov\t%0, -1
>   [ r, Ui1 , Ui1 ; mov_imm ] mov\t%0, 1
> + [ w, w   , w   ; fcsel   ] fcsel\t%0, %3, %4, 
> %m1
>}
>  )
>  
> @@ -4464,15 +4465,17 @@ (define_insn "*cmovdi_insn_uxtw"
>  )
>  
>  (define_insn "*cmov_insn"
> -  [(set (match_operand:GPF 0 "register_operand" "=w")
> +  [(set (match_operand:GPF 0 "register_operand" "=r,w")
>   (if_then_else:GPF
>(match_operator 1 "aarch64_comparison_operator"
> [(match_operand 2 "cc_register" "") (const_int 0)])
> -  (match_operand:GPF 3 "register_operand" "w")
> -  (match_operand:GPF 4 "register_operand" "w")))]
> +  (match_operand:GPF 3 "register_operand" "r,w")
> +  (match_operand:GPF 4 "register_operand" "r,w")))]
>"TARGET_FLOAT"
> -  "fcsel\\t%0, %3, %4, %m1"
> -  [(set_attr "type" "fcsel")]
> +  "@
> +   csel\t%0, %3, %4, %m1
> +   fcsel\\t%0, %3, %4, %m1"
> +  [(set_attr "type" "fcsel,csel")]
>  )

I think we should use the new syntax for all new insns with more than
one alternative.

Thanks,
Richard

>  
>  (define_expand "movcc"
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index 99cde46f1ba..42303f2ec02 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -1147,6 +1147,10 @@ (define_mode_attr e [(CCFP "") (CCFPE "e")])
>  ;; 32-bit version and "%x0" in the 64-bit version.
>  (define_mode_attr w [(QI "w") (HI "w") (SI "w") (DI "x") (SF "s") (DF "d")])
>  
> +;; For cmov template to be used with fscel instruction
> +(define_mode_attr wv [(QI "s") (HI "s") (SI "s") (DI "d") (SF "s") (DF "d")])
> +
> +
>  ;; The size of access, in bytes.
>  (define_mode_attr ldst_sz [(SI "4") (DI "8")])
>  ;; Likewise for load/store pair.
> diff --git a/gcc/testsuite/gcc.target/aarch64/csel_1.c 
> b/gcc/testsuite/gcc.target/aarch64/csel_1.c
> new file mode 100644
> index 000..5848e5be2ff
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/csel_1.c
> @@ -0,0 +1,27 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fno-ssa-phiopt" } */
> +/* PR target/98477 */
> +
> +/* We should be able to produce csel followed by a store
> +   and not move between the GPRs and simd registers. */
> +/* Note -fno-ssa-phiopt is needed, otherwise the tree level
> +   does the VCE after the cmov which allowed to use the csel
> +   instruction. */
> +_Static_assert (sizeof(long long) == sizeof(double));
> +void
> +foo (int a, double *b, long long c, long long d)
> +{
> +  double ct;
> +  double dt;
> +  __builtin_memcpy(, , sizeof(long long));
> +  __builtin_memcpy(, , sizeof(long long));
> +  double t = a ? ct : dt;
> +  *b = t;
> +}
> +
> +/* { 

[PATCH] aarch64: Add fcsel to cmov integer and csel to float cmov [PR98477]

2024-05-06 Thread Andrew Pinski
This patch adds an alternative to the integer cmov and one to floating
point cmov so we avoid in some more moving

PR target/98477

gcc/ChangeLog:

* config/aarch64/aarch64.md (*cmov_insn[GPI]): Add 'w'
alternative.
(*cmov_insn[GPF]): Add 'r' alternative.
* config/aarch64/iterators.md (wv): New mode attr.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/csel_1.c: New test.
* gcc.target/aarch64/fcsel_2.c: New test.

Signed-off-by: Andrew Pinski 
---
 gcc/config/aarch64/aarch64.md  | 13 +++
 gcc/config/aarch64/iterators.md|  4 
 gcc/testsuite/gcc.target/aarch64/csel_1.c  | 27 ++
 gcc/testsuite/gcc.target/aarch64/fcsel_2.c | 20 
 4 files changed, 59 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/csel_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/fcsel_2.c

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 2bdd443e71d..a6cedd0f1b8 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4404,6 +4404,7 @@ (define_insn "*cmov_insn"
  [ r, Ui1 , rZ  ; csel] csinc\t%0, %4, zr, %M1
  [ r, UsM , UsM ; mov_imm ] mov\t%0, -1
  [ r, Ui1 , Ui1 ; mov_imm ] mov\t%0, 1
+ [ w, w   , w   ; fcsel   ] fcsel\t%0, %3, %4, %m1
   }
 )
 
@@ -4464,15 +4465,17 @@ (define_insn "*cmovdi_insn_uxtw"
 )
 
 (define_insn "*cmov_insn"
-  [(set (match_operand:GPF 0 "register_operand" "=w")
+  [(set (match_operand:GPF 0 "register_operand" "=r,w")
(if_then_else:GPF
 (match_operator 1 "aarch64_comparison_operator"
  [(match_operand 2 "cc_register" "") (const_int 0)])
-(match_operand:GPF 3 "register_operand" "w")
-(match_operand:GPF 4 "register_operand" "w")))]
+(match_operand:GPF 3 "register_operand" "r,w")
+(match_operand:GPF 4 "register_operand" "r,w")))]
   "TARGET_FLOAT"
-  "fcsel\\t%0, %3, %4, %m1"
-  [(set_attr "type" "fcsel")]
+  "@
+   csel\t%0, %3, %4, %m1
+   fcsel\\t%0, %3, %4, %m1"
+  [(set_attr "type" "fcsel,csel")]
 )
 
 (define_expand "movcc"
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 99cde46f1ba..42303f2ec02 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -1147,6 +1147,10 @@ (define_mode_attr e [(CCFP "") (CCFPE "e")])
 ;; 32-bit version and "%x0" in the 64-bit version.
 (define_mode_attr w [(QI "w") (HI "w") (SI "w") (DI "x") (SF "s") (DF "d")])
 
+;; For cmov template to be used with fscel instruction
+(define_mode_attr wv [(QI "s") (HI "s") (SI "s") (DI "d") (SF "s") (DF "d")])
+
+
 ;; The size of access, in bytes.
 (define_mode_attr ldst_sz [(SI "4") (DI "8")])
 ;; Likewise for load/store pair.
diff --git a/gcc/testsuite/gcc.target/aarch64/csel_1.c 
b/gcc/testsuite/gcc.target/aarch64/csel_1.c
new file mode 100644
index 000..5848e5be2ff
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/csel_1.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fno-ssa-phiopt" } */
+/* PR target/98477 */
+
+/* We should be able to produce csel followed by a store
+   and not move between the GPRs and simd registers. */
+/* Note -fno-ssa-phiopt is needed, otherwise the tree level
+   does the VCE after the cmov which allowed to use the csel
+   instruction. */
+_Static_assert (sizeof(long long) == sizeof(double));
+void
+foo (int a, double *b, long long c, long long d)
+{
+  double ct;
+  double dt;
+  __builtin_memcpy(, , sizeof(long long));
+  __builtin_memcpy(, , sizeof(long long));
+  double t = a ? ct : dt;
+  *b = t;
+}
+
+/* { dg-final { scan-assembler-not "\tfcsel\t"  } } */
+/* { dg-final { scan-assembler-times "\tcsel\t" 1 } } */
+/* The store should still happen from the GPRs */
+/* { dg-final { scan-assembler-not "\tstr\td"  } } */
+/* { dg-final { scan-assembler-times "\tstr\tx" 1 } } */
+/* { dg-final { scan-assembler-not "\tfmov\t" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/fcsel_2.c 
b/gcc/testsuite/gcc.target/aarch64/fcsel_2.c
new file mode 100644
index 000..309e8cbe37f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fcsel_2.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* PR target/98477 */
+
+#define vector16 __attribute__((vector_size(16)))
+/* We should be able to produce fscel followed by a store
+   and not move between the GPRs and simd registers. */
+void
+foo (int a, int *b, vector16 int c, vector16 int d)
+{
+  int t = a ? c[0] : d[0];
+  *b = t;
+}
+
+/* { dg-final { scan-assembler-times "\tfcsel\t" 1 } } */
+/* { dg-final { scan-assembler-not "\tcsel\t" } } */
+/* The store should still happen from the simd register */
+/* { dg-final { scan-assembler-times "\tstr\ts" 1 } } */
+/* { dg-final { scan-assembler-not "\tstr\tw" } } */
+/* { dg-final { scan-assembler-not "\tfmov\t" } } */
-- 
2.43.0