Spencer Abson <spencer.ab...@arm.com> writes: > This patch extends the splitting patterns for combining FP comparisons > with predicated logical operations such that they cover all of SVE_F. > > gcc/ChangeLog: > > * config/aarch64/aarch64-sve.md (*fcm<cmp_op><mode>_and_combine): > Extend from SVE_FULL_F to SVE_F. > (*fcmuo<mode>_and_combine): Likewise. > (*fcm<cmp_op><mode>_bic_combine): Likewise. > (*fcm<cmp_op><mode>_nor_combine): Likewise. > (*fcmuo<mode>_bic_combine): Likewise. > (*fcmuo<mode>_nor_combine): Likewise. Move the comment here to > above fcmuo<mode>_bic_combine, since it applies to both patterns. > > gcc/testsuite/ChangeLog: > > * gcc.target/aarch64/sve/unpacked_fcm_combines_1.c: New test. > * gcc.target/aarch64/sve/unpacked_fcm_combines_2.c: Likewise.
OK. Thanks for catching the extra optimisations. Richard > --- > gcc/config/aarch64/aarch64-sve.md | 26 +++++++------- > .../aarch64/sve/unpacked_fcm_combines_1.c | 17 +++++++++ > .../aarch64/sve/unpacked_fcm_combines_2.c | 35 +++++++++++++++++++ > 3 files changed, 65 insertions(+), 13 deletions(-) > create mode 100644 > gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_combines_1.c > create mode 100644 > gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_combines_2.c > > diff --git a/gcc/config/aarch64/aarch64-sve.md > b/gcc/config/aarch64/aarch64-sve.md > index 6b5113eb70f..10aecf1f190 100644 > --- a/gcc/config/aarch64/aarch64-sve.md > +++ b/gcc/config/aarch64/aarch64-sve.md > @@ -8690,8 +8690,8 @@ > (unspec:<VPRED> > [(match_operand:<VPRED> 1) > (const_int SVE_KNOWN_PTRUE) > - (match_operand:SVE_FULL_F 2 "register_operand" "w, w") > - (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero" "Dz, w")] > + (match_operand:SVE_F 2 "register_operand" "w, w") > + (match_operand:SVE_F 3 "aarch64_simd_reg_or_zero" "Dz, w")] > SVE_COND_FP_CMP_I0) > (match_operand:<VPRED> 4 "register_operand" "Upl, Upl")))] > "TARGET_SVE" > @@ -8713,8 +8713,8 @@ > (unspec:<VPRED> > [(match_operand:<VPRED> 1) > (const_int SVE_KNOWN_PTRUE) > - (match_operand:SVE_FULL_F 2 "register_operand" "w") > - (match_operand:SVE_FULL_F 3 "register_operand" "w")] > + (match_operand:SVE_F 2 "register_operand" "w") > + (match_operand:SVE_F 3 "register_operand" "w")] > UNSPEC_COND_FCMUO) > (match_operand:<VPRED> 4 "register_operand" "Upl")))] > "TARGET_SVE" > @@ -8740,8 +8740,8 @@ > (unspec:<VPRED> > [(match_operand:<VPRED> 1) > (const_int SVE_KNOWN_PTRUE) > - (match_operand:SVE_FULL_F 2 "register_operand" "w") > - (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero" "wDz")] > + (match_operand:SVE_F 2 "register_operand" "w") > + (match_operand:SVE_F 3 "aarch64_simd_reg_or_zero" "wDz")] > SVE_COND_FP_CMP_I0)) > (match_operand:<VPRED> 4 "register_operand" "Upa")) > (match_dup:<VPRED> 1))) > @@ -8777,8 +8777,8 @@ > (unspec:<VPRED> > [(match_operand:<VPRED> 1) > (const_int SVE_KNOWN_PTRUE) > - (match_operand:SVE_FULL_F 2 "register_operand" "w") > - (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero" "wDz")] > + (match_operand:SVE_F 2 "register_operand" "w") > + (match_operand:SVE_F 3 "aarch64_simd_reg_or_zero" "wDz")] > SVE_COND_FP_CMP_I0)) > (not:<VPRED> > (match_operand:<VPRED> 4 "register_operand" "Upa"))) > @@ -8808,6 +8808,7 @@ > } > ) > > +;; Same for unordered comparisons. > (define_insn_and_split "*fcmuo<mode>_bic_combine" > [(set (match_operand:<VPRED> 0 "register_operand" "=Upa") > (and:<VPRED> > @@ -8816,8 +8817,8 @@ > (unspec:<VPRED> > [(match_operand:<VPRED> 1) > (const_int SVE_KNOWN_PTRUE) > - (match_operand:SVE_FULL_F 2 "register_operand" "w") > - (match_operand:SVE_FULL_F 3 "register_operand" "w")] > + (match_operand:SVE_F 2 "register_operand" "w") > + (match_operand:SVE_F 3 "register_operand" "w")] > UNSPEC_COND_FCMUO)) > (match_operand:<VPRED> 4 "register_operand" "Upa")) > (match_dup:<VPRED> 1))) > @@ -8843,7 +8844,6 @@ > } > ) > > -;; Same for unordered comparisons. > (define_insn_and_split "*fcmuo<mode>_nor_combine" > [(set (match_operand:<VPRED> 0 "register_operand" "=Upa") > (and:<VPRED> > @@ -8852,8 +8852,8 @@ > (unspec:<VPRED> > [(match_operand:<VPRED> 1) > (const_int SVE_KNOWN_PTRUE) > - (match_operand:SVE_FULL_F 2 "register_operand" "w") > - (match_operand:SVE_FULL_F 3 "register_operand" "w")] > + (match_operand:SVE_F 2 "register_operand" "w") > + (match_operand:SVE_F 3 "register_operand" "w")] > UNSPEC_COND_FCMUO)) > (not:<VPRED> > (match_operand:<VPRED> 4 "register_operand" "Upa"))) > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_combines_1.c > b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_combines_1.c > new file mode 100644 > index 00000000000..c1f729e9f0a > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_combines_1.c > @@ -0,0 +1,17 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -moverride=sve_width=2048 > --param=aarch64-autovec-preference=sve-only -fno-trapping-math" } */ > + > +#include "unpacked_fcm_1.c" > + > +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d} 32 } } */ > +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s} 32 } } */ > +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d} 32 } } */ > + > +/* Drop a PTRUE predicated AND with the loop mask and comparison result in > + favour of predicating the comparison with the loop mask. */ > +/* { dg-final { scan-assembler-not {\tand\t} } } */ > + > +/* Similarly, for codes that are implemented via an inversion, prefer > + NOT (predicated with the loop mask) over BIC+PTRUE. */ > +/* { dg-final { scan-assembler-not {\tbic\t} } } */ > +/* { dg-final { scan-assembler-times {\tnot\t} 15 } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_combines_2.c > b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_combines_2.c > new file mode 100644 > index 00000000000..e7f7680ce53 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_fcm_combines_2.c > @@ -0,0 +1,35 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -moverride=sve_width=2048 > --param=aarch64-autovec-preference=sve-only -fno-trapping-math" } */ > + > +#include <stdint.h> > + > +/* Ensure that we still emit NOR here, rather than two NOTs. */ > + > +#define TEST_FCM_NOR(TYPE0, TYPE1, CMP, COUNT) \ > + void \ > + f_##TYPE0##_##TYPE1##_##CMP (TYPE0 *__restrict out, \ > + TYPE1 *__restrict a, \ > + TYPE1 *__restrict b, \ > + TYPE1 *__restrict c) \ > + { \ > + for (unsigned int i = 0; i < COUNT; i++) \ > + out[i] = !(CMP (a[i], c[i]) | CMP (b[i], c[i])) ? 3 : out[i]; \ > + } > + > +#define GT(A, B) ((A) > (B)) > + > +TEST_FCM_NOR (uint64_t, float, GT, 32) > +TEST_FCM_NOR (uint64_t, _Float16, GT, 32) > +TEST_FCM_NOR (uint32_t, _Float16, GT, 64) > + > +TEST_FCM_NOR (uint64_t, float, __builtin_isunordered, 32) > +TEST_FCM_NOR (uint64_t, _Float16, __builtin_isunordered, 32) > +TEST_FCM_NOR (uint32_t, _Float16, __builtin_isunordered, 64) > + > +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d} 6 } } */ > +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s} 6 } } */ > +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.d} 6 } } */ > + > +/* { dg-final { scan-assembler-not {\tbic\t} } } */ > +/* { dg-final { scan-assembler-not {\tnot\t} } } */ > +/* { dg-final { scan-assembler-times {\tnor\tp[0-9]+\.b, p[0-9]+/z, > p[0-9]+\.b, p[0-9]+\.b\n} 6 } } */