On Fri, Jun 06, 2025 at 04:04:18PM +0100, Richard Sandiford wrote: > Spencer Abson <spencer.ab...@arm.com> writes: > > Extend the ternary op/UNSPEC_SEL combiner patterns from SVE_FULL_F/ > > SVE_FULL_F_BF to SVE_F/SVE_F_BF, where the strictness value is > > SVE_RELAXED_GP. > > > > We can only reliably test the 'merging with the third input' (addend) > > and 'independent value' patterns at this stage as the canocalisation that > > reorders the multiplicands based on the second SEL input would be performed > > by the conditional expander. > > > > Another difficulty is that we can't test these fused multiply/SEL combines > > without using __builtin_fma and friends. The reason for this is as follows: > > > > We support COND_ADD, COND_SUB, and COND_MUL optabs, so match.pd will > > canonicalize patterns like ADD/SUB/MUL combined with a VEC_COND_EXPR into > > these conditional forms. Later, when widening_mul tries to fold these into > > conditional fused multiply operations, the transformation fails - simply > > because we haven’t implemented those conditional fused multiply optabs yet. > > > > Hence why this patch lacks tests for BFloat16... > > > > gcc/ChangeLog: > > > > * config/aarch64/aarch64-sve.md (*cond_<optab><mode>_2_relaxed): > > Extend from SVE_FULL_F to SVE_F. > > (*cond_<optab><mode>_4_relaxed): Extend from SVE_FULL_F_B16B16 > > to SVE_F_B16B16. > > (*cond_<optab><mode>_any_relaxed): Likewise. > > > > gcc/testsuite/ChangeLog: > > > > * gcc.target/aarch64/sve/unpacked_cond_fmla_1.c: New test. > > * gcc.target/aarch64/sve/unpacked_cond_fmls_1.c: Likewise. > > * gcc.target/aarch64/sve/unpacked_cond_fnmla_1.c: Likewise. > > * gcc.target/aarch64/sve/unpacked_cond_fnmls_1.c: Likewise. > > OK, thanks. > > BTW, I just realised that all my comments saying "please add a token > test that SEL doesn't get dropped" are probably completely bogus, > since those cases will instead be handed by the follow-on patches > to the conditional optabs. Is that right? Please ignore if so. :)
Yeah, that's true where we have a conditional optab, but that isn't the case for any of the interesting unary operations. Maybe I should include '_2.c' style tests for each of those, that explicitly test for SEL? Thanks, Spencer > > Sorry, I've been thinking about this a patch at a time while ignoring > the bigger picture. > > Richard > > > --- > > gcc/config/aarch64/aarch64-sve.md | 38 ++++++++-------- > > .../aarch64/sve/unpacked_cond_fmla_1.c | 43 +++++++++++++++++++ > > .../aarch64/sve/unpacked_cond_fmls_1.c | 43 +++++++++++++++++++ > > .../aarch64/sve/unpacked_cond_fnmla_1.c | 43 +++++++++++++++++++ > > .../aarch64/sve/unpacked_cond_fnmls_1.c | 43 +++++++++++++++++++ > > 5 files changed, 191 insertions(+), 19 deletions(-) > > create mode 100644 > > gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmla_1.c > > create mode 100644 > > gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmls_1.c > > create mode 100644 > > gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fnmla_1.c > > create mode 100644 > > gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fnmls_1.c > > > > diff --git a/gcc/config/aarch64/aarch64-sve.md > > b/gcc/config/aarch64/aarch64-sve.md > > index 8c1921ddf5c..e5443980e8b 100644 > > --- a/gcc/config/aarch64/aarch64-sve.md > > +++ b/gcc/config/aarch64/aarch64-sve.md > > @@ -7622,15 +7622,15 @@ > > ;; Predicated floating-point ternary operations, merging with the > > ;; first input. > > (define_insn_and_rewrite "*cond_<optab><mode>_2_relaxed" > > - [(set (match_operand:SVE_FULL_F 0 "register_operand") > > - (unspec:SVE_FULL_F > > + [(set (match_operand:SVE_F 0 "register_operand") > > + (unspec:SVE_F > > [(match_operand:<VPRED> 1 "register_operand") > > - (unspec:SVE_FULL_F > > + (unspec:SVE_F > > [(match_operand 5) > > (const_int SVE_RELAXED_GP) > > - (match_operand:SVE_FULL_F 2 "register_operand") > > - (match_operand:SVE_FULL_F 3 "register_operand") > > - (match_operand:SVE_FULL_F 4 "register_operand")] > > + (match_operand:SVE_F 2 "register_operand") > > + (match_operand:SVE_F 3 "register_operand") > > + (match_operand:SVE_F 4 "register_operand")] > > SVE_COND_FP_TERNARY) > > (match_dup 2)] > > UNSPEC_SEL))] > > @@ -7668,15 +7668,15 @@ > > ;; Predicated floating-point ternary operations, merging with the > > ;; third input. > > (define_insn_and_rewrite "*cond_<optab><mode>_4_relaxed" > > - [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand") > > - (unspec:SVE_FULL_F_B16B16 > > + [(set (match_operand:SVE_F_B16B16 0 "register_operand") > > + (unspec:SVE_F_B16B16 > > [(match_operand:<VPRED> 1 "register_operand") > > - (unspec:SVE_FULL_F_B16B16 > > + (unspec:SVE_F_B16B16 > > [(match_operand 5) > > (const_int SVE_RELAXED_GP) > > - (match_operand:SVE_FULL_F_B16B16 2 "register_operand") > > - (match_operand:SVE_FULL_F_B16B16 3 "register_operand") > > - (match_operand:SVE_FULL_F_B16B16 4 "register_operand")] > > + (match_operand:SVE_F_B16B16 2 "register_operand") > > + (match_operand:SVE_F_B16B16 3 "register_operand") > > + (match_operand:SVE_F_B16B16 4 "register_operand")] > > SVE_COND_FP_TERNARY) > > (match_dup 4)] > > UNSPEC_SEL))] > > @@ -7714,17 +7714,17 @@ > > ;; Predicated floating-point ternary operations, merging with an > > ;; independent value. > > (define_insn_and_rewrite "*cond_<optab><mode>_any_relaxed" > > - [(set (match_operand:SVE_FULL_F_B16B16 0 "register_operand") > > - (unspec:SVE_FULL_F_B16B16 > > + [(set (match_operand:SVE_F_B16B16 0 "register_operand") > > + (unspec:SVE_F_B16B16 > > [(match_operand:<VPRED> 1 "register_operand") > > - (unspec:SVE_FULL_F_B16B16 > > + (unspec:SVE_F_B16B16 > > [(match_operand 6) > > (const_int SVE_RELAXED_GP) > > - (match_operand:SVE_FULL_F_B16B16 2 "register_operand") > > - (match_operand:SVE_FULL_F_B16B16 3 "register_operand") > > - (match_operand:SVE_FULL_F_B16B16 4 "register_operand")] > > + (match_operand:SVE_F_B16B16 2 "register_operand") > > + (match_operand:SVE_F_B16B16 3 "register_operand") > > + (match_operand:SVE_F_B16B16 4 "register_operand")] > > SVE_COND_FP_TERNARY) > > - (match_operand:SVE_FULL_F_B16B16 5 "aarch64_simd_reg_or_zero")] > > + (match_operand:SVE_F_B16B16 5 "aarch64_simd_reg_or_zero")] > > UNSPEC_SEL))] > > "TARGET_SVE > > && (<supports_bf16> || !<is_bf16>) > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmla_1.c > > b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmla_1.c > > new file mode 100644 > > index 00000000000..e2a5bdab1dd > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmla_1.c > > @@ -0,0 +1,43 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=2048 > > -fno-trapping-math" } */ > > + > > +#include <stdint.h> > > + > > +#define FMLA(SUFF) __builtin_fma##SUFF (a[i], b[i], c[i]) > > +#define FMLS(SUFF) __builtin_fma##SUFF (a[i], -b[i], c[i]) > > +#define FNMLA(SUFF) -FMLA (SUFF) > > +#define FNMLS(SUFF) -FMLS (SUFF) > > + > > +#define a_i a[i] > > +#define b_i b[i] > > +#define c_i c[i] > > + > > +#define TEST_FN(FN, TYPE0, TYPE1, COUNT, MERGE) \ > > + void \ > > + f_##TYPE0##_##TYPE1##_##MERGE (TYPE0 *__restrict out, \ > > + TYPE0 *__restrict a, \ > > + TYPE0 *__restrict b, \ > > + TYPE0 *__restrict c, \ > > + TYPE1 *__restrict p) \ > > + { \ > > + for (unsigned int i = 0; i < COUNT; i++) \ > > + out[i] = p[i] ? FN : MERGE; \ > > + } > > + > > +#define TEST_ALL(FN, TYPE0, TYPE1, COUNT) \ > > + TEST_FN (FN, TYPE0, TYPE1, COUNT, c_i) \ > > + TEST_FN (FN, TYPE0, TYPE1, COUNT, 5) > > + > > +TEST_ALL (FMLA (f16), _Float16, uint64_t, 32) > > + > > +TEST_ALL (FMLA (f16), _Float16, uint32_t, 64) > > + > > +TEST_ALL (FMLA (f32), float, uint64_t, 32) > > + > > +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, > > z[0-9]+\.s\n} 1 } } */ > > +/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.s, p[0-7]/m, > > z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ > > + > > +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m, > > z[0-9]+\.h\n} 2 } } */ > > +/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.h, p[0-7]/m, > > z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */ > > + > > +/* { dg-final { scan-assembler-not {\tsel\t} } } */ > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmls_1.c > > b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmls_1.c > > new file mode 100644 > > index 00000000000..5a83fab8ec7 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fmls_1.c > > @@ -0,0 +1,43 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=2048 > > -fno-trapping-math" } */ > > + > > +#include <stdint.h> > > + > > +#define FMLA(SUFF) __builtin_fma##SUFF (a[i], b[i], c[i]) > > +#define FMLS(SUFF) __builtin_fma##SUFF (a[i], -b[i], c[i]) > > +#define FNMLA(SUFF) -FMLA (SUFF) > > +#define FNMLS(SUFF) -FMLS (SUFF) > > + > > +#define a_i a[i] > > +#define b_i b[i] > > +#define c_i c[i] > > + > > +#define TEST_FN(FN, TYPE0, TYPE1, COUNT, MERGE) \ > > + void \ > > + f_##TYPE0##_##TYPE1##_##MERGE (TYPE0 *__restrict out, \ > > + TYPE0 *__restrict a, \ > > + TYPE0 *__restrict b, \ > > + TYPE0 *__restrict c, \ > > + TYPE1 *__restrict p) \ > > + { \ > > + for (unsigned int i = 0; i < COUNT; i++) \ > > + out[i] = p[i] ? FN : MERGE; \ > > + } > > + > > +#define TEST_ALL(FN, TYPE0, TYPE1, COUNT) \ > > + TEST_FN (FN, TYPE0, TYPE1, COUNT, c_i) \ > > + TEST_FN (FN, TYPE0, TYPE1, COUNT, 5) > > + > > +TEST_ALL (FMLS (f16), _Float16, uint64_t, 32) > > + > > +TEST_ALL (FMLS (f16), _Float16, uint32_t, 64) > > + > > +TEST_ALL (FMLS (f32), float, uint64_t, 32) > > + > > +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, > > z[0-9]+\.s\n} 1 } } */ > > +/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.s, p[0-7]/m, > > z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ > > + > > +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m, > > z[0-9]+\.h\n} 2 } } */ > > +/* { dg-final { scan-assembler-times {\tfmls\tz[0-9]+\.h, p[0-7]/m, > > z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */ > > + > > +/* { dg-final { scan-assembler-not {\tsel\t} } } */ > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fnmla_1.c > > b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fnmla_1.c > > new file mode 100644 > > index 00000000000..5d4eff806dd > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fnmla_1.c > > @@ -0,0 +1,43 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=2048 > > -fno-trapping-math" } */ > > + > > +#include <stdint.h> > > + > > +#define FMLA(SUFF) __builtin_fma##SUFF (a[i], b[i], c[i]) > > +#define FMLS(SUFF) __builtin_fma##SUFF (a[i], -b[i], c[i]) > > +#define FNMLA(SUFF) -FMLA (SUFF) > > +#define FNMLS(SUFF) -FMLS (SUFF) > > + > > +#define a_i a[i] > > +#define b_i b[i] > > +#define c_i c[i] > > + > > +#define TEST_FN(FN, TYPE0, TYPE1, COUNT, MERGE) \ > > + void \ > > + f_##TYPE0##_##TYPE1##_##MERGE (TYPE0 *__restrict out, \ > > + TYPE0 *__restrict a, \ > > + TYPE0 *__restrict b, \ > > + TYPE0 *__restrict c, \ > > + TYPE1 *__restrict p) \ > > + { \ > > + for (unsigned int i = 0; i < COUNT; i++) \ > > + out[i] = p[i] ? FN : MERGE; \ > > + } > > + > > +#define TEST_ALL(FN, TYPE0, TYPE1, COUNT) \ > > + TEST_FN (FN, TYPE0, TYPE1, COUNT, c_i) \ > > + TEST_FN (FN, TYPE0, TYPE1, COUNT, 5) > > + > > +TEST_ALL (FNMLA (f16), _Float16, uint64_t, 32) > > + > > +TEST_ALL (FNMLA (f16), _Float16, uint32_t, 64) > > + > > +TEST_ALL (FNMLA (f32), float, uint64_t, 32) > > + > > +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, > > z[0-9]+\.s\n} 1 } } */ > > +/* { dg-final { scan-assembler-times {\tfnmla\tz[0-9]+\.s, p[0-7]/m, > > z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ > > + > > +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m, > > z[0-9]+\.h\n} 2 } } */ > > +/* { dg-final { scan-assembler-times {\tfnmla\tz[0-9]+\.h, p[0-7]/m, > > z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */ > > + > > +/* { dg-final { scan-assembler-not {\tsel\t} } } */ > > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fnmls_1.c > > b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fnmls_1.c > > new file mode 100644 > > index 00000000000..5569bbabd7d > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cond_fnmls_1.c > > @@ -0,0 +1,43 @@ > > +/* { dg-do compile } */ > > +/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=2048 > > -fno-trapping-math" } */ > > + > > +#include <stdint.h> > > + > > +#define FMLA(SUFF) __builtin_fma##SUFF (a[i], b[i], c[i]) > > +#define FMLS(SUFF) __builtin_fma##SUFF (a[i], -b[i], c[i]) > > +#define FNMLA(SUFF) -FMLA (SUFF) > > +#define FNMLS(SUFF) -FMLS (SUFF) > > + > > +#define a_i a[i] > > +#define b_i b[i] > > +#define c_i c[i] > > + > > +#define TEST_FN(FN, TYPE0, TYPE1, COUNT, MERGE) \ > > + void \ > > + f_##TYPE0##_##TYPE1##_##MERGE (TYPE0 *__restrict out, \ > > + TYPE0 *__restrict a, \ > > + TYPE0 *__restrict b, \ > > + TYPE0 *__restrict c, \ > > + TYPE1 *__restrict p) \ > > + { \ > > + for (unsigned int i = 0; i < COUNT; i++) \ > > + out[i] = p[i] ? FN : MERGE; \ > > + } > > + > > +#define TEST_ALL(FN, TYPE0, TYPE1, COUNT) \ > > + TEST_FN (FN, TYPE0, TYPE1, COUNT, c_i) \ > > + TEST_FN (FN, TYPE0, TYPE1, COUNT, 5) > > + > > +TEST_ALL (FNMLS (f16), _Float16, uint64_t, 32) > > + > > +TEST_ALL (FNMLS (f16), _Float16, uint32_t, 64) > > + > > +TEST_ALL (FNMLS (f32), float, uint64_t, 32) > > + > > +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.s, p[0-7]/m, > > z[0-9]+\.s\n} 1 } } */ > > +/* { dg-final { scan-assembler-times {\tfnmls\tz[0-9]+\.s, p[0-7]/m, > > z[0-9]+\.s, z[0-9]+\.s\n} 2 } } */ > > + > > +/* { dg-final { scan-assembler-times {\tmovprfx\tz[0-9]+\.h, p[0-7]/m, > > z[0-9]+\.h\n} 2 } } */ > > +/* { dg-final { scan-assembler-times {\tfnmls\tz[0-9]+\.h, p[0-7]/m, > > z[0-9]+\.h, z[0-9]+\.h\n} 4 } } */ > > + > > +/* { dg-final { scan-assembler-not {\tsel\t} } } */