Tamar Christina <tamar.christ...@arm.com> writes: > Hi All, > > The following example > > void f5(float * restrict z0, float * restrict z1, float *restrict x, > float * restrict y, float c, int n) > { > for (int i = 0; i < n; i++) { > float a = x[i]; > float b = y[i]; > if (a > b) { > z0[i] = a + b; > if (a > c) { > z1[i] = a - b; > } > } > } > } > > generates currently: > > ptrue p3.b, all > ld1w z1.s, p1/z, [x2, x5, lsl 2] > ld1w z2.s, p1/z, [x3, x5, lsl 2] > fcmgt p0.s, p3/z, z1.s, z0.s > fcmgt p2.s, p1/z, z1.s, z2.s > fcmgt p0.s, p0/z, z1.s, z2.s > > The conditions for a > b and a > c become separate comparisons. > > After this patch using a 2 -> 2 split we generate: > > ld1w z1.s, p0/z, [x2, x5, lsl 2] > ld1w z2.s, p0/z, [x3, x5, lsl 2] > fcmgt p1.s, p0/z, z1.s, z2.s > fcmgt p1.s, p1/z, z1.s, z0.s > > Where the conditions a > b && a > c are folded by using the predicate result of > the previous compare, which allows the removal of one of the compares. > > Note: This patch series is working incrementally towards generating the most > efficient code for this and other loops in small steps.
It looks like this could be done in the vectoriser via an extension of the scalar_cond_masked_set mechanism. We have: mask__54.13_59 = vect_a_15.9_55 > vect_b_17.12_58; vec_mask_and_60 = loop_mask_32 & mask__54.13_59; … mask__30.17_67 = vect_a_15.9_55 > vect_cst__66; mask__29.18_68 = mask__54.13_59 & mask__30.17_67; vec_mask_and_69 = loop_mask_32 & mask__29.18_68; When vectorising mask__29.18_68, we could test whether each side of the "&" is already in scalar_cond_masked_set and AND in the loop mask if so, like we do in vectorizable_condition. We could then separately record that the & result includes the loop mask. Thanks, Richard > Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. > > Ok for master? > > Thanks, > Tamar > > gcc/ChangeLog: > > * config/aarch64/aarch64-sve.md (*mask_cmp_and_combine): New. > > gcc/testsuite/ChangeLog: > > * gcc.target/aarch64/sve/pred-combine-and.c: New test. > > --- inline copy of patch -- > diff --git a/gcc/config/aarch64/aarch64-sve.md > b/gcc/config/aarch64/aarch64-sve.md > index > 2c23c6b12bafb038d82920e7141a418e078a2c65..ee9d32c0a5534209689d9d3abaa560ee5b66347d > 100644 > --- a/gcc/config/aarch64/aarch64-sve.md > +++ b/gcc/config/aarch64/aarch64-sve.md > @@ -8162,6 +8162,48 @@ (define_insn_and_split "*mask_inv_combine" > } > ) > > +;; Combine multiple masks where the comparison operators are the same and > +;; each comparison has one parameter shared. e.g. 
combine a > b && a > c > +(define_insn_and_split "*mask_cmp_and_combine" > + [(set (match_operand:<VPRED> 0 "register_operand" "=Upa") > + (and:<VPRED> > + (and:<VPRED> > + (unspec:<VPRED> > + [(match_operand:<VPRED> 1) > + (const_int SVE_KNOWN_PTRUE) > + (match_operand:SVE_FULL_F 2 "register_operand" "w") > + (match_operand:SVE_FULL_F 3 "aarch64_simd_reg_or_zero" "wDz")] > + SVE_COND_FP_CMP_I0) > + (unspec:<VPRED> > + [(match_dup 1) > + (const_int SVE_KNOWN_PTRUE) > + (match_dup 2) > + (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero" "wDz")] > + SVE_COND_FP_CMP_I0)) > + (match_operand:<VPRED> 5 "register_operand" "Upa"))) > + (clobber (match_scratch:<VPRED> 6 "=&Upa"))] > + "TARGET_SVE" > + "#" > + "&& 1" > + [(set (match_dup 6) > + (unspec:<VPRED> > + [(match_dup 5) > + (const_int SVE_MAYBE_NOT_PTRUE) > + (match_dup 2) > + (match_dup 3)] > + SVE_COND_FP_CMP_I0)) > + (set (match_dup 0) > + (unspec:<VPRED> > + [(match_dup 6) > + (const_int SVE_MAYBE_NOT_PTRUE) > + (match_dup 2) > + (match_dup 4)] > + SVE_COND_FP_CMP_I0))] > +{ > + operands[6] = gen_reg_rtx (<VPRED>mode); > +} > +) > + > ;; ------------------------------------------------------------------------- > ;; ---- [FP] Absolute comparisons > ;; ------------------------------------------------------------------------- > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c > b/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..d395b7f84bb15b588493611df5a47549726ac24a > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c > @@ -0,0 +1,18 @@ > +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ > +/* { dg-options "-O3 --save-temps" } */ > + > +void f5(float * restrict z0, float * restrict z1, float *restrict x, float * > restrict y, float c, int n) > +{ > + for (int i = 0; i < n; i++) { > + float a = x[i]; > + float b = y[i]; > + if (a > b) { > + z0[i] = a + b; > + if (a > c) { > + 
z1[i] = a - b; > + } > + } > + } > +} > + > +/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-9]+/z, > z[0-9]+\.s, z[0-9]+\.s} 2 } } */