On Thu, Jun 05, 2025 at 06:11:44PM +0100, Richard Sandiford wrote:
> Spencer Abson <spencer.ab...@arm.com> writes:
> > @@ -9487,21 +9489,39 @@
> >  ;; - FCVTZU
> >  ;; -------------------------------------------------------------------------
> >  
> > -;; Unpredicated conversion of floats to integers of the same size (HF to HI,
> > -;; SF to SI or DF to DI).
> > -(define_expand "<optab><mode><v_int_equiv>2"
> > -  [(set (match_operand:<V_INT_EQUIV> 0 "register_operand")
> > -	(unspec:<V_INT_EQUIV>
> > +;; Unpredicated conversion of floats to integers of the same size or wider,
> > +;; excluding conversions from DF (see below).
> > +(define_expand "<optab><SVE_HSF:mode><SVE_HSDI:mode>2"
> > +  [(set (match_operand:SVE_HSDI 0 "register_operand")
> > +	(unspec:SVE_HSDI
> > +	  [(match_dup 2)
> > +	   (match_dup 3)
> > +	   (match_operand:SVE_HSF 1 "register_operand")]
> > +	  SVE_COND_FCVTI))]
> > +  "TARGET_SVE
> > +   && (~(<SVE_HSDI:self_mask> | <SVE_HSDI:narrower_mask>) & <SVE_HSF:self_mask>) == 0"
> > +  {
> > +    operands[2] = aarch64_sve_fp_pred (<SVE_HSDI:MODE>mode, &operands[3]);
> > +  }
> > +)
> > +
> > +;; SI <- DF can't use SI <- trunc (DI <- DF) without -ffast-math, so this
> > +;; truncating variant of FCVTZ{S,U} is useful for auto-vectorization.
> > +;;
> > +;; DF is the only source mode for which the mask used above doesn't apply,
> > +;; we define a separate pattern for it here.
> > +(define_expand "<optab><VNx2DF_ONLY:mode><SVE_2SDI:mode>2"
> > +  [(set (match_operand:SVE_2SDI 0 "register_operand")
> > +	(unspec:SVE_2SDI
> >  	  [(match_dup 2)
> >  	   (const_int SVE_RELAXED_GP)
> > -	   (match_operand:SVE_FULL_F 1 "register_operand")]
> > +	   (match_operand:VNx2DF_ONLY 1 "register_operand")]
> >  	  SVE_COND_FCVTI))]
> >    "TARGET_SVE"
> >    {
> > -    operands[2] = aarch64_ptrue_reg (<VPRED>mode);
> > +    operands[2] = aarch64_ptrue_reg (VNx2BImode);
> >    }
> >  )
> > -
> 
> Sorry for the formatting nit, but: please keep the blank line.
> 
> >  ;; Predicated float-to-integer conversion, either to the same width or wider.
> >  (define_insn "@aarch64_sve_<optab>_nontrunc<SVE_FULL_F:mode><SVE_FULL_HSDI:mode>"
> >    [(set (match_operand:SVE_FULL_HSDI 0 "register_operand")
> > @@ -9517,18 +9537,34 @@
> >  }
> >  )
> >  
> > +;; As above, for pairs used by the auto-vectorizer only.
> > +(define_insn "*aarch64_sve_<optab>_nontrunc<SVE_PARTIAL_F:mode><SVE_HSDI:mode>"
> > +  [(set (match_operand:SVE_HSDI 0 "register_operand")
> > +	(unspec:SVE_HSDI
> > +	  [(match_operand:<SVE_HSDI:VPRED> 1 "aarch64_predicate_operand")
> > +	   (match_operand:SI 3 "aarch64_sve_gp_strictness")
> > +	   (match_operand:SVE_PARTIAL_F 2 "register_operand")]
> > +	  SVE_COND_FCVTI))]
> > +  "TARGET_SVE
> > +   && (~(<SVE_HSDI:self_mask> | <SVE_HSDI:narrower_mask>) & <SVE_PARTIAL_F:self_mask>) == 0"
> > +  {@ [ cons: =0 , 1   , 2 ; attrs: movprfx ]
> > +     [ w        , Upl , 0 ; *              ] fcvtz<su>\t%0.<SVE_HSDI:Vetype>, %1/m, %2.<SVE_PARTIAL_F:Vetype>
> > +     [ ?&w      , Upl , w ; yes            ] movprfx\t%0, %2\;fcvtz<su>\t%0.<SVE_HSDI:Vetype>, %1/m, %2.<SVE_PARTIAL_F:Vetype>
> > +  }
> > +)
> > +
> >  ;; Predicated narrowing float-to-integer conversion.
> 
> I think it would be worth extending the comment here, in case it isn't
> obvious what's going on:
> 
> ;; Predicated narrowing float-to-integer conversion.  The VNx2DF->VNx4SI
> ;; variant is provided for the ACLE, where the zeroed odd-indexed lanes are
> ;; significant.  The VNx2DF->VNx2SI variant is provided for autovectorization,
> ;; where the odd-indexed lanes are ignored.
Sounds good, thanks.

> > -(define_insn "@aarch64_sve_<optab>_trunc<VNx2DF_ONLY:mode><VNx4SI_ONLY:mode>"
> > -  [(set (match_operand:VNx4SI_ONLY 0 "register_operand")
> > -	(unspec:VNx4SI_ONLY
> > +(define_insn "@aarch64_sve_<optab>_trunc<VNx2DF_ONLY:mode><SVE_SI:mode>"
> > +  [(set (match_operand:SVE_SI 0 "register_operand")
> > +	(unspec:SVE_SI
> >  	  [(match_operand:VNx2BI 1 "register_operand")
> >  	   (match_operand:SI 3 "aarch64_sve_gp_strictness")
> >  	   (match_operand:VNx2DF_ONLY 2 "register_operand")]
> >  	  SVE_COND_FCVTI))]
> >    "TARGET_SVE"
> >    {@ [ cons: =0 , 1   , 2 ; attrs: movprfx ]
> > -     [ w        , Upl , 0 ; *              ] fcvtz<su>\t%0.<VNx4SI_ONLY:Vetype>, %1/m, %2.<VNx2DF_ONLY:Vetype>
> > -     [ ?&w      , Upl , w ; yes            ] movprfx\t%0, %2\;fcvtz<su>\t%0.<VNx4SI_ONLY:Vetype>, %1/m, %2.<VNx2DF_ONLY:Vetype>
> > +     [ w        , Upl , 0 ; *              ] fcvtz<su>\t%0.<SVE_SI:Vetype>, %1/m, %2.<VNx2DF_ONLY:Vetype>
> > +     [ ?&w      , Upl , w ; yes            ] movprfx\t%0, %2\;fcvtz<su>\t%0.<SVE_SI:Vetype>, %1/m, %2.<VNx2DF_ONLY:Vetype>
> >    }
> >  )
> >  
> > [...]
> > diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> > index 59ac08483f4..b13fce2a859 100644
> > --- a/gcc/config/aarch64/aarch64.cc
> > +++ b/gcc/config/aarch64/aarch64.cc
> > @@ -3855,6 +3855,44 @@ aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
> >    return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
> >  }
> >  
> > +
> > +/* Generate a predicate to control partial SVE mode DATA_MODE as if it
> > +   were fully packed, enabling the defined elements only.  */
> > +rtx
> > +aarch64_sve_packed_pred (machine_mode data_mode)
> > +{
> > +  unsigned int container_bytes
> > +    = aarch64_sve_container_bits (data_mode) / BITS_PER_UNIT;
> > +  /* Enable the significand of each container only.  */
> > +  rtx ptrue = force_reg (VNx16BImode, aarch64_ptrue_all (container_bytes));
> > +  /* Predicate at the element size.  */
> > +  machine_mode pmode
> > +    = aarch64_sve_pred_mode (GET_MODE_UNIT_SIZE (data_mode)).require ();
> > +  return gen_lowpart (pmode, ptrue);
> > +}
> > +
> > +/* Generate a predicate and strictness value to govern a floating-point
> > +   operation with SVE mode DATA_MODE.
> > +
> > +   If DATA_MODE is partial vector mode, this pair prevents the operation
> 
> is a partial vector mode
> 
> > +   from interpreting undefined elements - unless we don't need to control
> > +   it's trapping behavior.  */
> 
> its
> 
> > +rtx
> > +aarch64_sve_fp_pred (machine_mode data_mode, rtx *strictness)
> > +{
> > +  unsigned int vec_flags = aarch64_classify_vector_mode (data_mode);
> > +  if (flag_trapping_math && (vec_flags & VEC_PARTIAL))
> > +    {
> > +      if (strictness)
> > +	*strictness = gen_int_mode (SVE_STRICT_GP, SImode);
> > +      return aarch64_sve_packed_pred (data_mode);
> > +    }
> > +  if (strictness)
> > +    *strictness = gen_int_mode (SVE_RELAXED_GP, SImode);
> > +  /* Use the natural predicate mode.  */
> > +  return aarch64_ptrue_reg (aarch64_sve_pred_mode (data_mode));
> > +}
> > +
> >  /* Emit a comparison CMP between OP0 and OP1, both of which have mode
> >     DATA_MODE, and return the result in a predicate of mode PRED_MODE.
> >     Use TARGET as the target register if nonnull and convenient.  */
> > [...]
> > diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
> > index 2c6af831eae..ed36cd72ea3 100644
> > --- a/gcc/config/aarch64/predicates.md
> > +++ b/gcc/config/aarch64/predicates.md
> > @@ -587,6 +587,10 @@
> >      return aarch64_simd_shift_imm_p (op, mode, false);
> >  })
> >  
> > +(define_special_predicate "aarch64_predicate_operand"
> > +  (and (match_test "register_operand (op, GET_MODE (op))")
> > +       (match_test "aarch64_sve_valid_pred_p (op, mode)")))
> 
> Since this is (rightly) invoking register_operand directly,
> rather than using:
> 
>   (match_operand 0 "register_operand")
> 
> (which would pass the wrong mode), the gen* tools won't be able to infer that
> the predicate only accepts reg and subreg.  So I think we should make it:
> 
>   (and (match_code "reg,subreg")
>        (match_test "register_operand (op, GET_MODE (op))")
>        (match_test "aarch64_sve_valid_pred_p (op, mode)")))

Right, thanks!

> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cvtf_1.c b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cvtf_1.c
> > new file mode 100644
> > index 00000000000..115182faf3a
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cvtf_1.c
> > @@ -0,0 +1,217 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -msve-vector-bits=2048 -fno-schedule-insns2" } */
> 
> Might be good to pass -fno-schedule-insns too, in case it gets enabled
> again by default (e.g. if it is finally tuned for OoO cores, and if the
> compile-time problems are addressed).
> 
> Same for other tests in the series.
> 
> > [...]
> > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cvtf_3.c b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cvtf_3.c
> > new file mode 100644
> > index 00000000000..964264c4114
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/unpacked_cvtf_3.c
> > @@ -0,0 +1,12 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -ftree-vectorize" } */
> > +
> > +#include <stdint.h>
> > +
> > +void f64_i32 (double *restrict x, int32_t *restrict y, int n)
> > +{
> > +  for (int i = 0; i < n; i++)
> > +    x[i] = (double)y[i];
> > +}
> > +
> > +/* { dg-final { scan-assembler-times {\tscvtf\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d\n} 1 } } */
> 
> I think we should accept \.[sd] for the source, since IMO it would be
> more natural to do this without the sign-extension to 64 bits.
> Or have I misunderstood the point of the test?

Ah, sorry - I should have left a comment here.

Just to recap, the condition used in the integer-to-float expander:

  && (~(<SVE_HSDI:self_mask> | <SVE_HSDI:narrower_mask>) & <SVE_F:self_mask>) == 0

does not cover the extending SI->DF variant of SCVTF.  This means that a
call to __builtin_convertvector with this pair will be lowered piecewise
by veclower.  However, the vectorizer can always use the DI->DF variant
here (after a sign-extension); I was OK with this and just tested it
separately here (a rough sketch of both cases is at the end of this mail).

Perhaps it's worth defining floatvnx2sivnx2df2.  What do you think?

Thanks,
Spencer

> Looks really good otherwise, so ok with those changes.
> 
> Thanks,
> Richard
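P.S. For reference, here is a rough sketch of the two cases I mean above.
This is an illustration only, not part of the series; the function names
(convert_pair, f64_i32_loop) are made up, and it assumes GCC maps the GNU
vector types below onto the partial SVE modes under discussion:

  #include <stdint.h>

  /* GNU vector types covering a pair of elements each.  */
  typedef int32_t v2si __attribute__ ((vector_size (8)));
  typedef double v2df __attribute__ ((vector_size (16)));

  /* With no SI->DF vector conversion pattern, I'd expect this
     __builtin_convertvector call to be lowered piecewise by veclower.  */
  v2df
  convert_pair (v2si x)
  {
    return __builtin_convertvector (x, v2df);
  }

  /* The loop case: the vectorizer is free to sign-extend each int32_t
     to int64_t and then use the DI->DF form of SCVTF, which is what
     unpacked_cvtf_3.c checks for.  */
  void
  f64_i32_loop (double *restrict x, int32_t *restrict y, int n)
  {
    for (int i = 0; i < n; i++)
      x[i] = (double) y[i];
  }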