Dhruv Chawla <dhr...@nvidia.com> writes:
> This patch modifies the intrinsic expanders to expand svlsl and svlsr to
> unpredicated forms when the predicate is a ptrue.  It also folds the
> following pattern:
>
>   lsl <y>, <x>, <shift>
>   lsr <z>, <x>, <shift>
>   orr <r>, <y>, <z>
>
> to:
>
>   revb/h/w <r>, <x>
>
> when the shift amount is equal to half the bitwidth of the <x>
> register.
>
> Bootstrapped and regtested on aarch64-linux-gnu.
>
> Signed-off-by: Dhruv Chawla <dhr...@nvidia.com>
>
> gcc/ChangeLog:
>
>         * config/aarch64/aarch64-sve-builtins-base.cc
>         (svlsl_impl::expand): Define.
>         (svlsr_impl): New class.
>         (svlsr_impl::fold): Define.
>         (svlsr_impl::expand): Likewise.
>         * config/aarch64/aarch64-sve.md
>         (*v_rev<mode>): New pattern.
>         (*v_revvnx8hi): Likewise.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/aarch64/sve/shift_rev_1.c: New test.
>         * gcc.target/aarch64/sve/shift_rev_2.c: Likewise.
> ---
>  .../aarch64/aarch64-sve-builtins-base.cc   | 33 +++++++-
>  gcc/config/aarch64/aarch64-sve.md          | 49 +++++++++++
>  .../gcc.target/aarch64/sve/shift_rev_1.c   | 83 +++++++++++++++++++
>  .../gcc.target/aarch64/sve/shift_rev_2.c   | 63 ++++++++++++++
>  4 files changed, 227 insertions(+), 1 deletion(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/shift_rev_1.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/shift_rev_2.c
>
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> index 927c5bbae21..938d010e11b 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> @@ -2086,6 +2086,37 @@ public:
>    {
>      return f.fold_const_binary (LSHIFT_EXPR);
>    }
> +
> +  rtx expand (function_expander &e) const override
> +  {
> +    tree pred = TREE_OPERAND (e.call_expr, 3);
> +    tree shift = TREE_OPERAND (e.call_expr, 5);
> +    if (is_ptrue (pred, GET_MODE_UNIT_SIZE (e.result_mode ()))
> +        && uniform_integer_cst_p (shift))
> +      return e.use_unpred_insn (e.direct_optab_handler (ashl_optab));
> +    return rtx_code_function::expand (e);
> +  }
> +};
> +
> +class svlsr_impl : public rtx_code_function
> +{
> +public:
> +  CONSTEXPR svlsr_impl () : rtx_code_function (LSHIFTRT, LSHIFTRT) {}
> +
> +  gimple *fold (gimple_folder &f) const override
> +  {
> +    return f.fold_const_binary (RSHIFT_EXPR);
> +  }
> +
> +  rtx expand (function_expander &e) const override
> +  {
> +    tree pred = TREE_OPERAND (e.call_expr, 3);
> +    tree shift = TREE_OPERAND (e.call_expr, 5);
> +    if (is_ptrue (pred, GET_MODE_UNIT_SIZE (e.result_mode ()))
> +        && uniform_integer_cst_p (shift))
> +      return e.use_unpred_insn (e.direct_optab_handler (lshr_optab));
> +    return rtx_code_function::expand (e);
> +  }
> };
>
> class svmad_impl : public function_base
> @@ -3572,7 +3603,7 @@ FUNCTION (svldnt1, svldnt1_impl,)
>  FUNCTION (svlen, svlen_impl,)
>  FUNCTION (svlsl, svlsl_impl,)
>  FUNCTION (svlsl_wide, shift_wide, (ASHIFT, UNSPEC_ASHIFT_WIDE))
> -FUNCTION (svlsr, rtx_code_function, (LSHIFTRT, LSHIFTRT))
> +FUNCTION (svlsr, svlsr_impl, )
>  FUNCTION (svlsr_wide, shift_wide, (LSHIFTRT, UNSPEC_LSHIFTRT_WIDE))
>  FUNCTION (svmad, svmad_impl,)
>  FUNCTION (svmax, rtx_code_function, (SMAX, UMAX, UNSPEC_COND_FMAX,

I'm hoping that this won't be necessary after the changes I mentioned in
patch 1.  The expander should handle everything itself.

> diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
> index 42802bac653..7cce18c024b 100644
> --- a/gcc/config/aarch64/aarch64-sve.md
> +++ b/gcc/config/aarch64/aarch64-sve.md
> @@ -3232,6 +3232,55 @@
>  ;; - REVW
>  ;; -------------------------------------------------------------------------
>
> +(define_insn_and_split "*v_rev<mode>"
> +  [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w")
> +        (rotate:SVE_FULL_HSDI
> +          (match_operand:SVE_FULL_HSDI 1 "register_operand" "w")
> +          (match_operand:SVE_FULL_HSDI 2 "aarch64_constant_vector_operand")))]
> +  "TARGET_SVE"
> +  "#"
> +  "&& !reload_completed"

This is an ICE trap: a pattern that requires a split ("#") must have an
unconditional split condition.  Which makes this awkward...argh!

However, since this is intended to match a 3-instruction combination,
I think we can do without the define_insn and just use a define_split.
That only works if we emit at most 2 instructions, but that can be done
with a bit of work.  I've attached a patch below that does that.

> +  [(set (match_dup 3)
> +        (ashift:SVE_FULL_HSDI (match_dup 1)
> +                              (match_dup 2)))
> +   (set (match_dup 0)
> +        (plus:SVE_FULL_HSDI
> +          (lshiftrt:SVE_FULL_HSDI (match_dup 1)
> +                                  (match_dup 4))
> +          (match_dup 3)))]

This is an SVE2 instruction, but the guard is only for TARGET_SVE.
For TARGET_SVE, we should FAIL if the aarch64_emit_opt_vec_rotate call fails.

> +  {
> +    if (aarch64_emit_opt_vec_rotate (operands[0], operands[1], operands[2]))
> +      DONE;
> +
> +    operands[3] = gen_reg_rtx (<MODE>mode);
> +    rtx shift_amount = unwrap_const_vec_duplicate (operands[2]);
> +    int bitwidth = GET_MODE_UNIT_BITSIZE (<MODE>mode);
> +    operands[4] = aarch64_simd_gen_const_vector_dup (<MODE>mode,
> +                                                     bitwidth - INTVAL (shift_amount));

Formatting nit, sorry, but: long line.  It could be avoided by applying
INTVAL directly to the unwrap_const_vec_duplicate call.

I can't remember if we discussed this before, but I think we could extend
this to partial vector modes (rather than SVE_FULL...) by using the
GET_MODE_UNIT_SIZE of the container mode.

> +  }
> +)
> +
> +;; The RTL combiners are able to combine "ior (ashift, ashiftrt)" to a "bswap".
> +;; Match that as well.
> +(define_insn_and_split "*v_revvnx8hi"
> +  [(set (match_operand:VNx8HI 0 "register_operand" "=w")
> +        (bswap:VNx8HI
> +          (match_operand 1 "register_operand" "w")))]

Another formatting nit, sorry, but the line break after the bswap seems
unnecessary.

> +  "TARGET_SVE"
> +  "#"
> +  "&& !reload_completed"

A similar comment here about the ICE trap.  Here we can handle it by adding:

  (clobber (match_scratch:VNx8BI 2 "=Upl"))

to the pattern above and:

> +  [(set (match_dup 0)
> +        (unspec:VNx8HI
> +          [(match_dup 2)
> +           (unspec:VNx8HI
> +             [(match_dup 1)]
> +             UNSPEC_REVB)]
> +          UNSPEC_PRED_X))]
> +  {
> +    operands[2] = aarch64_ptrue_reg (VNx8BImode);

moving CONSTM1_RTX (VNx8BImode) into operands[2] if !can_create_pseudo_p ().

> +  }
> +)
> +
>  ;; Predicated integer unary operations.
>  (define_insn "@aarch64_pred_<optab><mode>"
>    [(set (match_operand:SVE_FULL_I 0 "register_operand")
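
For the avoidance of doubt, putting those two changes together gives
something like the following (an untested sketch; the unconditional "&& 1"
split condition and the emit_move_insn for the post-reload case are how
I'd expect the pieces to fit together, rather than code I've tried):

  (define_insn_and_split "*v_revvnx8hi"
    [(set (match_operand:VNx8HI 0 "register_operand" "=w")
          (bswap:VNx8HI (match_operand:VNx8HI 1 "register_operand" "w")))
     (clobber (match_scratch:VNx8BI 2 "=Upl"))]
    "TARGET_SVE"
    "#"
    "&& 1"
    [(set (match_dup 0)
          (unspec:VNx8HI
            [(match_dup 2)
             (unspec:VNx8HI [(match_dup 1)] UNSPEC_REVB)]
            UNSPEC_PRED_X))]
    {
      if (can_create_pseudo_p ())
        operands[2] = aarch64_ptrue_reg (VNx8BImode);
      else
        /* The scratch has already been allocated a predicate register,
           so just make it an all-true predicate.  */
        emit_move_insn (operands[2], CONSTM1_RTX (VNx8BImode));
    }
  )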

> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/shift_rev_1.c b/gcc/testsuite/gcc.target/aarch64/sve/shift_rev_1.c
> new file mode 100644
> index 00000000000..3a30f80d152
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/shift_rev_1.c
> @@ -0,0 +1,83 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -march=armv8.2-a+sve" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +#include <arm_sve.h>
> +
> +/*
> +** ror32_sve_lsl_imm:
> +**    ptrue    p3.b, all
> +**    revw     z0.d, p3/m, z0.d
> +**    ret
> +*/
> +svuint64_t
> +ror32_sve_lsl_imm (svuint64_t r)
> +{
> +  return svorr_u64_z (svptrue_b64 (), svlsl_n_u64_z (svptrue_b64 (), r, 32),
> +                      svlsr_n_u64_z (svptrue_b64 (), r, 32));
> +}
> +
> +/*
> +** ror32_sve_lsl_operand:
> +**    ptrue    p3.b, all
> +**    revw     z0.d, p3/m, z0.d
> +**    ret
> +*/
> +svuint64_t
> +ror32_sve_lsl_operand (svuint64_t r)
> +{
> +  svbool_t pt = svptrue_b64 ();
> +  return svorr_u64_z (pt, svlsl_n_u64_z (pt, r, 32), svlsr_n_u64_z (pt, r, 32));
> +}
> +
> +/*
> +** ror16_sve_lsl_imm:
> +**    ptrue    p3.b, all
> +**    revh     z0.s, p3/m, z0.s
> +**    ret
> +*/
> +svuint32_t
> +ror16_sve_lsl_imm (svuint32_t r)
> +{
> +  return svorr_u32_z (svptrue_b32 (), svlsl_n_u32_z (svptrue_b32 (), r, 16),
> +                      svlsr_n_u32_z (svptrue_b32 (), r, 16));
> +}
> +
> +/*
> +** ror16_sve_lsl_operand:
> +**    ptrue    p3.b, all
> +**    revh     z0.s, p3/m, z0.s
> +**    ret
> +*/
> +svuint32_t
> +ror16_sve_lsl_operand (svuint32_t r)
> +{
> +  svbool_t pt = svptrue_b32 ();
> +  return svorr_u32_z (pt, svlsl_n_u32_z (pt, r, 16), svlsr_n_u32_z (pt, r, 16));
> +}
> +
> +/*
> +** ror8_sve_lsl_imm:
> +**    ptrue    p3.b, all
> +**    revb     z0.h, p3/m, z0.h
> +**    ret
> +*/
> +svuint16_t
> +ror8_sve_lsl_imm (svuint16_t r)
> +{
> +  return svorr_u16_z (svptrue_b16 (), svlsl_n_u16_z (svptrue_b16 (), r, 8),
> +                      svlsr_n_u16_z (svptrue_b16 (), r, 8));
> +}
> +
> +/*
> +** ror8_sve_lsl_operand:
> +**    ptrue    p3.b, all
> +**    revb     z0.h, p3/m, z0.h
> +**    ret
> +*/
> +svuint16_t
> +ror8_sve_lsl_operand (svuint16_t r)
> +{
> +  svbool_t pt = svptrue_b16 ();
> +  return svorr_u16_z (pt, svlsl_n_u16_z (pt, r, 8), svlsr_n_u16_z (pt, r, 8));
> +}

This only tests the revb/h/w case, not the lsl/usra fallback in the split
pattern.  The lsl/usra fallback generates pretty awful code if the rotated
value is passed in z0, so it might be worth adding an initial dummy argument
for that case.
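
E.g. something along these lines (untested, and the function name, dummy
argument and rotate amount are only for illustration -- any amount that the
REV trick can't handle would do):

  /* Rotating a 64-bit element by 16 can't use REVW, so this should
     exercise the lsl/usra fallback; the leading dummy argument keeps
     the rotated value out of z0.  */
  svuint64_t
  ror16_u64_fallback (svuint64_t dummy, svuint64_t r)
  {
    return svorr_u64_z (svptrue_b64 (),
                        svlsl_n_u64_z (svptrue_b64 (), r, 48),
                        svlsr_n_u64_z (svptrue_b64 (), r, 16));
  }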

Thanks,
Richard


diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 91a9713e712..9709736864c 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -3266,14 +3266,12 @@ (define_insn "*cond_<optab><mode>_any"
 ;; - REVW
 ;; -------------------------------------------------------------------------
 
-(define_insn_and_split "*v_rev<mode>"
-  [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w")
+(define_split
+  [(set (match_operand:SVE_FULL_HSDI 0 "register_operand")
 	(rotate:SVE_FULL_HSDI
-	  (match_operand:SVE_FULL_HSDI 1 "register_operand" "w")
+	  (match_operand:SVE_FULL_HSDI 1 "register_operand")
 	  (match_operand:SVE_FULL_HSDI 2 "aarch64_constant_vector_operand")))]
-  "TARGET_SVE"
-  "#"
-  "&& !reload_completed"
+  "TARGET_SVE && can_create_pseudo_p ()"
   [(set (match_dup 3)
 	(ashift:SVE_FULL_HSDI (match_dup 1)
 			      (match_dup 2)))

Avoid forcing subregs into fresh registers unnecessarily:

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index fff8d9da49d..317dc106712 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -26900,11 +26900,17 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
   d.op_mode = op_mode;
   d.op_vec_flags = aarch64_classify_vector_mode (d.op_mode);
   d.target = target;
-  d.op0 = op0 ? force_reg (op_mode, op0) : NULL_RTX;
+  d.op0 = op0;
+  if (d.op0 && !register_operand (d.op0, op_mode))
+    d.op0 = force_reg (op_mode, d.op0);
   if (op0 && d.one_vector_p)
     d.op1 = copy_rtx (d.op0);
   else
-    d.op1 = op1 ? force_reg (op_mode, op1) : NULL_RTX;
+    {
+      d.op1 = op1;
+      if (d.op1 && !register_operand (d.op1, op_mode))
+	d.op1 = force_reg (op_mode, d.op1);
+    }
   d.testing_p = !target;
 
   if (!d.testing_p)

Avoid a no-op move if the target already provided the result in the
expected register:

diff --git a/gcc/expmed.cc b/gcc/expmed.cc
index 8cf10d9c73b..37a525b429c 100644
--- a/gcc/expmed.cc
+++ b/gcc/expmed.cc
@@ -6326,7 +6326,8 @@ expand_rotate_as_vec_perm (machine_mode mode, rtx dst, rtx x, rtx amt)
 				  qimode, perm_dst);
   if (!res)
     return NULL_RTX;
-  emit_move_insn (dst, lowpart_subreg (mode, res, qimode));
+  if (!rtx_equal_p (res, perm_dst))
+    emit_move_insn (dst, lowpart_subreg (mode, res, qimode));
   return dst;
 }