From: Dhruv Chawla <dhr...@nvidia.com>

This patch modifies the intrinsic expanders to expand svlsl and svlsr to
unpredicated forms when the predicate is a ptrue. It also folds the
following pattern:

  lsl <y>, <x>, <shift>
  lsr <z>, <x>, <shift>
  orr <r>, <y>, <z>

to:

  revb/h/w <r>, <x>

when the shift amount is equal to half the bitwidth of the <x> register.
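
For illustration, a minimal example of the kind of input this catches
(adapted from the new shift_rev_1.c test; the short function name is used
here only for the example):

  #include <arm_sve.h>

  /* Rotate each 64-bit lane of R left by 32 bits.  */
  svuint64_t
  ror32 (svuint64_t r)
  {
    svbool_t pt = svptrue_b64 ();
    return svorr_u64_z (pt, svlsl_n_u64_z (pt, r, 32),
                        svlsr_n_u64_z (pt, r, 32));
  }

With this patch the lsl/lsr/orr sequence folds to a single predicated REVW
(plus the PTRUE that sets up the predicate), as checked by shift_rev_1.c.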

Bootstrapped and regtested on aarch64-linux-gnu.

Signed-off-by: Dhruv Chawla <dhr...@nvidia.com>
Co-authored-by: Richard Sandiford <richard.sandif...@arm.com>

gcc/ChangeLog:

	* expmed.cc (expand_rotate_as_vec_perm): Avoid a no-op move if the
	target already provided the result in the expected register.
	* config/aarch64/aarch64.cc (aarch64_vectorize_vec_perm_const):
	Avoid forcing subregs into fresh registers unnecessarily.
	* config/aarch64/aarch64-sve-builtins-base.cc
	(svlsl_impl::expand): Define.
	(svlsr_impl): New class.
	(svlsr_impl::fold): Define.
	(svlsr_impl::expand): Likewise.
	* config/aarch64/aarch64-sve.md: Add define_split for rotate.
	(*v_revvnx8hi): New pattern.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/sve/shift_rev_1.c: New test.
	* gcc.target/aarch64/sve/shift_rev_2.c: Likewise.
	* gcc.target/aarch64/sve/shift_rev_3.c: Likewise.
---
 .../aarch64/aarch64-sve-builtins-base.cc      | 33 +++++++-
 gcc/config/aarch64/aarch64-sve.md             | 55 ++++++++++++
 gcc/config/aarch64/aarch64.cc                 | 10 ++-
 gcc/expmed.cc                                 |  3 +-
 .../gcc.target/aarch64/sve/shift_rev_1.c      | 83 +++++++++++++++++++
 .../gcc.target/aarch64/sve/shift_rev_2.c      | 63 ++++++++++++++
 .../gcc.target/aarch64/sve/shift_rev_3.c      | 83 +++++++++++++++++++
 7 files changed, 326 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/shift_rev_1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/shift_rev_2.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/shift_rev_3.c

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index b4396837c24..90dd5c97a10 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -2086,6 +2086,37 @@ public:
   {
     return f.fold_const_binary (LSHIFT_EXPR);
   }
+
+  rtx expand (function_expander &e) const override
+  {
+    tree pred = TREE_OPERAND (e.call_expr, 3);
+    tree shift = TREE_OPERAND (e.call_expr, 5);
+    if (is_ptrue (pred, GET_MODE_UNIT_SIZE (e.result_mode ()))
+	&& uniform_integer_cst_p (shift))
+      return e.use_unpred_insn (e.direct_optab_handler (ashl_optab));
+    return rtx_code_function::expand (e);
+  }
+};
+
+class svlsr_impl : public rtx_code_function
+{
+public:
+  CONSTEXPR svlsr_impl () : rtx_code_function (LSHIFTRT, LSHIFTRT) {}
+
+  gimple *fold (gimple_folder &f) const override
+  {
+    return f.fold_const_binary (RSHIFT_EXPR);
+  }
+
+  rtx expand (function_expander &e) const override
+  {
+    tree pred = TREE_OPERAND (e.call_expr, 3);
+    tree shift = TREE_OPERAND (e.call_expr, 5);
+    if (is_ptrue (pred, GET_MODE_UNIT_SIZE (e.result_mode ()))
+	&& uniform_integer_cst_p (shift))
+      return e.use_unpred_insn (e.direct_optab_handler (lshr_optab));
+    return rtx_code_function::expand (e);
+  }
 };
 
 class svmad_impl : public function_base
@@ -3586,7 +3617,7 @@ FUNCTION (svldnt1, svldnt1_impl,)
 FUNCTION (svlen, svlen_impl,)
 FUNCTION (svlsl, svlsl_impl,)
 FUNCTION (svlsl_wide, shift_wide, (ASHIFT, UNSPEC_ASHIFT_WIDE))
-FUNCTION (svlsr, rtx_code_function, (LSHIFTRT, LSHIFTRT))
+FUNCTION (svlsr, svlsr_impl,)
 FUNCTION (svlsr_wide, shift_wide, (LSHIFTRT, UNSPEC_LSHIFTRT_WIDE))
 FUNCTION (svmad, svmad_impl,)
 FUNCTION (svmax, rtx_code_function, (SMAX, UMAX, UNSPEC_COND_FMAX,
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index cb88d6d95a6..0156afc1e7d 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -3317,6 +3317,61 @@
 ;; - REVW
 ;; -------------------------------------------------------------------------
 
+(define_split
+  [(set (match_operand:SVE_FULL_HSDI 0 "register_operand")
+	(rotate:SVE_FULL_HSDI
+	  (match_operand:SVE_FULL_HSDI 1 "register_operand")
+	  (match_operand:SVE_FULL_HSDI 2 "aarch64_constant_vector_operand")))]
+  "TARGET_SVE && can_create_pseudo_p ()"
+  [(set (match_dup 3)
+	(ashift:SVE_FULL_HSDI (match_dup 1)
+			      (match_dup 2)))
+   (set (match_dup 0)
+	(plus:SVE_FULL_HSDI
+	  (lshiftrt:SVE_FULL_HSDI (match_dup 1)
+				  (match_dup 4))
+	  (match_dup 3)))]
+  {
+    if (aarch64_emit_opt_vec_rotate (operands[0], operands[1], operands[2]))
+      DONE;
+
+    if (!TARGET_SVE2)
+      FAIL;
+
+    operands[3] = gen_reg_rtx (<MODE>mode);
+    HOST_WIDE_INT shift_amount =
+      INTVAL (unwrap_const_vec_duplicate (operands[2]));
+    int bitwidth = GET_MODE_UNIT_BITSIZE (<MODE>mode);
+    operands[4] = aarch64_simd_gen_const_vector_dup (<MODE>mode,
+						     bitwidth - shift_amount);
+  }
+)
+
+;; The RTL combiners are able to combine "ior (ashift, ashiftrt)" to a "bswap".
+;; Match that as well.
+(define_insn_and_split "*v_revvnx8hi"
+  [(parallel
+    [(set (match_operand:VNx8HI 0 "register_operand")
+	  (bswap:VNx8HI (match_operand 1 "register_operand")))
+     (clobber (match_scratch:VNx8BI 2))])]
+  "TARGET_SVE"
+  "#"
+  ""
+  [(set (match_dup 0)
+	(unspec:VNx8HI
+	  [(match_dup 2)
+	   (unspec:VNx8HI
+	     [(match_dup 1)]
+	     UNSPEC_REVB)]
+	  UNSPEC_PRED_X))]
+  {
+    if (!can_create_pseudo_p ())
+      operands[2] = CONSTM1_RTX (VNx8BImode);
+    else
+      operands[2] = aarch64_ptrue_reg (VNx8BImode);
+  }
+)
+
 ;; Predicated integer unary operations.
 (define_insn "@aarch64_pred_<optab><mode>"
   [(set (match_operand:SVE_FULL_I 0 "register_operand")
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 9e3f2885bcc..98715e34649 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -26984,11 +26984,17 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
   d.op_mode = op_mode;
   d.op_vec_flags = aarch64_classify_vector_mode (d.op_mode);
   d.target = target;
-  d.op0 = op0 ? force_reg (op_mode, op0) : NULL_RTX;
+  d.op0 = op0;
+  if (d.op0 && !register_operand (d.op0, op_mode))
+    d.op0 = force_reg (op_mode, d.op0);
   if (op0 && d.one_vector_p)
     d.op1 = copy_rtx (d.op0);
   else
-    d.op1 = op1 ? force_reg (op_mode, op1) : NULL_RTX;
+    {
+      d.op1 = op1;
+      if (d.op1 && !register_operand (d.op1, op_mode))
+	d.op1 = force_reg (op_mode, d.op1);
+    }
   d.testing_p = !target;
 
   if (!d.testing_p)
diff --git a/gcc/expmed.cc b/gcc/expmed.cc
index 8cf10d9c73b..37a525b429c 100644
--- a/gcc/expmed.cc
+++ b/gcc/expmed.cc
@@ -6326,7 +6326,8 @@ expand_rotate_as_vec_perm (machine_mode mode, rtx dst, rtx x, rtx amt)
 				      qimode, perm_dst);
   if (!res)
     return NULL_RTX;
-  emit_move_insn (dst, lowpart_subreg (mode, res, qimode));
+  if (!rtx_equal_p (res, perm_dst))
+    emit_move_insn (dst, lowpart_subreg (mode, res, qimode));
   return dst;
 }
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/shift_rev_1.c b/gcc/testsuite/gcc.target/aarch64/sve/shift_rev_1.c
new file mode 100644
index 00000000000..3a30f80d152
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/shift_rev_1.c
@@ -0,0 +1,83 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8.2-a+sve" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_sve.h>
+
+/*
+** ror32_sve_lsl_imm:
+**	ptrue	p3.b, all
+**	revw	z0.d, p3/m, z0.d
+**	ret
+*/
+svuint64_t
+ror32_sve_lsl_imm (svuint64_t r)
+{
+  return svorr_u64_z (svptrue_b64 (), svlsl_n_u64_z (svptrue_b64 (), r, 32),
+		      svlsr_n_u64_z (svptrue_b64 (), r, 32));
+}
+
+/*
+** ror32_sve_lsl_operand:
+**	ptrue	p3.b, all
+**	revw	z0.d, p3/m, z0.d
+**	ret
+*/
+svuint64_t
+ror32_sve_lsl_operand (svuint64_t r)
+{
+  svbool_t pt = svptrue_b64 ();
+  return svorr_u64_z (pt, svlsl_n_u64_z (pt, r, 32), svlsr_n_u64_z (pt, r, 32));
+}
+
+/*
+** ror16_sve_lsl_imm:
+**	ptrue	p3.b, all
+**	revh	z0.s, p3/m, z0.s
+**	ret
+*/
+svuint32_t
+ror16_sve_lsl_imm (svuint32_t r)
+{
+  return svorr_u32_z (svptrue_b32 (), svlsl_n_u32_z (svptrue_b32 (), r, 16),
+		      svlsr_n_u32_z (svptrue_b32 (), r, 16));
+}
+
+/*
+** ror16_sve_lsl_operand:
+**	ptrue	p3.b, all
+**	revh	z0.s, p3/m, z0.s
+**	ret
+*/
+svuint32_t
+ror16_sve_lsl_operand (svuint32_t r)
+{
+  svbool_t pt = svptrue_b32 ();
+  return svorr_u32_z (pt, svlsl_n_u32_z (pt, r, 16), svlsr_n_u32_z (pt, r, 16));
+}
+
+/*
+** ror8_sve_lsl_imm:
+**	ptrue	p3.b, all
+**	revb	z0.h, p3/m, z0.h
+**	ret
+*/
+svuint16_t
+ror8_sve_lsl_imm (svuint16_t r)
+{
+  return svorr_u16_z (svptrue_b16 (), svlsl_n_u16_z (svptrue_b16 (), r, 8),
+		      svlsr_n_u16_z (svptrue_b16 (), r, 8));
+}
+
+/*
+** ror8_sve_lsl_operand:
+**	ptrue	p3.b, all
+**	revb	z0.h, p3/m, z0.h
+**	ret
+*/
+svuint16_t
+ror8_sve_lsl_operand (svuint16_t r)
+{
+  svbool_t pt = svptrue_b16 ();
+  return svorr_u16_z (pt, svlsl_n_u16_z (pt, r, 8), svlsr_n_u16_z (pt, r, 8));
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/shift_rev_2.c b/gcc/testsuite/gcc.target/aarch64/sve/shift_rev_2.c
new file mode 100644
index 00000000000..89d5a8a8b3e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/shift_rev_2.c
@@ -0,0 +1,63 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8.2-a+sve" } */
+
+#include <arm_sve.h>
+
+#define PTRUE_B(BITWIDTH) svptrue_b##BITWIDTH ()
+
+#define ROR_SVE_LSL(NAME, INPUT_TYPE, SHIFT_AMOUNT, BITWIDTH)		\
+  INPUT_TYPE								\
+  NAME##_imm (INPUT_TYPE r)						\
+  {									\
+    return svorr_u##BITWIDTH##_z (PTRUE_B (BITWIDTH),			\
+				  svlsl_n_u##BITWIDTH##_z (PTRUE_B (BITWIDTH), \
+							   r, SHIFT_AMOUNT), \
+				  svlsr_n_u##BITWIDTH##_z (PTRUE_B (BITWIDTH), \
+							   r, SHIFT_AMOUNT)); \
+  }									\
+									\
+  INPUT_TYPE								\
+  NAME##_operand (INPUT_TYPE r)						\
+  {									\
+    svbool_t pt = PTRUE_B (BITWIDTH);					\
+    return svorr_u##BITWIDTH##_z (					\
+      pt, svlsl_n_u##BITWIDTH##_z (pt, r, SHIFT_AMOUNT),		\
+      svlsr_n_u##BITWIDTH##_z (pt, r, SHIFT_AMOUNT));			\
+  }
+
+/* Make sure that the pattern doesn't match incorrect bit-widths, eg. a shift of
+   8 matching the 32-bit mode.  */
+
+ROR_SVE_LSL (higher_ror32, svuint64_t, 64, 64);
+ROR_SVE_LSL (higher_ror16, svuint32_t, 32, 32);
+ROR_SVE_LSL (higher_ror8, svuint16_t, 16, 16);
+
+ROR_SVE_LSL (lower_ror32, svuint64_t, 16, 64);
+ROR_SVE_LSL (lower_ror16, svuint32_t, 8, 32);
+ROR_SVE_LSL (lower_ror8, svuint16_t, 4, 16);
+
+/* Check off-by-one cases.  */
+
+ROR_SVE_LSL (off_1_high_ror32, svuint64_t, 33, 64);
+ROR_SVE_LSL (off_1_high_ror16, svuint32_t, 17, 32);
+ROR_SVE_LSL (off_1_high_ror8, svuint16_t, 9, 16);
+
+ROR_SVE_LSL (off_1_low_ror32, svuint64_t, 31, 64);
+ROR_SVE_LSL (off_1_low_ror16, svuint32_t, 15, 32);
+ROR_SVE_LSL (off_1_low_ror8, svuint16_t, 7, 16);
+
+/* Check out of bounds cases.  */
+
+ROR_SVE_LSL (oob_ror32, svuint64_t, 65, 64);
+ROR_SVE_LSL (oob_ror16, svuint32_t, 33, 32);
+ROR_SVE_LSL (oob_ror8, svuint16_t, 17, 16);
+
+/* Check zero case.  */
+
+ROR_SVE_LSL (zero_ror32, svuint64_t, 0, 64);
+ROR_SVE_LSL (zero_ror16, svuint32_t, 0, 32);
+ROR_SVE_LSL (zero_ror8, svuint16_t, 0, 16);
+
+/* { dg-final { scan-assembler-times "revb" 0 } } */
+/* { dg-final { scan-assembler-times "revh" 0 } } */
+/* { dg-final { scan-assembler-times "revw" 0 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/shift_rev_3.c b/gcc/testsuite/gcc.target/aarch64/sve/shift_rev_3.c
new file mode 100644
index 00000000000..126766d0a80
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/shift_rev_3.c
@@ -0,0 +1,83 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -march=armv8.2-a+sve+sve2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_sve.h>
+
+/*
+** lsl_usra_32_sve_lsl_imm:
+**	lsl	z0.d, z1.d, #34
+**	usra	z0.d, z1.d, #30
+**	ret
+*/
+svuint64_t
+lsl_usra_32_sve_lsl_imm (svuint64_t __attribute__ ((unused)) dummy, svuint64_t r)
+{
+  return svorr_u64_z (svptrue_b64 (), svlsl_n_u64_z (svptrue_b64 (), r, 34),
+		      svlsr_n_u64_z (svptrue_b64 (), r, 30));
+}
+
+/*
+** lsl_usra_32_sve_lsl_operand:
+**	lsl	z0.d, z1.d, #34
+**	usra	z0.d, z1.d, #30
+**	ret
+*/
+svuint64_t
+lsl_usra_32_sve_lsl_operand (svuint64_t __attribute__ ((unused)) dummy, svuint64_t r)
+{
+  svbool_t pt = svptrue_b64 ();
+  return svorr_u64_z (pt, svlsl_n_u64_z (pt, r, 34), svlsr_n_u64_z (pt, r, 30));
+}
+
+/*
+** lsl_usra_16_sve_lsl_imm:
+**	lsl	z0.s, z1.s, #14
+**	usra	z0.s, z1.s, #18
+**	ret
+*/
+svuint32_t
+lsl_usra_16_sve_lsl_imm (svuint32_t __attribute__ ((unused)) dummy, svuint32_t r)
+{
+  return svorr_u32_z (svptrue_b32 (), svlsl_n_u32_z (svptrue_b32 (), r, 14),
+		      svlsr_n_u32_z (svptrue_b32 (), r, 18));
+}
+
+/*
+** lsl_usra_16_sve_lsl_operand:
+**	lsl	z0.s, z1.s, #14
+**	usra	z0.s, z1.s, #18
+**	ret
+*/
+svuint32_t
+lsl_usra_16_sve_lsl_operand (svuint32_t __attribute__ ((unused)) dummy, svuint32_t r)
+{
+  svbool_t pt = svptrue_b32 ();
+  return svorr_u32_z (pt, svlsl_n_u32_z (pt, r, 14), svlsr_n_u32_z (pt, r, 18));
+}
+
+/*
+** lsl_usra_8_sve_lsl_imm:
+**	lsl	z0.h, z1.h, #6
+**	usra	z0.h, z1.h, #10
+**	ret
+*/
+svuint16_t
+lsl_usra_8_sve_lsl_imm (svuint16_t __attribute__ ((unused)) dummy, svuint16_t r)
+{
+  return svorr_u16_z (svptrue_b16 (), svlsl_n_u16_z (svptrue_b16 (), r, 6),
+		      svlsr_n_u16_z (svptrue_b16 (), r, 10));
+}
+
+/*
+** lsl_usra_8_sve_lsl_operand:
+**	lsl	z0.h, z1.h, #6
+**	usra	z0.h, z1.h, #10
+**	ret
+*/
+svuint16_t
+lsl_usra_8_sve_lsl_operand (svuint16_t __attribute__ ((unused)) dummy, svuint16_t r)
+{
+  svbool_t pt = svptrue_b16 ();
+  return svorr_u16_z (pt, svlsl_n_u16_z (pt, r, 6), svlsr_n_u16_z (pt, r, 10));
+}
-- 
2.44.0