Add a fold at gimple_fold_builtin to prefer the highpart variant of a builtin if the arguments are better suited to it. This helps us avoid copying data between lanes before operation.
E.g. We prefer to use UMULL2 rather than DUP+UMULL for the following: uint16x8_t foo(const uint8x16_t s) { const uint8x16_t f0 = vdupq_n_u8(4); return vmull_u8(vget_high_u8(s), vget_high_u8(f0)); } gcc/ChangeLog: * config/aarch64/aarch64-builtins.cc (LO_HI_PAIRINGS): New macro. Cover every lo/hi pairing in builtin-pairs.def. (aarch64_get_highpart_builtin): New function. Get the fndecl for the hi builtin paired with FCODE. (LO_HI_PAIR): New macro. (aarch64_object_of_bfr): New function. Parse BIT_FIELD_REF expressions. (aarch64_duplicate_vector_cst): New function. (aarch64_nbit_vector_type_p): New function. Check if a type describes an n-bit vector. (aarch64_vq_high_half): New function. Helper to identify vector highparts. (aarch64_fold_lo_call_to_hi): New function. Perform the fold described here. (aarch64_general_gimple_fold_builtin): Add cases for lo builtins. * config/aarch64/aarch64-builtin-pairs.def: New file. Declare pairings of lo/hi builtins. gcc/testsuite/ChangeLog: * gcc.target/aarch64/simd/vabal_combine.c: Removed. * gcc.target/aarch64/simd/fold_to_highpart_1.c: New test. * gcc.target/aarch64/simd/fold_to_highpart_2.c: New test. * gcc.target/aarch64/simd/fold_to_highpart_3.c: New test. * gcc.target/aarch64/simd/fold_to_highpart_4.c: New test. * gcc.target/aarch64/simd/fold_to_highpart_5.c: New test. * gcc.target/aarch64/simd/fold_to_highpart_6.c: New test. * gcc.target/aarch64/simd/fold_to_highpart_7.c: New test. 
--- gcc/config/aarch64/aarch64-builtin-pairs.def | 81 ++ gcc/config/aarch64/aarch64-builtins.cc | 206 +++++ .../aarch64/simd/fold_to_highpart_1.c | 733 ++++++++++++++++++ .../aarch64/simd/fold_to_highpart_2.c | 86 ++ .../aarch64/simd/fold_to_highpart_3.c | 81 ++ .../aarch64/simd/fold_to_highpart_4.c | 77 ++ .../aarch64/simd/fold_to_highpart_5.c | 38 + .../aarch64/simd/fold_to_highpart_6.c | 94 +++ .../aarch64/simd/fold_to_highpart_7.c | 36 + .../gcc.target/aarch64/simd/vabal_combine.c | 72 -- 10 files changed, 1432 insertions(+), 72 deletions(-) create mode 100644 gcc/config/aarch64/aarch64-builtin-pairs.def create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_1.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_2.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_3.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_4.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_5.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_6.c create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_7.c delete mode 100644 gcc/testsuite/gcc.target/aarch64/simd/vabal_combine.c diff --git a/gcc/config/aarch64/aarch64-builtin-pairs.def b/gcc/config/aarch64/aarch64-builtin-pairs.def new file mode 100644 index 00000000000..e1dc0b71a1c --- /dev/null +++ b/gcc/config/aarch64/aarch64-builtin-pairs.def @@ -0,0 +1,81 @@ +/* Pairings of AArch64 builtins that can be folded into each other. + Copyright (C) 2025 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. 
+ + GCC is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING3. If not see + <http://www.gnu.org/licenses/>. */ + +/* LO/HI widenable integer modes. */ +#define LO_HI_PAIR_V_WI(T, LO, HI) \ + LO_HI_PAIR (T##_##LO##v2si, T##_##HI##v4si) \ + LO_HI_PAIR (T##_##LO##v4hi, T##_##HI##v8hi) \ + LO_HI_PAIR (T##_##LO##v8qi, T##_##HI##v16qi) + +/* LO/HI Single/Half integer modes. */ +#define LO_HI_PAIR_V_HSI(T, LO, HI) \ + LO_HI_PAIR (T##_##LO##v2si, T##_##HI##v4si) \ + LO_HI_PAIR (T##_##LO##v4hi, T##_##HI##v8hi) + +#define UNOP_LONG_LH_PAIRS \ + LO_HI_PAIR (UNOP_sxtlv8hi, UNOP_vec_unpacks_hi_v16qi) \ + LO_HI_PAIR (UNOP_sxtlv4si, UNOP_vec_unpacks_hi_v8hi) \ + LO_HI_PAIR (UNOP_sxtlv2di, UNOP_vec_unpacks_hi_v4si) \ + LO_HI_PAIR (UNOPU_uxtlv8hi, UNOPU_vec_unpacku_hi_v16qi) \ + LO_HI_PAIR (UNOPU_uxtlv4si, UNOPU_vec_unpacku_hi_v8hi) \ + LO_HI_PAIR (UNOPU_uxtlv2di, UNOPU_vec_unpacku_hi_v4si) \ + LO_HI_PAIR (UNOP_float_extend_lo_v4sf, UNOP_vec_unpacks_hi_v8hf) \ + LO_HI_PAIR (UNOP_float_extend_lo_v2df, UNOP_vec_unpacks_hi_v4sf) \ + LO_HI_PAIR (UNOP_vbfcvtv4bf, UNOP_vbfcvt_highv8bf) + +#define BINOP_LONG_LH_PAIRS \ + LO_HI_PAIR_V_WI (BINOP, saddl, saddl2) \ + LO_HI_PAIR_V_WI (BINOPU, uaddl, uaddl2) \ + LO_HI_PAIR_V_WI (BINOP, ssubl, ssubl2) \ + LO_HI_PAIR_V_WI (BINOPU, usubl, usubl2) \ + LO_HI_PAIR_V_WI (BINOP, sabdl, sabdl2) \ + LO_HI_PAIR_V_WI (BINOPU, uabdl, uabdl2) \ + LO_HI_PAIR_V_WI (BINOP, intrinsic_vec_smult_lo_, vec_widen_smult_hi_) \ + LO_HI_PAIR_V_WI (BINOPU, intrinsic_vec_umult_lo_, vec_widen_umult_hi_) \ + LO_HI_PAIR_V_HSI (BINOP, sqdmull, sqdmull2) \ + LO_HI_PAIR (BINOPP_pmullv8qi, BINOPP_pmull_hiv16qi) + +#define BINOP_LONG_N_LH_PAIRS \ + LO_HI_PAIR_V_HSI (BINOP, smull_n, smull_hi_n) \ + 
LO_HI_PAIR_V_HSI (BINOPU, umull_n, umull_hi_n) \ + LO_HI_PAIR_V_HSI (BINOP, sqdmull_n, sqdmull2_n) \ + +#define BINOP_WIDE_LH_PAIRS \ + LO_HI_PAIR_V_WI (BINOP, ssubw, ssubw2) \ + LO_HI_PAIR_V_WI (BINOPU, usubw, usubw2) \ + LO_HI_PAIR_V_WI (BINOP, saddw, saddw2) \ + LO_HI_PAIR_V_WI (BINOPU, uaddw, uaddw2) + +#define TERNOP_LONG_LH_PAIRS \ + LO_HI_PAIR_V_WI (TERNOP, smlal, smlal_hi) \ + LO_HI_PAIR_V_WI (TERNOPU, umlal, umlal_hi) \ + LO_HI_PAIR_V_WI (TERNOP, smlsl, smlsl_hi) \ + LO_HI_PAIR_V_WI (TERNOPU, umlsl, umlsl_hi) \ + LO_HI_PAIR_V_WI (TERNOP, sabal, sabal2) \ + LO_HI_PAIR_V_WI (TERNOPU, uabal, uabal2) \ + LO_HI_PAIR_V_HSI (TERNOP, sqdmlal, sqdmlal2) \ + LO_HI_PAIR_V_HSI (TERNOP, sqdmlsl, sqdmlsl2) + +#define TERNOP_LONG_N_LH_PAIRS \ + LO_HI_PAIR_V_HSI (TERNOP, smlal_n, smlal_hi_n) \ + LO_HI_PAIR_V_HSI (TERNOPU, umlal_n, umlal_hi_n) \ + LO_HI_PAIR_V_HSI (TERNOP, smlsl_n, smlsl_hi_n) \ + LO_HI_PAIR_V_HSI (TERNOPU, umlsl_n, umlsl_hi_n) \ + LO_HI_PAIR_V_HSI (TERNOP, sqdmlal_n, sqdmlal2_n) \ + LO_HI_PAIR_V_HSI (TERNOP, sqdmlsl_n, sqdmlsl2_n) diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc index 128cc365d3d..6cffbdb79a9 100644 --- a/gcc/config/aarch64/aarch64-builtins.cc +++ b/gcc/config/aarch64/aarch64-builtins.cc @@ -48,6 +48,8 @@ #include "attribs.h" #include "gimple-fold.h" #include "builtins.h" +#include "tree-pass.h" +#include "tree-vector-builder.h" #include "aarch64-builtins.h" using namespace aarch64; @@ -737,6 +739,16 @@ static aarch64_simd_builtin_datum aarch64_simd_builtin_data[] = { VGET_HIGH_BUILTIN(u64) \ VGET_HIGH_BUILTIN(bf16) +#include "aarch64-builtin-pairs.def" + +#define LO_HI_PAIRINGS \ + UNOP_LONG_LH_PAIRS \ + BINOP_WIDE_LH_PAIRS \ + BINOP_LONG_LH_PAIRS \ + BINOP_LONG_N_LH_PAIRS \ + TERNOP_LONG_LH_PAIRS \ + TERNOP_LONG_N_LH_PAIRS \ + typedef struct { const char *name; @@ -4982,6 +4994,196 @@ aarch64_gimple_fold_pragma_builtin } } +/* Return the fndecl of the builtin paired with FCODE_LO if one + 
exists (see aarch64-builtin-pairs.def), or NULL_TREE if not. */ +static inline tree +aarch64_get_highpart_builtin (unsigned int fcode_lo) +{ +#undef LO_HI_PAIR +#define LO_HI_PAIR(A, B) case AARCH64_SIMD_BUILTIN_##A: \ + return aarch64_builtin_decls[AARCH64_SIMD_BUILTIN_##B]; + + switch (fcode_lo) + { + LO_HI_PAIRINGS + default: + return NULL_TREE; + } +} + +/* If the SSA_NAME_DEF_STMT of ARG is an assignment to a + BIT_FIELD_REF with SIZE and OFFSET, return the object of the + BIT_FIELD_REF. Otherwise, return NULL_TREE. */ +static tree +aarch64_object_of_bfr (const_tree arg, unsigned HOST_WIDE_INT size, + unsigned HOST_WIDE_INT offset) +{ + if (TREE_CODE (arg) != SSA_NAME) + return NULL_TREE; + + gassign *stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (arg)); + + if (!stmt) + return NULL_TREE; + + if (gimple_assign_rhs_code (stmt) != BIT_FIELD_REF) + return NULL_TREE; + + tree bf_ref = gimple_assign_rhs1 (stmt); + + if (bit_field_size (bf_ref).to_constant () != size + || bit_field_offset (bf_ref).to_constant () != offset) + return NULL_TREE; + + return TREE_OPERAND (bf_ref, 0); +} + +/* Build and return a new VECTOR_CST of type OUT_TY using the + elements of VEC_IN. */ +static tree +aarch64_duplicate_vector_cst (const_tree vec_in, tree out_ty) +{ + gcc_assert (TREE_CODE (vec_in) == VECTOR_CST + && VECTOR_TYPE_P (out_ty)); + unsigned HOST_WIDE_INT nelts + = VECTOR_CST_NELTS (vec_in).to_constant (); + + tree_vector_builder vec_out (out_ty, nelts, 1); + for (unsigned i = 0; i < nelts; i++) + vec_out.quick_push (VECTOR_CST_ELT (vec_in, i)); + + return vec_out.build (); +} + +/* Return true if TYPE denotes a vector type with a known + and constant size in bits N. Return false otherwise. 
*/ +static inline bool +aarch64_nbit_vector_type_p (const_tree type, + unsigned HOST_WIDE_INT n) +{ + if (!VECTOR_TYPE_P (type)) + return false; + + return (tree_fits_uhwi_p (TYPE_SIZE (type)) + && wi::to_widest (TYPE_SIZE (type)) == n); +} + +/* Helper for aarch64_fold_lo_call_to_hi; if ARG is a reference to the + upper half of a 128b vector then return the 128b vector. Otherwise, + return NULL_TREE. */ +static tree +aarch64_vq_high_half (const_tree arg) +{ + unsigned int offset = BYTES_BIG_ENDIAN ? 0 : 64; + tree base = aarch64_object_of_bfr (arg, 64, offset); + + if (!base || !aarch64_nbit_vector_type_p (TREE_TYPE (base), 128)) + return NULL_TREE; + + return base; +} + +/* Fold a builtin call to its hi equivalent if the arguments + are better suited to it. + + Return the new call if so, otherwise nullptr. */ +static gcall * +aarch64_fold_lo_call_to_hi (unsigned int fcode, gcall *stmt, + gimple_stmt_iterator *gsi) +{ + /* Punt until as late as possible: + 1) By folding away BIT_FIELD_REFs we remove information about the + operands that may be useful to other optimizers. + 2) For simplicity, we'd like the expression + + x = BIT_FIELD_REF<a, 64, 64> + + to imply that A is not a VECTOR_CST. This assumption is unlikely + to hold before constant propagation/folding. */ + if (!(cfun->curr_properties & PROP_last_full_fold)) + return nullptr; + + tree vectype_hi = NULL_TREE; + tree builtin_hi = aarch64_get_highpart_builtin (fcode); + gcc_assert (builtin_hi != NULL_TREE); + + /* Prefer to use the highpart builtin when at least one vector + argument is a reference to the upper half of a 128b vector, and + all others are VECTOR_CSTs. */ + auto_vec<unsigned int, 2> vec_constants; + auto_vec<unsigned int, 2> vec_highparts; + auto_vec<tree, 4> new_args; + + /* The interesting args are those that differ between the lo/hi + builtins. Walk the function signatures to find these. 
*/ + tree types_hi = TYPE_ARG_TYPES (TREE_TYPE (builtin_hi)); + tree types_lo = TYPE_ARG_TYPES (gimple_call_fntype (stmt)); + unsigned int argno = 0; + while (types_lo != void_list_node && types_hi != void_list_node) + { + tree type_lo = TREE_VALUE (types_lo); + tree type_hi = TREE_VALUE (types_hi); + tree curr_arg = gimple_call_arg (stmt, argno); + if (!types_compatible_p (type_lo, type_hi)) + { + /* Check our assumptions about this pair. */ + gcc_assert (aarch64_nbit_vector_type_p (type_lo, 64)); + if (!vectype_hi) + { + gcc_assert (aarch64_nbit_vector_type_p (type_hi, 128)); + vectype_hi = type_hi; + } + else + gcc_assert (type_hi == vectype_hi); + + if (tree vq = aarch64_vq_high_half (curr_arg)) + { + curr_arg = vq; + vec_highparts.safe_push (argno); + } + else if (TREE_CODE (curr_arg) == VECTOR_CST) + vec_constants.safe_push (argno); + else + return nullptr; + } + new_args.safe_push (curr_arg); + argno++; + types_hi = TREE_CHAIN (types_hi); + types_lo = TREE_CHAIN (types_lo); + } + gcc_assert (types_lo == void_list_node + && types_hi == void_list_node); + if (vec_highparts.is_empty ()) + return nullptr; + + /* Build a valid call to BUILTIN_HI. */ + for (auto i : vec_constants) + new_args[i] = aarch64_duplicate_vector_cst (new_args[i], + vectype_hi); + for (auto i : vec_highparts) + { + if (!types_compatible_p (TREE_TYPE (new_args[i]), vectype_hi)) + { + /* Reinterpret this vector to VECTYPE_HI. */ + tree vce_ssa = make_ssa_name (vectype_hi); + tree vce_expr = build1 (VIEW_CONVERT_EXPR, vectype_hi, + new_args[i]); + gsi_insert_before (gsi, + gimple_build_assign (vce_ssa, vce_expr), + GSI_SAME_STMT); + new_args[i] = vce_ssa; + } + } + + gcall *new_call + = gimple_build_call_vec (builtin_hi, new_args); + gimple_call_set_lhs (new_call, gimple_call_lhs (stmt)); + return new_call; +} + +#undef LO_HI_PAIR +#define LO_HI_PAIR(A, B) case AARCH64_SIMD_BUILTIN_##A: + /* Try to fold STMT, given that it's a call to the built-in function with subcode FCODE. 
Return the new statement on success and null on failure. */ @@ -5168,6 +5370,10 @@ aarch64_general_gimple_fold_builtin (unsigned int fcode, gcall *stmt, } break; } + break; + LO_HI_PAIRINGS + new_stmt = aarch64_fold_lo_call_to_hi (fcode, stmt, gsi); + break; case AARCH64_SIMD_BUILTIN_LANE_CHECK: if (aarch64_fold_builtin_lane_check (args[0], args[1], args[2])) { diff --git a/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_1.c b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_1.c new file mode 100644 index 00000000000..f6dc4e52362 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_1.c @@ -0,0 +1,733 @@ +/* { dg-do compile } */ +/* { dg-options "-O -march=armv9-a+bf16" } */ + +#include <arm_neon.h> + +/* Prefer the highpart variant of a builtin when it's arguments + are vector highparts. */ + +#ifndef TEST_UN_HIGHPARTS +#define TEST_UN_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF) \ + RETTYPE test_ ## FN ## _ ## SUFF (INTYPE a) \ + { \ + return FN ## _ ## SUFF (vget_high_ ## SUFF (a)); \ + } +#endif + +#ifndef TEST_BIN_W_HIGHPARTS +#define TEST_BIN_W_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF) \ + RETTYPE test_ ## FN ## _ ## SUFF (RETTYPE a, INTYPE b) \ + { \ + return FN ## _ ## SUFF (a, vget_high_ ## SUFF (b)); \ + } +#endif + +#ifndef TEST_BIN_N_HIGHPARTS +#define TEST_BIN_N_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF) \ + RETTYPE test_ ## FN ## _ ## SUFF (INTYPE a) \ + { \ + return FN ## _ ## SUFF (vget_high_ ## SUFF (a), a[1]); \ + } +#endif + +#ifndef TEST_TERN_N_HIGHPARTS +#define TEST_TERN_N_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF) \ + RETTYPE test_ ## FN ## _ ## SUFF (RETTYPE a, INTYPE b) \ + { \ + return FN ## _ ## SUFF (a, vget_high_ ## SUFF (b), b[1]); \ + } +#endif + +#ifndef TEST_BIN_HIGHPARTS +#define TEST_BIN_HIGHPARTS(FN, RETTYPE, INTYPE, H_INTYPE, SUFF) \ + RETTYPE test_ ## FN ## _ ## SUFF (INTYPE a, INTYPE b) \ + { \ + return FN ## _ ## SUFF (vget_high_ ## SUFF (a), \ + vget_high_ ## SUFF (b)); \ + } +#endif + +#ifndef TEST_TERN_HIGHPARTS 
+#define TEST_TERN_HIGHPARTS(FN, RETTYPE, INTYPE, H_INTYPE, SUFF) \ + RETTYPE test_ ## FN ## _ ## SUFF (RETTYPE a, INTYPE b, INTYPE c) \ + { \ + return FN ## _ ## SUFF (a, vget_high_ ## SUFF (b), \ + vget_high_ ## SUFF (c)); \ + } +#endif + +#define TEST_UN_VQW(FN) \ + TEST_UN_HIGHPARTS (FN, int16x8_t, int8x16_t, s8) \ + TEST_UN_HIGHPARTS (FN, uint16x8_t, uint8x16_t, u8) \ + TEST_UN_HIGHPARTS (FN, int32x4_t, int16x8_t, s16) \ + TEST_UN_HIGHPARTS (FN, uint32x4_t, uint16x8_t, u16) \ + TEST_UN_HIGHPARTS (FN, int64x2_t, int32x4_t, s32) \ + TEST_UN_HIGHPARTS (FN, uint64x2_t, uint32x4_t, u32) + +#define TEST_BIN_VQW(FN) \ + TEST_BIN_HIGHPARTS (FN, int16x8_t, int8x16_t, int8x8_t, s8) \ + TEST_BIN_HIGHPARTS (FN, uint16x8_t, uint8x16_t, uint8x8_t, u8) \ + TEST_BIN_HIGHPARTS (FN, int32x4_t, int16x8_t, int16x4_t, s16) \ + TEST_BIN_HIGHPARTS (FN, uint32x4_t, uint16x8_t, uint16x4_t, u16) \ + TEST_BIN_HIGHPARTS (FN, int64x2_t, int32x4_t, int32x2_t, s32) \ + TEST_BIN_HIGHPARTS (FN, uint64x2_t, uint32x4_t, uint32x2_t, u32) + +#define TEST_BIN_N_VQW(FN) \ + TEST_BIN_N_HIGHPARTS (FN, int32x4_t, int16x8_t, s16) \ + TEST_BIN_N_HIGHPARTS (FN, uint32x4_t, uint16x8_t, u16) \ + TEST_BIN_N_HIGHPARTS (FN, int64x2_t, int32x4_t, s32) \ + TEST_BIN_N_HIGHPARTS (FN, uint64x2_t, uint32x4_t, u32) + +#define TEST_BIN_W_VQW(FN) \ + TEST_BIN_W_HIGHPARTS (FN, int16x8_t, int8x16_t, s8) \ + TEST_BIN_W_HIGHPARTS (FN, uint16x8_t, uint8x16_t, u8) \ + TEST_BIN_W_HIGHPARTS (FN, int32x4_t, int16x8_t, s16) \ + TEST_BIN_W_HIGHPARTS (FN, uint32x4_t, uint16x8_t, u16) \ + TEST_BIN_W_HIGHPARTS (FN, int64x2_t, int32x4_t, s32) \ + TEST_BIN_W_HIGHPARTS (FN, uint64x2_t, uint32x4_t, u32) + +#define TEST_TERN_N_VQW(FN) \ + TEST_TERN_N_HIGHPARTS (FN, int32x4_t, int16x8_t, s16) \ + TEST_TERN_N_HIGHPARTS (FN, uint32x4_t, uint16x8_t, u16) \ + TEST_TERN_N_HIGHPARTS (FN, int64x2_t, int32x4_t, s32) \ + TEST_TERN_N_HIGHPARTS (FN, uint64x2_t, uint32x4_t, u32) + +#define TEST_TERN_VQW(FN) \ + TEST_TERN_HIGHPARTS (FN, int16x8_t, 
int8x16_t, int8x8_t, s8) \ + TEST_TERN_HIGHPARTS (FN, uint16x8_t, uint8x16_t, uint8x8_t, u8) \ + TEST_TERN_HIGHPARTS (FN, int32x4_t, int16x8_t, int16x4_t, s16) \ + TEST_TERN_HIGHPARTS (FN, uint32x4_t, uint16x8_t, uint16x4_t, u16) \ + TEST_TERN_HIGHPARTS (FN, int64x2_t, int32x4_t, int32x2_t, s32) \ + TEST_TERN_HIGHPARTS (FN, uint64x2_t, uint32x4_t, uint32x2_t, u32) + +#define TEST_VQDMULL \ + TEST_BIN_HIGHPARTS (vqdmull, int32x4_t, int16x8_t, int16x4_t, s16) \ + TEST_BIN_HIGHPARTS (vqdmull, int64x2_t, int32x4_t, int32x2_t, s32) + +#define TEST_VQDMULL_N \ + TEST_BIN_N_HIGHPARTS (vqdmull_n, int32x4_t, int16x8_t, s16) \ + TEST_BIN_N_HIGHPARTS (vqdmull_n, int64x2_t, int32x4_t, s32) + +#define TEST_VQMLAL \ + TEST_TERN_HIGHPARTS (vqdmlal, int32x4_t, int16x8_t, int16x4_t, s16) \ + TEST_TERN_HIGHPARTS (vqdmlal, int64x2_t, int32x4_t, int32x2_t, s32) + +#define TEST_VQMLAL_N \ + TEST_TERN_N_HIGHPARTS (vqdmlal_n, int32x4_t, int16x8_t, s16) \ + TEST_TERN_N_HIGHPARTS (vqdmlal_n, int64x2_t, int32x4_t, s32) + +#define TEST_VQMLSL \ + TEST_TERN_HIGHPARTS (vqdmlsl, int32x4_t, int16x8_t, int16x4_t, s16) \ + TEST_TERN_HIGHPARTS (vqdmlsl, int64x2_t, int32x4_t, int32x2_t, s32) + +#define TEST_VQMLSL_N \ + TEST_TERN_N_HIGHPARTS (vqdmlsl_n, int32x4_t, int16x8_t, s16) \ + TEST_TERN_N_HIGHPARTS (vqdmlsl_n, int64x2_t, int32x4_t, s32) + +#define TEST_VMOVL \ + TEST_UN_VQW (vmovl) + +#define TEST_VCVT \ + TEST_UN_HIGHPARTS (vcvt_f32, float32x4_t, float16x8_t, f16) \ + TEST_UN_HIGHPARTS (vcvt_f32, float32x4_t, bfloat16x8_t, bf16) \ + TEST_UN_HIGHPARTS (vcvt_f64, float64x2_t, float32x4_t, f32) + +#define TEST_VMULL \ + TEST_BIN_VQW (vmull) + +#define TEST_VMULL_N \ + TEST_BIN_N_VQW (vmull_n) + +#define TEST_VADDL \ + TEST_BIN_VQW (vaddl) + +#define TEST_VSUBL \ + TEST_BIN_VQW (vsubl) + +#define TEST_VMLAL \ + TEST_TERN_VQW (vmlal) + +#define TEST_VMLAL_N \ + TEST_TERN_N_VQW (vmlal_n) + +#define TEST_VMLSL \ + TEST_TERN_VQW (vmlsl) + +#define TEST_VMLSL_N \ + TEST_TERN_N_VQW (vmlsl_n) + 
+#define TEST_VABDL \ + TEST_BIN_VQW (vabdl) + +#define TEST_VABAL \ + TEST_TERN_VQW (vabal) + +#define TEST_VSUBW \ + TEST_BIN_W_VQW (vsubw) + +#define TEST_VADDW \ + TEST_BIN_W_VQW (vaddw) + +/* +** test_vmovl_s8: +** sxtl2 v0\.8h, v0\.16b +** ret +*/ + +/* +** test_vmovl_u8: +** uxtl2 v0\.8h, v0\.16b +** ret +*/ + +/* +** test_vmovl_s16: +** sxtl2 v0\.4s, v0\.8h +** ret +*/ + +/* +** test_vmovl_u16: +** uxtl2 v0\.4s, v0\.8h +** ret +*/ + +/* +** test_vmovl_s32: +** sxtl2 v0\.2d, v0\.4s +** ret +*/ + +/* +** test_vmovl_u32: +** uxtl2 v0\.2d, v0\.4s +** ret +*/ + +TEST_VMOVL + +/* +** test_vcvt_f32_f16: +** fcvtl2 v0\.4s, v0\.8h +** ret +*/ + +/* +** test_vcvt_f32_bf16: +** shll2 v0\.4s, v0\.8h, #16 +** ret +*/ + +/* +** test_vcvt_f64_f32: +** fcvtl2 v0\.2d, v0\.4s +** ret +*/ + +TEST_VCVT + +/* +** test_vmull_s8: +** smull2 v0\.8h, (v0\.16b, v1\.16b|v1\.16b, v0\.16b) +** ret +*/ + +/* +** test_vmull_u8: +** umull2 v0\.8h, (v0\.16b, v1\.16b|v1\.16b, v0\.16b) +** ret +*/ + +/* +** test_vmull_s16: +** smull2 v0\.4s, (v0\.8h, v1\.8h|v1\.8h, v0\.8h) +** ret +*/ + +/* +** test_vmull_u16: +** umull2 v0\.4s, (v0\.8h, v1\.8h|v1\.8h, v0\.8h) +** ret +*/ + +/* +** test_vmull_s32: +** smull2 v0\.2d, (v0\.4s, v1\.4s|v1\.4s, v0\.4s) +** ret +*/ + +/* +** test_vmull_u32: +** umull2 v0\.2d, (v0\.4s, v1\.4s|v1\.4s, v0\.4s) +** ret +*/ + +TEST_VMULL + +/* +** test_vmull_n_s16: +** smull2 v0\.4s, v0\.8h, v0\.h\[[0-7]\] +** ret +*/ + +/* +** test_vmull_n_u16: +** umull2 v0\.4s, v0\.8h, v0\.h\[[0-7]\] +** ret +*/ + +/* +** test_vmull_n_s32: +** smull2 v0\.2d, v0\.4s, v0\.s\[[0-3]\] +** ret +*/ + +/* +** test_vmull_n_u32: +** umull2 v0\.2d, v0\.4s, v0\.s\[[0-3]\] +** ret +*/ + +TEST_VMULL_N + +/* +** test_vaddl_s8: +** saddl2 v0\.8h, (v0\.16b, v1\.16b|v1\.16b, v0\.16b) +** ret +*/ + +/* +** test_vaddl_u8: +** uaddl2 v0\.8h, (v0\.16b, v1\.16b|v1\.16b, v0\.16b) +** ret +*/ + +/* +** test_vaddl_s16: +** saddl2 v0\.4s, (v0\.8h, v1\.8h|v1\.8h, v0\.8h) +** ret +*/ + +/* +** test_vaddl_u16: 
+** uaddl2 v0\.4s, (v0\.8h, v1\.8h|v1\.8h, v0\.8h) +** ret +*/ + +/* +** test_vaddl_s32: +** saddl2 v0\.2d, (v0\.4s, v1\.4s|v1\.4s, v0\.4s) +** ret +*/ + +/* +** test_vaddl_u32: +** uaddl2 v0\.2d, (v0\.4s, v1\.4s|v1\.4s, v0\.4s) +** ret +*/ + +TEST_VADDL + +/* +** test_vsubl_s8: +** ssubl2 v0\.8h, v0\.16b, v1\.16b +** ret +*/ + +/* +** test_vsubl_u8: +** usubl2 v0\.8h, v0\.16b, v1\.16b +** ret +*/ + +/* +** test_vsubl_s16: +** ssubl2 v0\.4s, v0\.8h, v1\.8h +** ret +*/ + +/* +** test_vsubl_u16: +** usubl2 v0\.4s, v0\.8h, v1\.8h +** ret +*/ + +/* +** test_vsubl_s32: +** ssubl2 v0\.2d, v0\.4s, v1\.4s +** ret +*/ + +/* +** test_vsubl_u32: +** usubl2 v0\.2d, v0\.4s, v1\.4s +** ret +*/ + +TEST_VSUBL + +/* +** test_vabal_s8: +** sabal2 v0\.8h, (v1\.16b, v2\.16b|v2\.16b, v1\.16b) +** ret +*/ + +/* +** test_vabal_u8: +** uabal2 v0\.8h, (v1\.16b, v2\.16b|v2\.16b, v1\.16b) +** ret +*/ + +/* +** test_vabal_s16: +** sabal2 v0\.4s, (v1\.8h, v2\.8h|v2\.8h, v1\.8h) +** ret +*/ + +/* +** test_vabal_u16: +** uabal2 v0\.4s, (v1\.8h, v2\.8h|v2\.8h, v1\.8h) +** ret +*/ + +/* +** test_vabal_s32: +** sabal2 v0\.2d, (v1\.4s, v2\.4s|v2\.4s, v1\.4s) +** ret +*/ + +/* +** test_vabal_u32: +** uabal2 v0\.2d, (v1\.4s, v2\.4s|v2\.4s, v1\.4s) +** ret +*/ + +TEST_VABAL + +/* +** test_vsubw_s8: +** ssubw2 v0\.8h, v0\.8h, v1\.16b +** ret +*/ + +/* +** test_vsubw_u8: +** usubw2 v0\.8h, v0\.8h, v1\.16b +** ret +*/ + +/* +** test_vsubw_s16: +** ssubw2 v0\.4s, v0\.4s, v1\.8h +** ret +*/ + +/* +** test_vsubw_u16: +** usubw2 v0\.4s, v0\.4s, v1\.8h +** ret +*/ + +/* +** test_vsubw_s32: +** ssubw2 v0\.2d, v0\.2d, v1\.4s +** ret +*/ + +/* +** test_vsubw_u32: +** usubw2 v0\.2d, v0\.2d, v1\.4s +** ret +*/ + +TEST_VSUBW + +/* +** test_vaddw_s8: +** saddw2 v0\.8h, v0\.8h, v1\.16b +** ret +*/ + +/* +** test_vaddw_u8: +** uaddw2 v0\.8h, v0\.8h, v1\.16b +** ret +*/ + +/* +** test_vaddw_s16: +** saddw2 v0\.4s, v0\.4s, v1\.8h +** ret +*/ + +/* +** test_vaddw_u16: +** uaddw2 v0\.4s, v0\.4s, v1\.8h +** ret +*/ + +/* 
+** test_vaddw_s32: +** saddw2 v0\.2d, v0\.2d, v1\.4s +** ret +*/ + +/* +** test_vaddw_u32: +** uaddw2 v0\.2d, v0\.2d, v1\.4s +** ret +*/ + +TEST_VADDW + +/* +** test_vabdl_s8: +** sabdl2 v0\.8h, (v0\.16b, v1\.16b|v1\.16b, v0\.16b) +** ret +*/ + +/* +** test_vabdl_u8: +** uabdl2 v0\.8h, (v0\.16b, v1\.16b|v1\.16b, v0\.16b) +** ret +*/ + +/* +** test_vabdl_s16: +** sabdl2 v0\.4s, (v0\.8h, v1\.8h|v1\.8h, v0\.8h) +** ret +*/ + +/* +** test_vabdl_u16: +** uabdl2 v0\.4s, (v0\.8h, v1\.8h|v1\.8h, v0\.8h) +** ret +*/ + +/* +** test_vabdl_s32: +** sabdl2 v0\.2d, (v0\.4s, v1\.4s|v1\.4s, v0\.4s) +** ret +*/ + +/* +** test_vabdl_u32: +** uabdl2 v0\.2d, (v0\.4s, v1\.4s|v1\.4s, v0\.4s) +** ret +*/ + +TEST_VABDL + +/* +** test_vmlal_s8: +** smlal2 v0\.8h, (v1\.16b, v2\.16b|v2\.16b, v1\.16b) +** ret +*/ + +/* +** test_vmlal_u8: +** umlal2 v0\.8h, (v1\.16b, v2\.16b|v2\.16b, v1\.16b) +** ret +*/ + +/* +** test_vmlal_s16: +** smlal2 v0\.4s, (v1\.8h, v2\.8h|v2\.8h, v1\.8h) +** ret +*/ + +/* +** test_vmlal_u16: +** umlal2 v0\.4s, (v1\.8h, v2\.8h|v2\.8h, v1\.8h) +** ret +*/ + +/* +** test_vmlal_s32: +** smlal2 v0\.2d, (v1\.4s, v2\.4s|v2\.4s, v1\.4s) +** ret +*/ + +/* +** test_vmlal_u32: +** umlal2 v0\.2d, (v1\.4s, v2\.4s|v2\.4s, v1\.4s) +** ret +*/ + +TEST_VMLAL + +/* +** test_vmlal_n_s16: +** smlal2 v0\.4s, v1\.8h, v1\.h\[[0-7]\] +** ret +*/ + +/* +** test_vmlal_n_u16: +** umlal2 v0\.4s, v1\.8h, v1\.h\[[0-7]\] +** ret +*/ + +/* +** test_vmlal_n_s32: +** smlal2 v0\.2d, v1\.4s, v1\.s\[[0-3]\] +** ret +*/ + +/* +** test_vmlal_n_u32: +** umlal2 v0\.2d, v1\.4s, v1\.s\[[0-3]\] +** ret +*/ + +TEST_VMLAL_N + +/* +** test_vmlsl_s8: +** smlsl2 v0\.8h, v1\.16b, v2\.16b +** ret +*/ + +/* +** test_vmlsl_u8: +** umlsl2 v0\.8h, v1\.16b, v2\.16b +** ret +*/ + +/* +** test_vmlsl_s16: +** smlsl2 v0\.4s, v1\.8h, v2\.8h +** ret +*/ + +/* +** test_vmlsl_u16: +** umlsl2 v0\.4s, v1\.8h, v2\.8h +** ret +*/ + +/* +** test_vmlsl_s32: +** smlsl2 v0\.2d, v1\.4s, v2\.4s +** ret +*/ + +/* +** test_vmlsl_u32: +** 
umlsl2 v0\.2d, v1\.4s, v2\.4s +** ret +*/ + +TEST_VMLSL + +/* +** test_vmlsl_n_s16: +** smlsl2 v0\.4s, v1\.8h, v1\.h\[[0-7]\] +** ret +*/ + +/* +** test_vmlsl_n_u16: +** umlsl2 v0\.4s, v1\.8h, v1\.h\[[0-7]\] +** ret +*/ + +/* +** test_vmlsl_n_s32: +** smlsl2 v0\.2d, v1\.4s, v1\.s\[[0-3]\] +** ret +*/ + +/* +** test_vmlsl_n_u32: +** umlsl2 v0\.2d, v1\.4s, v1\.s\[[0-3]\] +** ret +*/ + +TEST_VMLSL_N + +/* +** test_vqdmull_s16: +** sqdmull2 v0\.4s, (v0\.8h, v1\.8h|v1\.8h, v0\.8h) +** ret +*/ + +/* +** test_vqdmull_s32: +** sqdmull2 v0\.2d, (v0\.4s, v1\.4s|v1\.4s, v0\.4s) +** ret +*/ + +TEST_VQDMULL + +/* +** test_vqdmull_n_s16: +** sqdmull2 v0\.4s, v0\.8h, v0\.h\[[0-7]\] +** ret +*/ + +/* +** test_vqdmull_n_s32: +** sqdmull2 v0\.2d, v0\.4s, v0\.s\[[0-3]\] +** ret +*/ + +TEST_VQDMULL_N + +/* +** test_vqdmlal_s16: +** sqdmlal2 v0\.4s, (v1\.8h, v2\.8h|v2\.8h, v1\.8h) +** ret +*/ + +/* +** test_vqdmlal_s32: +** sqdmlal2 v0\.2d, (v1\.4s, v2\.4s|v2\.4s, v1\.4s) +** ret +*/ + +TEST_VQMLAL + +/* +** test_vqdmlal_n_s16: +** sqdmlal2 v0\.4s, v1\.8h, v1\.h\[[0-7]\] +** ret +*/ + +/* +** test_vqdmlal_n_s32: +** sqdmlal2 v0\.2d, v1\.4s, v1\.s\[[0-3]\] +** ret +*/ + +TEST_VQMLAL_N + +/* +** test_vqdmlsl_s16: +** sqdmlsl2 v0\.4s, v1\.8h, v2\.8h +** ret +*/ + +/* +** test_vqdmlsl_s32: +** sqdmlsl2 v0\.2d, v1\.4s, v2\.4s +** ret +*/ + +TEST_VQMLSL + +/* +** test_vqdmlsl_n_s16: +** sqdmlsl2 v0\.4s, v1\.8h, v1\.h\[[0-7]\] +** ret +*/ + +/* +** test_vqdmlsl_n_s32: +** sqdmlsl2 v0\.2d, v1\.4s, v1\.s\[[0-3]\] +** ret +*/ + +TEST_VQMLSL_N + +/* { dg-final { check-function-bodies "**" ""} } */ diff --git a/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_2.c b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_2.c new file mode 100644 index 00000000000..2dd3eb3268c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_2.c @@ -0,0 +1,86 @@ +/* { dg-do compile } */ +/* { dg-options "-O -march=armv9-a+bf16" } */ + +/* Don't fold to the hi builtin unless at least 
one argument is a true + highpart (not that of a VECTOR_CST). */ + +#define TEST_UN_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF) \ + RETTYPE test_ ## FN ## _ ## SUFF () \ + { \ + INTYPE a = vdupq_n_ ## SUFF (0x1A); \ + return FN ## _ ## SUFF (vget_high_ ## SUFF (a)); \ + } + +#define TEST_BIN_W_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF) \ + RETTYPE test_ ## FN ## _ ## SUFF (RETTYPE a) \ + { \ + INTYPE b = vdupq_n_ ## SUFF (0x1A); \ + return FN ## _ ## SUFF (a, vget_high_ ## SUFF (b)); \ + } + +#define TEST_BIN_N_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF) \ + RETTYPE test_ ## FN ## _ ## SUFF (INTYPE c) \ + { \ + INTYPE a = vdupq_n_ ## SUFF (0x1A); \ + return FN ## _ ## SUFF (vget_high_ ## SUFF (a), c[1]); \ + } \ + +#define TEST_TERN_N_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF) \ + RETTYPE test_ ## FN ## _ ## SUFF (RETTYPE a) \ + { \ + INTYPE b = vdupq_n_ ## SUFF (0x1A); \ + return FN ## _ ## SUFF (a, vget_high_ ## SUFF (b), b[1]); \ + } \ + +#define TEST_BIN_HIGHPARTS(FN, RETTYPE, INTYPE, H_INTYPE, SUFF) \ + RETTYPE test_ ## FN ## _ ## SUFF (H_INTYPE b) \ + { \ + INTYPE a = vdupq_n_ ## SUFF (0x1A); \ + return FN ## _ ## SUFF (vget_high_ ## SUFF (a), b); \ + } \ + +#define TEST_TERN_HIGHPARTS(FN, RETTYPE, INTYPE, H_INTYPE, SUFF) \ + RETTYPE test_ ## FN ## _ ## SUFF (RETTYPE a, H_INTYPE b) \ + { \ + INTYPE c = vdupq_n_ ## SUFF (0x1A); \ + return FN ## _ ## SUFF (a, vget_high_ ## SUFF (c), b); \ + } \ + +#include "fold_to_highpart_1.c" + + +/* { dg-final { scan-assembler-not {uxtl2\t} } } */ +/* { dg-final { scan-assembler-not {sxtl2\t} } } */ + +/* { dg-final { scan-assembler-not {fcvtl2\t} } } */ +/* { dg-final { scan-assembler-not {shll2\t} } } */ + +/* { dg-final { scan-assembler-not {umull2\t} } } */ +/* { dg-final { scan-assembler-not {smull2\t} } } */ + +/* { dg-final { scan-assembler-not {uaddl2\t} } } */ +/* { dg-final { scan-assembler-not {saddl2\t} } } */ + +/* { dg-final { scan-assembler-not {usubl2\t} } } */ +/* { dg-final { scan-assembler-not {ssubl2\t} } } */ + +/* { dg-final { 
scan-assembler-not {uabdl2\t} } } */ +/* { dg-final { scan-assembler-not {sabdl2\t} } } */ + +/* { dg-final { scan-assembler-not {usubw2\t} } } */ +/* { dg-final { scan-assembler-not {ssubw2\t} } } */ + +/* { dg-final { scan-assembler-not {uaddw2\t} } } */ +/* { dg-final { scan-assembler-not {saddw2\t} } } */ + +/* { dg-final { scan-assembler-not {umlal2\t} } } */ +/* { dg-final { scan-assembler-not {smlal2\t} } } */ + +/* { dg-final { scan-assembler-not {umlsl2\t} } } */ +/* { dg-final { scan-assembler-not {smlsl2\t} } } */ + +/* { dg-final { scan-assembler-not {sqdmull2\t} } } */ + +/* { dg-final { scan-assembler-not {sqdmlal2\t} } } */ + +/* { dg-final { scan-assembler-not {sqdmlsl2\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_3.c b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_3.c new file mode 100644 index 00000000000..07c79ca1608 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_3.c @@ -0,0 +1,81 @@ +/* { dg-do compile } */ +/* { dg-options "-O" } */ + +/* PR117850 */ + +/* For builtins with multiple lo arguments, prefer the hi builtin if + at least one is a true highpart and all others are VECTOR_CSTs. 
*/ + +#define TEST_UN_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF) +#define TEST_BIN_W_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF) +#define TEST_BIN_N_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF) +#define TEST_TERN_N_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF) + +#define TEST_BIN_HIGHPART_A1(FN, RETTYPE, INTYPE, SUFF) \ + RETTYPE test_a1_ ## FN ## _ ## SUFF (INTYPE a) \ + { \ + INTYPE b = vdupq_n_ ## SUFF (0x1A); \ + return FN ## _ ## SUFF (vget_high_ ## SUFF (a), \ + vget_high_ ## SUFF (b)); \ + } + +#define TEST_BIN_HIGHPART_A2(FN, RETTYPE, INTYPE, SUFF) \ + RETTYPE test_a2_ ## FN ## _ ## SUFF (INTYPE a) \ + { \ + INTYPE b = vdupq_n_ ## SUFF (0x1A); \ + return FN ## _ ## SUFF (vget_high_ ## SUFF (b), \ + vget_high_ ## SUFF (a)); \ + } + +#define TEST_TERN_HIGHPART_A1(FN, RETTYPE, INTYPE, SUFF) \ + RETTYPE test_a1_ ## FN ## _ ## SUFF (RETTYPE a, INTYPE b) \ + { \ + INTYPE c = vdupq_n_ ## SUFF (0x1A); \ + return FN ## _ ## SUFF (a, vget_high_ ## SUFF (b), \ + vget_high_ ## SUFF (c)); \ + } + +#define TEST_TERN_HIGHPART_A2(FN, RETTYPE, INTYPE, SUFF) \ + RETTYPE test_a2_ ## FN ## _ ## SUFF (RETTYPE a, INTYPE b) \ + { \ + INTYPE c = vdupq_n_ ## SUFF (0x1A); \ + return FN ## _ ## SUFF (a, vget_high_ ## SUFF (c), \ + vget_high_ ## SUFF (b)); \ + } + +#define TEST_BIN_HIGHPARTS(FN, RETTYPE, INTYPE, H_INTYPE, SUFF) \ + TEST_BIN_HIGHPART_A1 (FN, RETTYPE, INTYPE, SUFF) \ + TEST_BIN_HIGHPART_A2 (FN, RETTYPE, INTYPE, SUFF) + +#define TEST_TERN_HIGHPARTS(FN, RETTYPE, INTYPE, H_INTYPE, SUFF) \ + TEST_TERN_HIGHPART_A1 (FN, RETTYPE, INTYPE, SUFF) \ + TEST_TERN_HIGHPART_A2 (FN, RETTYPE, INTYPE, SUFF) + + +#include "fold_to_highpart_1.c" + +/* { dg-final { scan-assembler-not {dup\t} } } */ + +/* { dg-final { scan-assembler-times {smull2\t} 6} } */ +/* { dg-final { scan-assembler-times {umull2\t} 6} } */ + +/* { dg-final { scan-assembler-times {saddl2\t} 6} } */ +/* { dg-final { scan-assembler-times {uaddl2\t} 6} } */ + +/* { dg-final { scan-assembler-times {ssubl2\t} 6} } */ +/* { dg-final { 
scan-assembler-times {usubl2\t} 6} } */ + +/* { dg-final { scan-assembler-times {sabdl2\t} 6} } */ +/* { dg-final { scan-assembler-times {uabdl2\t} 6} } */ + +/* { dg-final { scan-assembler-times {smlal2\t} 6} } */ +/* { dg-final { scan-assembler-times {umlal2\t} 6} } */ + +/* { dg-final { scan-assembler-times {smlsl2\t} 6} } */ +/* { dg-final { scan-assembler-times {umlsl2\t} 6} } */ + +/* { dg-final { scan-assembler-times {sqdmull2\t} 4} } */ + +/* { dg-final { scan-assembler-times {sqdmlal2\t} 4} } */ + +/* { dg-final { scan-assembler-times {sqdmlsl2\t} 4} } */ diff --git a/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_4.c b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_4.c new file mode 100644 index 00000000000..f77b2355fcf --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_4.c @@ -0,0 +1,77 @@ +/* { dg-do compile } */ +/* { dg-options "-O" } */ + +/* For builtins with multiple lo arguments, prefer the hi builtin if + at least one is a true highpart and all others are VECTOR_CSTs. 
*/ + +#define VEC_64b 0x1A2E4A4FFFED773E + +#define TEST_UN_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF) +#define TEST_BIN_W_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF) +#define TEST_BIN_N_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF) +#define TEST_TERN_N_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF) + +#define TEST_BIN_HIGHPART_A1(FN, RETTYPE, INTYPE, SUFF) \ + RETTYPE test_a1_ ## FN ## _ ## SUFF (INTYPE a) \ + { \ + return FN ## _ ## SUFF (vget_high_ ## SUFF (a), \ + vcreate_ ## SUFF (VEC_64b)); \ + } + +#define TEST_BIN_HIGHPART_A2(FN, RETTYPE, INTYPE, SUFF) \ + RETTYPE test_a2_ ## FN ## _ ## SUFF (INTYPE a) \ + { \ + return FN ## _ ## SUFF (vcreate_ ## SUFF (VEC_64b), \ + vget_high_ ## SUFF (a)); \ + } + +#define TEST_TERN_HIGHPART_A1(FN, RETTYPE, INTYPE, SUFF) \ + RETTYPE test_a1_ ## FN ## _ ## SUFF (RETTYPE a, INTYPE b) \ + { \ + return FN ## _ ## SUFF (a, vget_high_ ## SUFF (b), \ + vcreate_ ## SUFF (VEC_64b)); \ + } + +#define TEST_TERN_HIGHPART_A2(FN, RETTYPE, INTYPE, SUFF) \ + RETTYPE test_a2_ ## FN ## _ ## SUFF (RETTYPE a, INTYPE b) \ + { \ + return FN ## _ ## SUFF (a, vcreate_ ## SUFF (VEC_64b), \ + vget_high_ ## SUFF (b)); \ + } + +#define TEST_BIN_HIGHPARTS(FN, RETTYPE, INTYPE, H_INTYPE, SUFF) \ + TEST_BIN_HIGHPART_A1 (FN, RETTYPE, INTYPE, SUFF) \ + TEST_BIN_HIGHPART_A2 (FN, RETTYPE, INTYPE, SUFF) + +#define TEST_TERN_HIGHPARTS(FN, RETTYPE, INTYPE, H_INTYPE, SUFF) \ + TEST_TERN_HIGHPART_A1 (FN, RETTYPE, INTYPE, SUFF) \ + TEST_TERN_HIGHPART_A2 (FN, RETTYPE, INTYPE, SUFF) + + +#include "fold_to_highpart_1.c" + +/* { dg-final { scan-assembler-not {dup\t} } } */ + +/* { dg-final { scan-assembler-times {smull2\t} 6} } */ +/* { dg-final { scan-assembler-times {umull2\t} 6} } */ + +/* { dg-final { scan-assembler-times {saddl2\t} 6} } */ +/* { dg-final { scan-assembler-times {uaddl2\t} 6} } */ + +/* { dg-final { scan-assembler-times {ssubl2\t} 6} } */ +/* { dg-final { scan-assembler-times {usubl2\t} 6} } */ + +/* { dg-final { scan-assembler-times {sabdl2\t} 6} } */ +/* { dg-final { 
scan-assembler-times {uabdl2\t} 6} } */ + +/* { dg-final { scan-assembler-times {smlal2\t} 6} } */ +/* { dg-final { scan-assembler-times {umlal2\t} 6} } */ + +/* { dg-final { scan-assembler-times {smlsl2\t} 6} } */ +/* { dg-final { scan-assembler-times {umlsl2\t} 6} } */ + +/* { dg-final { scan-assembler-times {sqdmull2\t} 4} } */ + +/* { dg-final { scan-assembler-times {sqdmlal2\t} 4} } */ + +/* { dg-final { scan-assembler-times {sqdmlsl2\t} 4} } */ diff --git a/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_5.c b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_5.c new file mode 100644 index 00000000000..046c7a00def --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_5.c @@ -0,0 +1,38 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target aarch64_little_endian } */ +/* { dg-options "-O -fdump-tree-optimized" } */ + +#include "arm_neon.h" + +#define VEC_CST_u8 0x0102030405060708 +#define VEC_CST_u16 0x0001000200030004 +#define VEC_CST_u32 0x0000000100000002 + +/* Extend the 64b VECTOR_CST to the type required by the hi builtin. 
*/ + +uint16x8_t +test_u8 (uint8x16_t a) +{ + const uint8x8_t b = vcreate_u8 (VEC_CST_u8); + return vmull_u8 (vget_high_u8 (a), b); +} + +/* { dg-final { scan-tree-dump-times "\{ 8, 7, 6, 5, 4, 3, 2, 1, 8, 7, 6, 5, 4, 3, 2, 1 \}" 1 "optimized" } } */ + +uint32x4_t +test_u16 (uint16x8_t a) +{ + const uint16x4_t b = vcreate_u16 (VEC_CST_u16); + return vmull_u16 (vget_high_u16 (a), b); +} + +/* { dg-final { scan-tree-dump-times "\{ 4, 3, 2, 1, 4, 3, 2, 1 \}" 1 "optimized" } } */ + +uint64x2_t +test_u32 (uint32x4_t a) +{ + const uint32x2_t b = vcreate_u32 (VEC_CST_u32); + return vmull_u32 (vget_high_u32 (a), b); +} + +/* { dg-final { scan-tree-dump-times "\{ 2, 1, 2, 1 \}" 1 "optimized" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_6.c b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_6.c new file mode 100644 index 00000000000..5d41cc4e5fd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_6.c @@ -0,0 +1,94 @@ +/* { dg-do compile } */ +/* { dg-options "-O -march=armv9-a+bf16" } */ + +/* Test that we can still fold when the base type of the vector whose + highpart we are referring to is incompatible with that of the hi builtin. + + Use float64x2_t as it is never INTYPE. 
*/ + +#define TEST_UN_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF) \ + RETTYPE test_ ## FN ## _ ## SUFF (float64x2_t a) \ + { \ + INTYPE x = vreinterpretq_ ## SUFF ## _f64 (a); \ + return FN ## _ ## SUFF (vget_high_ ## SUFF (x)); \ + } + +#define TEST_BIN_W_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF) \ + RETTYPE test_ ## FN ## _ ## SUFF (RETTYPE a, float64x2_t b) \ + { \ + INTYPE x = vreinterpretq_ ## SUFF ## _f64 (b); \ + return FN ## _ ## SUFF (a, vget_high_ ## SUFF (x)); \ + } + +#define TEST_BIN_N_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF) \ + RETTYPE test_ ## FN ## _ ## SUFF (float64x2_t a) \ + { \ + INTYPE x = vreinterpretq_ ## SUFF ## _f64 (a); \ + return FN ## _ ## SUFF (vget_high_ ## SUFF (x), x[1]); \ + } + +#define TEST_TERN_N_HIGHPARTS(FN, RETTYPE, INTYPE, SUFF) \ + RETTYPE test_ ## FN ## _ ## SUFF (RETTYPE a, float64x2_t b) \ + { \ + INTYPE x = vreinterpretq_ ## SUFF ## _f64 (b); \ + return FN ## _ ## SUFF (a, vget_high_ ## SUFF (x), x[1]); \ + } + +#define TEST_BIN_HIGHPARTS(FN, RETTYPE, INTYPE, H_INTYPE, SUFF) \ + RETTYPE test_ ## FN ## _ ## SUFF (float64x2_t a, float64x2_t b) \ + { \ + INTYPE x = vreinterpretq_ ## SUFF ## _f64 (a); \ + INTYPE y = vreinterpretq_ ## SUFF ## _f64 (b); \ + return FN ## _ ## SUFF (vget_high_ ## SUFF (x), \ + vget_high_ ## SUFF (y)); \ + } + +#define TEST_TERN_HIGHPARTS(FN, RETTYPE, INTYPE, H_INTYPE, SUFF) \ + RETTYPE test_ ## FN ## _ ## SUFF (RETTYPE a, float64x2_t b, float64x2_t c) \ + { \ + INTYPE x = vreinterpretq_ ## SUFF ## _f64 (b); \ + INTYPE y = vreinterpretq_ ## SUFF ## _f64 (c); \ + return FN ## _ ## SUFF (a, vget_high_ ## SUFF (x), \ + vget_high_ ## SUFF (y)); \ + } + +#include "fold_to_highpart_1.c" + +/* { dg-final { scan-assembler-times {sxtl2\t} 3} } */ +/* { dg-final { scan-assembler-times {uxtl2\t} 3} } */ + +/* { dg-final { scan-assembler-times {fcvtl2\t} 2} } */ +/* { dg-final { scan-assembler-times {shll2\t} 1} } */ + +/* { dg-final { scan-assembler-times {smull2\t} 5} } */ +/* { dg-final { scan-assembler-times {umull2\t} 
5} } */ + +/* { dg-final { scan-assembler-times {saddl2\t} 3} } */ +/* { dg-final { scan-assembler-times {uaddl2\t} 3} } */ + +/* { dg-final { scan-assembler-times {ssubl2\t} 3} } */ +/* { dg-final { scan-assembler-times {usubl2\t} 3} } */ + +/* { dg-final { scan-assembler-times {sabdl2\t} 3} } */ +/* { dg-final { scan-assembler-times {uabdl2\t} 3} } */ + +/* { dg-final { scan-assembler-times {saddw2\t} 3} } */ +/* { dg-final { scan-assembler-times {uaddw2\t} 3} } */ + +/* { dg-final { scan-assembler-times {ssubw2\t} 3} } */ +/* { dg-final { scan-assembler-times {usubw2\t} 3} } */ + +/* { dg-final { scan-assembler-times {sabdl2\t} 3} } */ +/* { dg-final { scan-assembler-times {uabdl2\t} 3} } */ + +/* { dg-final { scan-assembler-times {smlal2\t} 5} } */ +/* { dg-final { scan-assembler-times {umlal2\t} 5} } */ + +/* { dg-final { scan-assembler-times {smlsl2\t} 5} } */ +/* { dg-final { scan-assembler-times {umlsl2\t} 5} } */ + +/* { dg-final { scan-assembler-times {sqdmull2\t} 4} } */ + +/* { dg-final { scan-assembler-times {sqdmlal2\t} 4} } */ + +/* { dg-final { scan-assembler-times {sqdmlsl2\t} 4} } */ diff --git a/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_7.c b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_7.c new file mode 100644 index 00000000000..a8daa46ce76 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_7.c @@ -0,0 +1,36 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target aarch64_little_endian } */ +/* { dg-options "-O2 -march=armv8-a+sve" } */ + +#include <arm_neon_sve_bridge.h> + +typedef int32_t int32x8_t __attribute__ ((vector_size (32))); +typedef int16_t int16x16_t __attribute__ ((vector_size (32))); + +/* Edge cases where we don't/can't fold, reject these gracefully. 
*/ + +int16x8_t +test_sizeless_type (svint8_t scalable) +{ + return vmovl_s8 (vget_high_s8 (svget_neonq_s8 (scalable))); +} + +int16x8_t +test_scalar_type (poly128_t foo) +{ + return vmovl_s8 (vget_high_s8 (vreinterpretq_s8_p128 (foo))); +} + +int32x4_t +test_256b_type_1 (int16x16_t foo) +{ + return vmovl_s16 ((int16x4_t) { foo[4], foo[5], foo[6], foo[7] }); +} + +int64x2_t +test_256b_type_2 (int32x8_t foo) +{ + return vmovl_s32 (vget_high_s32 ((int32x4_t) {foo[0], foo[1], foo[2], foo[3]})); +} + +/* { dg-final { scan-assembler-not {sxtl2\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vabal_combine.c b/gcc/testsuite/gcc.target/aarch64/simd/vabal_combine.c deleted file mode 100644 index c51878aa226..00000000000 --- a/gcc/testsuite/gcc.target/aarch64/simd/vabal_combine.c +++ /dev/null @@ -1,72 +0,0 @@ -/* { dg-do compile } */ -/* { dg-options "-O" } */ -/* { dg-final { check-function-bodies "**" "" "" } } */ - -#include <arm_neon.h> - -/* -** test_vabal_s8: -** sabal2 v0.8h, v2.16b, v1.16b -** ret -*/ -int16x8_t -test_vabal_s8 (int16x8_t sadv, int8x16_t pv, int8x16_t sv) -{ - return vabal_s8 (sadv, vget_high_s8 (pv), vget_high_s8 (sv)); -} - -/* -** test_vabal_u8: -** uabal2 v0.8h, v2.16b, v1.16b -** ret -*/ -uint16x8_t -test_vabal_u8 (uint16x8_t sadv, uint8x16_t pv, uint8x16_t sv) -{ - return vabal_u8 (sadv, vget_high_u8 (pv), vget_high_u8 (sv)); -} - -/* -** test_vabal_s16: -** sabal2 v0.4s, v2.8h, v1.8h -** ret -*/ -int32x4_t -test_vabal_s16 (int32x4_t sadv, int16x8_t pv, int16x8_t sv) -{ - return vabal_s16 (sadv, vget_high_s16 (pv), vget_high_s16 (sv)); -} - -/* -** test_vabal_u16: -** uabal2 v0.4s, v2.8h, v1.8h -** ret -*/ -uint32x4_t -test_vabal_u16 (uint32x4_t sadv, uint16x8_t pv, uint16x8_t sv) -{ - return vabal_u16 (sadv, vget_high_u16 (pv), vget_high_u16 (sv)); -} - -/* -** test_vabal_s32: -** sabal2 v0.2d, v2.4s, v1.4s -** ret -*/ -int64x2_t -test_vabal_s32 (int64x2_t sadv, int32x4_t pv, int32x4_t sv) -{ - return vabal_s32 (sadv, 
vget_high_s32 (pv), vget_high_s32 (sv)); -} - -/* -** test_vabal_u32: -** uabal2 v0.2d, v2.4s, v1.4s -** ret -*/ -uint64x2_t -test_vabal_u32 (uint64x2_t sadv, uint32x4_t pv, uint32x4_t sv) -{ - return vabal_u32 (sadv, vget_high_u32 (pv), vget_high_u32 (sv)); -} - -- 2.34.1