This patch adds the bare minimum needed to support autovectorisation of partial SVE vectors, namely moves and integer addition. Later patches add more interesting cases.
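For illustration, here is a minimal sketch of the kind of loop the new modes are aimed at, modelled on the mixed_size_1.c test added by the patch (the function name is mine, not part of the patch). With -O2 -ftree-vectorize, the uint16_t statement can use a full VNx8HI vector while the uint8_t statement uses the partial VNx8QI mode (8-bit elements in 16-bit containers, accessed with ld1b/st1b on .h), so both statements stay in a single vectorised loop:

#include <stdint.h>

/* Sketch only: mirrors the uint16_t/uint8_t instantiation of the
   TEST_LOOP macro in mixed_size_1.c.  The wider accesses should use a
   full SVE vector and the narrower ones a partial (unpacked) vector
   with the same number of elements.  */
void
add_and_copy (uint16_t *restrict dst1, uint16_t *restrict src1,
              uint8_t *restrict dst2, uint8_t *restrict src2, int n)
{
  for (int i = 0; i < n; ++i)
    {
      dst1[i] += src1[i];
      dst2[i] = src2[i];
    }
}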
Tested on aarch64-linux-gnu and applied as r278341.

Richard


2019-11-16  Richard Sandiford  <richard.sandif...@arm.com>

gcc/
	* config/aarch64/aarch64-modes.def: Define partial SVE vector
	float modes.
	* config/aarch64/aarch64-protos.h (aarch64_sve_pred_mode): New
	function.
	* config/aarch64/aarch64.c (aarch64_classify_vector_mode): Handle the
	new vector float modes.
	(aarch64_sve_container_bits): New function.
	(aarch64_sve_pred_mode): Likewise.
	(aarch64_get_mask_mode): Use it.
	(aarch64_sve_element_int_mode): Handle structure modes and partial
	modes.
	(aarch64_sve_container_int_mode): New function.
	(aarch64_vectorize_related_mode): Return SVE modes when given
	SVE modes.  Handle partial modes, taking the preferred number
	of units from the size of the given mode.
	(aarch64_hard_regno_mode_ok): Allow partial modes to be stored
	in registers.
	(aarch64_expand_sve_ld1rq): Use the mode form of
	aarch64_sve_pred_mode.
	(aarch64_expand_sve_const_vector): Handle partial SVE vectors.
	(aarch64_split_sve_subreg_move): Use the mode form of
	aarch64_sve_pred_mode.
	(aarch64_secondary_reload): Handle partial modes in the same way
	as full big-endian vectors.
	(aarch64_vector_mode_supported_p): Allow partial SVE vectors.
	(aarch64_autovectorize_vector_modes): Try unpacked SVE vectors,
	merging with the Advanced SIMD modes.  If two modes have the
	same size, try the Advanced SIMD mode first.
	(aarch64_simd_valid_immediate): Use the container rather than
	the element mode for INDEX constants.
	(aarch64_simd_vector_alignment): Make the alignment of partial
	SVE vector modes the same as their minimum size.
	(aarch64_evpc_sel): Use the mode form of aarch64_sve_pred_mode.
	* config/aarch64/aarch64-sve.md (mov<SVE_FULL:mode>): Extend to...
	(mov<SVE_ALL:mode>): ...this.
	(movmisalign<SVE_FULL:mode>): Extend to...
	(movmisalign<SVE_ALL:mode>): ...this.
	(*aarch64_sve_mov<mode>_le): Rename to...
	(*aarch64_sve_mov<mode>_ldr_str): ...this.
	(*aarch64_sve_mov<SVE_FULL:mode>_be): Rename and extend to...
	(*aarch64_sve_mov<SVE_ALL:mode>_no_ldr_str): ...this.  Handle
	partial modes regardless of endianness.
	(aarch64_sve_reload_be): Rename to...
	(aarch64_sve_reload_mem): ...this and enable for little-endian.
	Use aarch64_sve_pred_mode to get the appropriate predicate mode.
	(@aarch64_pred_mov<SVE_FULL:mode>): Extend to...
	(@aarch64_pred_mov<SVE_ALL:mode>): ...this.
	(*aarch64_sve_mov<SVE_FULL:mode>_subreg_be): Extend to...
	(*aarch64_sve_mov<SVE_ALL:mode>_subreg_be): ...this.
	(@aarch64_sve_reinterpret<SVE_FULL:mode>): Extend to...
	(@aarch64_sve_reinterpret<SVE_ALL:mode>): ...this.
	(*aarch64_sve_reinterpret<SVE_FULL:mode>): Extend to...
	(*aarch64_sve_reinterpret<SVE_ALL:mode>): ...this.
	(maskload<SVE_FULL:mode><vpred>): Extend to...
	(maskload<SVE_ALL:mode><vpred>): ...this.
	(maskstore<SVE_FULL:mode><vpred>): Extend to...
	(maskstore<SVE_ALL:mode><vpred>): ...this.
	(vec_duplicate<SVE_FULL:mode>): Extend to...
	(vec_duplicate<SVE_ALL:mode>): ...this.
	(*vec_duplicate<SVE_FULL:mode>_reg): Extend to...
	(*vec_duplicate<SVE_ALL:mode>_reg): ...this.
	(sve_ld1r<SVE_FULL:mode>): Extend to...
	(sve_ld1r<SVE_ALL:mode>): ...this.
	(vec_series<SVE_FULL_I:mode>): Extend to...
	(vec_series<SVE_I:mode>): ...this.
	(*vec_series<SVE_FULL_I:mode>_plus): Extend to...
	(*vec_series<SVE_I:mode>_plus): ...this.
	(@aarch64_pred_sxt<SVE_FULL_HSDI:mode><SVE_PARTIAL_I:mode>): Avoid
	new VPRED ambiguity.
	(@aarch64_cond_sxt<SVE_FULL_HSDI:mode><SVE_PARTIAL_I:mode>): Likewise.
	(add<SVE_FULL_I:mode>3): Extend to...
	(add<SVE_I:mode>3): ...this.
* config/aarch64/iterators.md (SVE_ALL, SVE_I): New mode iterators. (Vetype, Vesize, VEL, Vel, vwcore): Handle partial SVE vector modes. (VPRED, vpred): Likewise. (Vctype): New iterator. (vw): Remove SVE modes. gcc/testsuite/ * gcc.target/aarch64/sve/mixed_size_1.c: New test. * gcc.target/aarch64/sve/mixed_size_2.c: Likewise. * gcc.target/aarch64/sve/mixed_size_3.c: Likewise. * gcc.target/aarch64/sve/mixed_size_4.c: Likewise. * gcc.target/aarch64/sve/mixed_size_5.c: Likewise. Index: gcc/config/aarch64/aarch64-modes.def =================================================================== --- gcc/config/aarch64/aarch64-modes.def 2019-10-16 11:53:03.681148277 +0100 +++ gcc/config/aarch64/aarch64-modes.def 2019-11-16 10:59:20.722514516 +0000 @@ -123,13 +123,18 @@ SVE_MODES (4, VNx64, VNx32, VNx16, VNx8) VECTOR_MODES_WITH_PREFIX (VNx, INT, 2, 1); VECTOR_MODES_WITH_PREFIX (VNx, INT, 4, 1); VECTOR_MODES_WITH_PREFIX (VNx, INT, 8, 1); +VECTOR_MODES_WITH_PREFIX (VNx, FLOAT, 4, 1); +VECTOR_MODES_WITH_PREFIX (VNx, FLOAT, 8, 1); ADJUST_NUNITS (VNx2QI, aarch64_sve_vg); ADJUST_NUNITS (VNx2HI, aarch64_sve_vg); ADJUST_NUNITS (VNx2SI, aarch64_sve_vg); +ADJUST_NUNITS (VNx2HF, aarch64_sve_vg); +ADJUST_NUNITS (VNx2SF, aarch64_sve_vg); ADJUST_NUNITS (VNx4QI, aarch64_sve_vg * 2); ADJUST_NUNITS (VNx4HI, aarch64_sve_vg * 2); +ADJUST_NUNITS (VNx4HF, aarch64_sve_vg * 2); ADJUST_NUNITS (VNx8QI, aarch64_sve_vg * 4); @@ -139,8 +144,11 @@ ADJUST_ALIGNMENT (VNx8QI, 1); ADJUST_ALIGNMENT (VNx2HI, 2); ADJUST_ALIGNMENT (VNx4HI, 2); +ADJUST_ALIGNMENT (VNx2HF, 2); +ADJUST_ALIGNMENT (VNx4HF, 2); ADJUST_ALIGNMENT (VNx2SI, 4); +ADJUST_ALIGNMENT (VNx2SF, 4); /* Quad float: 128-bit floating mode for long doubles. */ FLOAT_MODE (TF, 16, ieee_quad_format); Index: gcc/config/aarch64/aarch64-protos.h =================================================================== --- gcc/config/aarch64/aarch64-protos.h 2019-11-13 08:39:21.000000000 +0000 +++ gcc/config/aarch64/aarch64-protos.h 2019-11-16 10:59:20.722514516 +0000 @@ -512,6 +512,7 @@ bool aarch64_zero_extend_const_eq (machi bool aarch64_move_imm (HOST_WIDE_INT, machine_mode); machine_mode aarch64_sve_int_mode (machine_mode); opt_machine_mode aarch64_sve_pred_mode (unsigned int); +machine_mode aarch64_sve_pred_mode (machine_mode); opt_machine_mode aarch64_sve_data_mode (scalar_mode, poly_uint64); bool aarch64_sve_mode_p (machine_mode); HOST_WIDE_INT aarch64_fold_sve_cnt_pat (aarch64_svpattern, unsigned int); Index: gcc/config/aarch64/aarch64.c =================================================================== --- gcc/config/aarch64/aarch64.c 2019-11-16 10:43:45.589105879 +0000 +++ gcc/config/aarch64/aarch64.c 2019-11-16 10:59:20.730514460 +0000 @@ -1625,6 +1625,11 @@ aarch64_classify_vector_mode (machine_mo case E_VNx4HImode: /* Partial SVE SI vector. */ case E_VNx2SImode: + /* Partial SVE HF vectors. */ + case E_VNx2HFmode: + case E_VNx4HFmode: + /* Partial SVE SF vector. */ + case E_VNx2SFmode: return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0; case E_VNx16QImode: @@ -1753,6 +1758,22 @@ aarch64_array_mode_supported_p (machine_ return false; } +/* MODE is some form of SVE vector mode. For data modes, return the number + of vector register bits that each element of MODE occupies, such as 64 + for both VNx2DImode and VNx2SImode (where each 32-bit value is stored + in a 64-bit container). For predicate modes, return the number of + data bits controlled by each significant predicate bit. 
*/ + +static unsigned int +aarch64_sve_container_bits (machine_mode mode) +{ + unsigned int vec_flags = aarch64_classify_vector_mode (mode); + poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED) + ? BITS_PER_SVE_VECTOR + : GET_MODE_BITSIZE (mode)); + return vector_element_size (vector_bits, GET_MODE_NUNITS (mode)); +} + /* Return the SVE predicate mode to use for elements that have ELEM_NBYTES bytes, if such a mode exists. */ @@ -1773,6 +1794,16 @@ aarch64_sve_pred_mode (unsigned int elem return opt_machine_mode (); } +/* Return the SVE predicate mode that should be used to control + SVE mode MODE. */ + +machine_mode +aarch64_sve_pred_mode (machine_mode mode) +{ + unsigned int bits = aarch64_sve_container_bits (mode); + return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require (); +} + /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */ static opt_machine_mode @@ -1780,7 +1811,7 @@ aarch64_get_mask_mode (machine_mode mode { unsigned int vec_flags = aarch64_classify_vector_mode (mode); if (vec_flags & VEC_SVE_DATA) - return aarch64_sve_pred_mode (GET_MODE_UNIT_SIZE (mode)); + return aarch64_sve_pred_mode (mode); return default_get_mask_mode (mode); } @@ -1806,11 +1837,25 @@ aarch64_sve_data_mode (scalar_mode inner static scalar_int_mode aarch64_sve_element_int_mode (machine_mode mode) { - unsigned int elt_bits = vector_element_size (BITS_PER_SVE_VECTOR, + poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL + ? BITS_PER_SVE_VECTOR + : GET_MODE_BITSIZE (mode)); + unsigned int elt_bits = vector_element_size (vector_bits, GET_MODE_NUNITS (mode)); return int_mode_for_size (elt_bits, 0).require (); } +/* Return an integer element mode that contains exactly + aarch64_sve_container_bits (MODE) bits. This is wider than + aarch64_sve_element_int_mode if MODE is a partial vector, + otherwise it's the same. */ + +static scalar_int_mode +aarch64_sve_container_int_mode (machine_mode mode) +{ + return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require (); +} + /* Return the integer vector mode associated with SVE mode MODE. Unlike related_int_vector_mode, this can handle the case in which MODE is a predicate (and thus has a different total size). */ @@ -1831,6 +1876,37 @@ aarch64_vectorize_related_mode (machine_ { unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode); + /* If we're operating on SVE vectors, try to return an SVE mode. */ + poly_uint64 sve_nunits; + if ((vec_flags & VEC_SVE_DATA) + && multiple_p (BYTES_PER_SVE_VECTOR, + GET_MODE_SIZE (element_mode), &sve_nunits)) + { + machine_mode sve_mode; + if (maybe_ne (nunits, 0U)) + { + /* Try to find a full or partial SVE mode with exactly + NUNITS units. */ + if (multiple_p (sve_nunits, nunits) + && aarch64_sve_data_mode (element_mode, + nunits).exists (&sve_mode)) + return sve_mode; + } + else + { + /* Take the preferred number of units from the number of bytes + that fit in VECTOR_MODE. We always start by "autodetecting" + a full vector mode with preferred_simd_mode, so vectors + chosen here will also be full vector modes. Then + autovectorize_vector_modes tries smaller starting modes + and thus smaller preferred numbers of units. */ + sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode)); + if (aarch64_sve_data_mode (element_mode, + sve_nunits).exists (&sve_mode)) + return sve_mode; + } + } + /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. 
*/ if ((vec_flags & VEC_ADVSIMD) && known_eq (nunits, 0U) @@ -1907,11 +1983,6 @@ aarch64_hard_regno_mode_ok (unsigned reg return mode == DImode; unsigned int vec_flags = aarch64_classify_vector_mode (mode); - /* At the moment, partial vector modes are only useful for memory - references, but that could change in future. */ - if (vec_flags & VEC_PARTIAL) - return false; - if (vec_flags & VEC_SVE_PRED) return pr_or_ffr_regnum_p (regno); @@ -4015,8 +4086,7 @@ aarch64_expand_sve_ld1rq (rtx dest, rtx } machine_mode mode = GET_MODE (dest); - unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode); - machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require (); + machine_mode pred_mode = aarch64_sve_pred_mode (mode); rtx ptrue = aarch64_ptrue_reg (pred_mode); emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue)); return true; @@ -4037,7 +4107,26 @@ aarch64_expand_sve_const_vector (rtx tar unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src); scalar_mode elt_mode = GET_MODE_INNER (mode); unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode); - unsigned int encoded_bits = npatterns * nelts_per_pattern * elt_bits; + unsigned int container_bits = aarch64_sve_container_bits (mode); + unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits; + + if (nelts_per_pattern == 1 + && encoded_bits <= 128 + && container_bits != elt_bits) + { + /* We have a partial vector mode and a constant whose full-vector + equivalent would occupy a repeating 128-bit sequence. Build that + full-vector equivalent instead, so that we have the option of + using LD1RQ and Advanced SIMD operations. */ + unsigned int repeat = container_bits / elt_bits; + machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require (); + rtx_vector_builder builder (full_mode, npatterns * repeat, 1); + for (unsigned int i = 0; i < npatterns; ++i) + for (unsigned int j = 0; j < repeat; ++j) + builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i)); + target = aarch64_target_reg (target, full_mode); + return aarch64_expand_sve_const_vector (target, builder.build ()); + } if (nelts_per_pattern == 1 && encoded_bits == 128) { @@ -4730,8 +4819,7 @@ aarch64_split_sve_subreg_move (rtx dest, std::swap (mode_with_wider_elts, mode_with_narrower_elts); unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts); - unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts); - machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require (); + machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts); /* Get the operands in the appropriate modes and emit the instruction. */ ptrue = gen_lowpart (pred_mode, ptrue); @@ -9971,19 +10059,21 @@ aarch64_secondary_reload (bool in_p ATTR machine_mode mode, secondary_reload_info *sri) { - /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled - directly by the *aarch64_sve_mov<mode>_[lb]e move patterns. See the - comment at the head of aarch64-sve.md for more details about the - big-endian handling. */ - if (BYTES_BIG_ENDIAN - && reg_class_subset_p (rclass, FP_REGS) + /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use + LDR and STR. See the comment at the head of aarch64-sve.md for + more details about the big-endian handling. 
*/ + if (reg_class_subset_p (rclass, FP_REGS) && !((REG_P (x) && HARD_REGISTER_P (x)) || aarch64_simd_valid_immediate (x, NULL)) - && mode != VNx16QImode - && aarch64_sve_data_mode_p (mode)) + && mode != VNx16QImode) { - sri->icode = CODE_FOR_aarch64_sve_reload_be; - return NO_REGS; + unsigned int vec_flags = aarch64_classify_vector_mode (mode); + if ((vec_flags & VEC_SVE_DATA) + && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN)) + { + sri->icode = CODE_FOR_aarch64_sve_reload_mem; + return NO_REGS; + } } /* If we have to disable direct literal pool loads and stores because the @@ -15837,7 +15927,7 @@ aarch64_struct_value_rtx (tree fndecl AT aarch64_vector_mode_supported_p (machine_mode mode) { unsigned int vec_flags = aarch64_classify_vector_mode (mode); - return vec_flags != 0 && (vec_flags & (VEC_STRUCT | VEC_PARTIAL)) == 0; + return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0; } /* Return the full-width SVE vector mode for element mode MODE, if one @@ -15938,29 +16028,72 @@ aarch64_preferred_simd_mode (scalar_mode static unsigned int aarch64_autovectorize_vector_modes (vector_modes *modes, bool) { - if (TARGET_SVE) - modes->safe_push (VNx16QImode); - - /* Try using 128-bit vectors for all element types. */ - modes->safe_push (V16QImode); + static const machine_mode sve_modes[] = { + /* Try using full vectors for all element types. */ + VNx16QImode, + + /* Try using 16-bit containers for 8-bit elements and full vectors + for wider elements. */ + VNx8QImode, + + /* Try using 32-bit containers for 8-bit and 16-bit elements and + full vectors for wider elements. */ + VNx4QImode, - /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors - for wider elements. */ - modes->safe_push (V8QImode); - - /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors - for wider elements. + /* Try using 64-bit containers for all element types. */ + VNx2QImode + }; - TODO: We could support a limited form of V4QImode too, so that - we use 32-bit vectors for 8-bit elements. */ - modes->safe_push (V4HImode); + static const machine_mode advsimd_modes[] = { + /* Try using 128-bit vectors for all element types. */ + V16QImode, + + /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors + for wider elements. */ + V8QImode, + + /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors + for wider elements. + + TODO: We could support a limited form of V4QImode too, so that + we use 32-bit vectors for 8-bit elements. */ + V4HImode, + + /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors + for 64-bit elements. + + TODO: We could similarly support limited forms of V2QImode and V2HImode + for this case. */ + V2SImode + }; - /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors - for 64-bit elements. + /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode. + This is because: - TODO: We could similarly support limited forms of V2QImode and V2HImode - for this case. */ - modes->safe_push (V2SImode); + - If we can't use N-byte Advanced SIMD vectors then the placement + doesn't matter; we'll just continue as though the Advanced SIMD + entry didn't exist. + + - If an SVE main loop with N bytes ends up being cheaper than an + Advanced SIMD main loop with N bytes then by default we'll replace + the Advanced SIMD version with the SVE one. + + - If an Advanced SIMD main loop with N bytes ends up being cheaper + than an SVE main loop with N bytes then by default we'll try to + use the SVE loop to vectorize the epilogue instead. 
*/ + unsigned int sve_i = TARGET_SVE ? 0 : ARRAY_SIZE (sve_modes); + unsigned int advsimd_i = 0; + while (advsimd_i < ARRAY_SIZE (advsimd_modes)) + { + if (sve_i < ARRAY_SIZE (sve_modes) + && maybe_gt (GET_MODE_NUNITS (sve_modes[sve_i]), + GET_MODE_NUNITS (advsimd_modes[advsimd_i]))) + modes->safe_push (sve_modes[sve_i++]); + else + modes->safe_push (advsimd_modes[advsimd_i++]); + } + while (sve_i < ARRAY_SIZE (sve_modes)) + modes->safe_push (sve_modes[sve_i++]); unsigned int flags = 0; /* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we @@ -16507,7 +16640,14 @@ aarch64_simd_valid_immediate (rtx op, si return false; if (info) - *info = simd_immediate_info (elt_mode, base, step); + { + /* Get the corresponding container mode. E.g. an INDEX on V2SI + should yield two integer values per 128-bit block, meaning + that we need to treat it in the same way as V2DI and then + ignore the upper 32 bits of each element. */ + elt_mode = aarch64_sve_container_int_mode (mode); + *info = simd_immediate_info (elt_mode, base, step); + } return true; } else if (GET_CODE (op) == CONST_VECTOR @@ -16976,9 +17116,9 @@ aarch64_simd_vector_alignment (const_tre direct way we have of identifying real SVE predicate types. */ if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL) return 16; - if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST) - return 128; - return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi (); + widest_int min_size + = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type))); + return wi::umin (min_size, 128).to_uhwi (); } /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */ @@ -19154,7 +19294,7 @@ aarch64_evpc_sel (struct expand_vec_perm if (d->testing_p) return true; - machine_mode pred_mode = aarch64_sve_pred_mode (unit_size).require (); + machine_mode pred_mode = aarch64_sve_pred_mode (vmode); rtx_vector_builder builder (pred_mode, n_patterns, 2); for (int i = 0; i < n_patterns * 2; i++) Index: gcc/config/aarch64/aarch64-sve.md =================================================================== --- gcc/config/aarch64/aarch64-sve.md 2019-11-16 10:55:37.212088720 +0000 +++ gcc/config/aarch64/aarch64-sve.md 2019-11-16 10:59:20.726514488 +0000 @@ -546,8 +546,8 @@ ;; ------------------------------------------------------------------------- (define_expand "mov<mode>" - [(set (match_operand:SVE_FULL 0 "nonimmediate_operand") - (match_operand:SVE_FULL 1 "general_operand"))] + [(set (match_operand:SVE_ALL 0 "nonimmediate_operand") + (match_operand:SVE_ALL 1 "general_operand"))] "TARGET_SVE" { /* Use the predicated load and store patterns where possible. @@ -576,8 +576,8 @@ (define_expand "mov<mode>" ) (define_expand "movmisalign<mode>" - [(set (match_operand:SVE_FULL 0 "nonimmediate_operand") - (match_operand:SVE_FULL 1 "general_operand"))] + [(set (match_operand:SVE_ALL 0 "nonimmediate_operand") + (match_operand:SVE_ALL 1 "general_operand"))] "TARGET_SVE" { /* Equivalent to a normal move for our purpooses. */ @@ -586,10 +586,11 @@ (define_expand "movmisalign<mode>" } ) -;; Unpredicated moves (bytes or little-endian). Only allow memory operations -;; during and after RA; before RA we want the predicated load and store -;; patterns to be used instead. -(define_insn "*aarch64_sve_mov<mode>_le" +;; Unpredicated moves that can use LDR and STR, i.e. full vectors for which +;; little-endian ordering is acceptable. Only allow memory operations during +;; and after RA; before RA we want the predicated load and store patterns to +;; be used instead. 
+(define_insn "*aarch64_sve_mov<mode>_ldr_str" [(set (match_operand:SVE_FULL 0 "aarch64_sve_nonimmediate_operand" "=w, Utr, w, w") (match_operand:SVE_FULL 1 "aarch64_sve_general_operand" "Utr, w, w, Dn"))] "TARGET_SVE @@ -604,35 +605,37 @@ (define_insn "*aarch64_sve_mov<mode>_le" * return aarch64_output_sve_mov_immediate (operands[1]);" ) -;; Unpredicated moves (non-byte big-endian). Memory accesses require secondary -;; reloads. -(define_insn "*aarch64_sve_mov<mode>_be" - [(set (match_operand:SVE_FULL 0 "register_operand" "=w, w") - (match_operand:SVE_FULL 1 "aarch64_nonmemory_operand" "w, Dn"))] - "TARGET_SVE && BYTES_BIG_ENDIAN && <MODE>mode != VNx16QImode" +;; Unpredicated moves that cannot use LDR and STR, i.e. partial vectors +;; or vectors for which little-endian ordering isn't acceptable. Memory +;; accesses require secondary reloads. +(define_insn "*aarch64_sve_mov<mode>_no_ldr_str" + [(set (match_operand:SVE_ALL 0 "register_operand" "=w, w") + (match_operand:SVE_ALL 1 "aarch64_nonmemory_operand" "w, Dn"))] + "TARGET_SVE + && <MODE>mode != VNx16QImode + && (BYTES_BIG_ENDIAN + || maybe_ne (BYTES_PER_SVE_VECTOR, GET_MODE_SIZE (<MODE>mode)))" "@ mov\t%0.d, %1.d * return aarch64_output_sve_mov_immediate (operands[1]);" ) -;; Handle big-endian memory reloads. We use byte PTRUE for all modes -;; to try to encourage reuse. -;; This pattern needs constraints due to TARGET_SECONDARY_RELOAD hook. -(define_expand "aarch64_sve_reload_be" +;; Handle memory reloads for modes that can't use LDR and STR. We use +;; byte PTRUE for all modes to try to encourage reuse. This pattern +;; needs constraints because it is returned by TARGET_SECONDARY_RELOAD. +(define_expand "aarch64_sve_reload_mem" [(parallel [(set (match_operand 0) (match_operand 1)) (clobber (match_operand:VNx16BI 2 "register_operand" "=Upl"))])] - "TARGET_SVE && BYTES_BIG_ENDIAN" + "TARGET_SVE" { /* Create a PTRUE. */ emit_move_insn (operands[2], CONSTM1_RTX (VNx16BImode)); /* Refer to the PTRUE in the appropriate mode for this move. */ machine_mode mode = GET_MODE (operands[0]); - machine_mode pred_mode - = aarch64_sve_pred_mode (GET_MODE_UNIT_SIZE (mode)).require (); - rtx pred = gen_lowpart (pred_mode, operands[2]); + rtx pred = gen_lowpart (aarch64_sve_pred_mode (mode), operands[2]); /* Emit a predicated load or store. */ aarch64_emit_sve_pred_move (operands[0], pred, operands[1]); @@ -644,18 +647,18 @@ (define_expand "aarch64_sve_reload_be" ;; Note that this pattern is generated directly by aarch64_emit_sve_pred_move, ;; so changes to this pattern will need changes there as well. (define_insn_and_split "@aarch64_pred_mov<mode>" - [(set (match_operand:SVE_FULL 0 "nonimmediate_operand" "=w, w, m") - (unspec:SVE_FULL + [(set (match_operand:SVE_ALL 0 "nonimmediate_operand" "=w, w, m") + (unspec:SVE_ALL [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl") - (match_operand:SVE_FULL 2 "nonimmediate_operand" "w, m, w")] + (match_operand:SVE_ALL 2 "nonimmediate_operand" "w, m, w")] UNSPEC_PRED_X))] "TARGET_SVE && (register_operand (operands[0], <MODE>mode) || register_operand (operands[2], <MODE>mode))" "@ # - ld1<Vesize>\t%0.<Vetype>, %1/z, %2 - st1<Vesize>\t%2.<Vetype>, %1, %0" + ld1<Vesize>\t%0.<Vctype>, %1/z, %2 + st1<Vesize>\t%2.<Vctype>, %1, %0" "&& register_operand (operands[0], <MODE>mode) && register_operand (operands[2], <MODE>mode)" [(set (match_dup 0) (match_dup 2))] @@ -666,8 +669,8 @@ (define_insn_and_split "@aarch64_pred_mo ;; for details. 
We use a special predicate for operand 2 to reduce ;; the number of patterns. (define_insn_and_split "*aarch64_sve_mov<mode>_subreg_be" - [(set (match_operand:SVE_FULL 0 "aarch64_sve_nonimmediate_operand" "=w") - (unspec:SVE_FULL + [(set (match_operand:SVE_ALL 0 "aarch64_sve_nonimmediate_operand" "=w") + (unspec:SVE_ALL [(match_operand:VNx16BI 1 "register_operand" "Upl") (match_operand 2 "aarch64_any_register_operand" "w")] UNSPEC_REV_SUBREG))] @@ -685,8 +688,8 @@ (define_insn_and_split "*aarch64_sve_mov ;; This is equivalent to a subreg on little-endian targets but not for ;; big-endian; see the comment at the head of the file for details. (define_expand "@aarch64_sve_reinterpret<mode>" - [(set (match_operand:SVE_FULL 0 "register_operand") - (unspec:SVE_FULL + [(set (match_operand:SVE_ALL 0 "register_operand") + (unspec:SVE_ALL [(match_operand 1 "aarch64_any_register_operand")] UNSPEC_REINTERPRET))] "TARGET_SVE" @@ -702,8 +705,8 @@ (define_expand "@aarch64_sve_reinterpret ;; A pattern for handling type punning on big-endian targets. We use a ;; special predicate for operand 1 to reduce the number of patterns. (define_insn_and_split "*aarch64_sve_reinterpret<mode>" - [(set (match_operand:SVE_FULL 0 "register_operand" "=w") - (unspec:SVE_FULL + [(set (match_operand:SVE_ALL 0 "register_operand" "=w") + (unspec:SVE_ALL [(match_operand 1 "aarch64_any_register_operand" "w")] UNSPEC_REINTERPRET))] "TARGET_SVE" @@ -1141,13 +1144,13 @@ (define_insn "aarch64_update_ffrt" ;; Predicated LD1. (define_insn "maskload<mode><vpred>" - [(set (match_operand:SVE_FULL 0 "register_operand" "=w") - (unspec:SVE_FULL + [(set (match_operand:SVE_ALL 0 "register_operand" "=w") + (unspec:SVE_ALL [(match_operand:<VPRED> 2 "register_operand" "Upl") - (match_operand:SVE_FULL 1 "memory_operand" "m")] + (match_operand:SVE_ALL 1 "memory_operand" "m")] UNSPEC_LD1_SVE))] "TARGET_SVE" - "ld1<Vesize>\t%0.<Vetype>, %2/z, %1" + "ld1<Vesize>\t%0.<Vctype>, %2/z, %1" ) ;; Unpredicated LD[234]. @@ -1940,14 +1943,14 @@ (define_insn "*aarch64_sve_gather_prefet ;; Predicated ST1. (define_insn "maskstore<mode><vpred>" - [(set (match_operand:SVE_FULL 0 "memory_operand" "+m") - (unspec:SVE_FULL + [(set (match_operand:SVE_ALL 0 "memory_operand" "+m") + (unspec:SVE_ALL [(match_operand:<VPRED> 2 "register_operand" "Upl") - (match_operand:SVE_FULL 1 "register_operand" "w") + (match_operand:SVE_ALL 1 "register_operand" "w") (match_dup 0)] UNSPEC_ST1_SVE))] "TARGET_SVE" - "st1<Vesize>\t%1.<Vetype>, %2, %0" + "st1<Vesize>\t%1.<Vctype>, %2, %0" ) ;; Unpredicated ST[234]. This is always a full update, so the dependence @@ -2283,8 +2286,8 @@ (define_insn "*aarch64_scatter_store_tru (define_expand "vec_duplicate<mode>" [(parallel - [(set (match_operand:SVE_FULL 0 "register_operand") - (vec_duplicate:SVE_FULL + [(set (match_operand:SVE_ALL 0 "register_operand") + (vec_duplicate:SVE_ALL (match_operand:<VEL> 1 "aarch64_sve_dup_operand"))) (clobber (scratch:VNx16BI))])] "TARGET_SVE" @@ -2304,8 +2307,8 @@ (define_expand "vec_duplicate<mode>" ;; the load at the first opportunity in order to allow the PTRUE to be ;; optimized with surrounding code. 
(define_insn_and_split "*vec_duplicate<mode>_reg" - [(set (match_operand:SVE_FULL 0 "register_operand" "=w, w, w") - (vec_duplicate:SVE_FULL + [(set (match_operand:SVE_ALL 0 "register_operand" "=w, w, w") + (vec_duplicate:SVE_ALL (match_operand:<VEL> 1 "aarch64_sve_dup_operand" "r, w, Uty"))) (clobber (match_scratch:VNx16BI 2 "=X, X, Upl"))] "TARGET_SVE" @@ -2364,12 +2367,12 @@ (define_insn "@aarch64_vec_duplicate_vq< ;; be used by combine to optimize selects of a a vec_duplicate<mode> ;; with zero. (define_insn "sve_ld1r<mode>" - [(set (match_operand:SVE_FULL 0 "register_operand" "=w") - (unspec:SVE_FULL + [(set (match_operand:SVE_ALL 0 "register_operand" "=w") + (unspec:SVE_ALL [(match_operand:<VPRED> 1 "register_operand" "Upl") - (vec_duplicate:SVE_FULL + (vec_duplicate:SVE_ALL (match_operand:<VEL> 2 "aarch64_sve_ld1r_operand" "Uty")) - (match_operand:SVE_FULL 3 "aarch64_simd_imm_zero")] + (match_operand:SVE_ALL 3 "aarch64_simd_imm_zero")] UNSPEC_SEL))] "TARGET_SVE" "ld1r<Vesize>\t%0.<Vetype>, %1/z, %2" @@ -2431,29 +2434,29 @@ (define_insn "vec_shl_insert_<mode>" ;; ------------------------------------------------------------------------- (define_insn "vec_series<mode>" - [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, w, w") - (vec_series:SVE_FULL_I + [(set (match_operand:SVE_I 0 "register_operand" "=w, w, w") + (vec_series:SVE_I (match_operand:<VEL> 1 "aarch64_sve_index_operand" "Usi, r, r") (match_operand:<VEL> 2 "aarch64_sve_index_operand" "r, Usi, r")))] "TARGET_SVE" "@ - index\t%0.<Vetype>, #%1, %<vw>2 - index\t%0.<Vetype>, %<vw>1, #%2 - index\t%0.<Vetype>, %<vw>1, %<vw>2" + index\t%0.<Vctype>, #%1, %<vwcore>2 + index\t%0.<Vctype>, %<vwcore>1, #%2 + index\t%0.<Vctype>, %<vwcore>1, %<vwcore>2" ) ;; Optimize {x, x, x, x, ...} + {0, n, 2*n, 3*n, ...} if n is in range ;; of an INDEX instruction. 
(define_insn "*vec_series<mode>_plus" - [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w") - (plus:SVE_FULL_I - (vec_duplicate:SVE_FULL_I + [(set (match_operand:SVE_I 0 "register_operand" "=w") + (plus:SVE_I + (vec_duplicate:SVE_I (match_operand:<VEL> 1 "register_operand" "r")) - (match_operand:SVE_FULL_I 2 "immediate_operand")))] + (match_operand:SVE_I 2 "immediate_operand")))] "TARGET_SVE && aarch64_check_zero_based_sve_index_immediate (operands[2])" { operands[2] = aarch64_check_zero_based_sve_index_immediate (operands[2]); - return "index\t%0.<Vetype>, %<vw>1, #%2"; + return "index\t%0.<Vctype>, %<vwcore>1, #%2"; } ) @@ -2821,7 +2824,7 @@ (define_insn "@cond_<optab><mode>" (define_insn "@aarch64_pred_sxt<SVE_FULL_HSDI:mode><SVE_PARTIAL_I:mode>" [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w") (unspec:SVE_FULL_HSDI - [(match_operand:<VPRED> 1 "register_operand" "Upl") + [(match_operand:<SVE_FULL_HSDI:VPRED> 1 "register_operand" "Upl") (sign_extend:SVE_FULL_HSDI (truncate:SVE_PARTIAL_I (match_operand:SVE_FULL_HSDI 2 "register_operand" "w")))] @@ -2834,7 +2837,7 @@ (define_insn "@aarch64_pred_sxt<SVE_FULL (define_insn "@aarch64_cond_sxt<SVE_FULL_HSDI:mode><SVE_PARTIAL_I:mode>" [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w, ?&w, ?&w") (unspec:SVE_FULL_HSDI - [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl") + [(match_operand:<SVE_FULL_HSDI:VPRED> 1 "register_operand" "Upl, Upl, Upl") (sign_extend:SVE_FULL_HSDI (truncate:SVE_PARTIAL_I (match_operand:SVE_FULL_HSDI 2 "register_operand" "w, w, w"))) @@ -3386,10 +3389,10 @@ (define_insn_and_rewrite "*cond_<optab>< ;; ------------------------------------------------------------------------- (define_insn "add<mode>3" - [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, w, w, ?w, ?w, w") - (plus:SVE_FULL_I - (match_operand:SVE_FULL_I 1 "register_operand" "%0, 0, 0, w, w, w") - (match_operand:SVE_FULL_I 2 "aarch64_sve_add_operand" "vsa, vsn, vsi, vsa, vsn, w")))] + [(set (match_operand:SVE_I 0 "register_operand" "=w, w, w, ?w, ?w, w") + (plus:SVE_I + (match_operand:SVE_I 1 "register_operand" "%0, 0, 0, w, w, w") + (match_operand:SVE_I 2 "aarch64_sve_add_operand" "vsa, vsn, vsi, vsa, vsn, w")))] "TARGET_SVE" "@ add\t%0.<Vetype>, %0.<Vetype>, #%D2 Index: gcc/config/aarch64/iterators.md =================================================================== --- gcc/config/aarch64/iterators.md 2019-11-16 10:55:37.212088720 +0000 +++ gcc/config/aarch64/iterators.md 2019-11-16 10:59:20.730514460 +0000 @@ -344,6 +344,21 @@ (define_mode_iterator SVE_PARTIAL_I [VNx VNx4HI VNx2HI VNx2SI]) +;; All SVE vector modes. +(define_mode_iterator SVE_ALL [VNx16QI VNx8QI VNx4QI VNx2QI + VNx8HI VNx4HI VNx2HI + VNx8HF VNx4HF VNx2HF + VNx4SI VNx2SI + VNx4SF VNx2SF + VNx2DI + VNx2DF]) + +;; All SVE integer vector modes. +(define_mode_iterator SVE_I [VNx16QI VNx8QI VNx4QI VNx2QI + VNx8HI VNx4HI VNx2HI + VNx4SI VNx2SI + VNx2DI]) + ;; Modes involved in extending or truncating SVE data, for 8 elements per ;; 128-bit block. (define_mode_iterator VNx8_NARROW [VNx8QI]) @@ -776,28 +791,37 @@ (define_mode_attr Vmntype [(V8HI ".8b") (HI "")]) ;; Mode-to-individual element type mapping. 
-(define_mode_attr Vetype [(V8QI "b") (V16QI "b") (VNx16QI "b") (VNx16BI "b") - (V4HI "h") (V8HI "h") (VNx8HI "h") (VNx8BI "h") - (V2SI "s") (V4SI "s") (VNx4SI "s") (VNx4BI "s") - (V2DI "d") (VNx2DI "d") (VNx2BI "d") - (V4HF "h") (V8HF "h") (VNx8HF "h") - (V2SF "s") (V4SF "s") (VNx4SF "s") - (V2DF "d") (VNx2DF "d") - (HF "h") - (SF "s") (DF "d") - (QI "b") (HI "h") - (SI "s") (DI "d")]) +(define_mode_attr Vetype [(V8QI "b") (V16QI "b") + (V4HI "h") (V8HI "h") + (V2SI "s") (V4SI "s") + (V2DI "d") + (V4HF "h") (V8HF "h") + (V2SF "s") (V4SF "s") + (V2DF "d") + (VNx16BI "b") (VNx8BI "h") (VNx4BI "s") (VNx2BI "d") + (VNx16QI "b") (VNx8QI "b") (VNx4QI "b") (VNx2QI "b") + (VNx8HI "h") (VNx4HI "h") (VNx2HI "h") + (VNx8HF "h") (VNx4HF "h") (VNx2HF "h") + (VNx4SI "s") (VNx2SI "s") + (VNx4SF "s") (VNx2SF "s") + (VNx2DI "d") + (VNx2DF "d") + (HF "h") + (SF "s") (DF "d") + (QI "b") (HI "h") + (SI "s") (DI "d")]) ;; Like Vetype, but map to types that are a quarter of the element size. (define_mode_attr Vetype_fourth [(VNx4SI "b") (VNx2DI "h")]) ;; Equivalent of "size" for a vector element. -(define_mode_attr Vesize [(VNx16QI "b") (VNx8QI "b") - (VNx4QI "b") (VNx2QI "b") - (VNx8HI "h") (VNx4HI "h") - (VNx2HI "h") (VNx8HF "h") - (VNx4SI "w") (VNx2SI "w") (VNx4SF "w") - (VNx2DI "d") (VNx2DF "d") +(define_mode_attr Vesize [(VNx16QI "b") (VNx8QI "b") (VNx4QI "b") (VNx2QI "b") + (VNx8HI "h") (VNx4HI "h") (VNx2HI "h") + (VNx8HF "h") (VNx4HF "h") (VNx2HF "h") + (VNx4SI "w") (VNx2SI "w") + (VNx4SF "w") (VNx2SF "w") + (VNx2DI "d") + (VNx2DF "d") (VNx32QI "b") (VNx48QI "b") (VNx64QI "b") (VNx16HI "h") (VNx24HI "h") (VNx32HI "h") (VNx16HF "h") (VNx24HF "h") (VNx32HF "h") @@ -806,6 +830,16 @@ (define_mode_attr Vesize [(VNx16QI "b") (VNx4DI "d") (VNx6DI "d") (VNx8DI "d") (VNx4DF "d") (VNx6DF "d") (VNx8DF "d")]) +;; The Z register suffix for an SVE mode's element container, i.e. the +;; Vetype of full SVE modes that have the same number of elements. +(define_mode_attr Vctype [(VNx16QI "b") (VNx8QI "h") (VNx4QI "s") (VNx2QI "d") + (VNx8HI "h") (VNx4HI "s") (VNx2HI "d") + (VNx8HF "h") (VNx4HF "s") (VNx2HF "d") + (VNx4SI "s") (VNx2SI "d") + (VNx4SF "s") (VNx2SF "d") + (VNx2DI "d") + (VNx2DF "d")]) + ;; Vetype is used everywhere in scheduling type and assembly output, ;; sometimes they are not the same, for example HF modes on some ;; instructions. stype is defined to represent scheduling type @@ -827,26 +861,40 @@ (define_mode_attr Vbtype [(V8QI "8b") ( (SI "8b") (SF "8b")]) ;; Define element mode for each vector mode. -(define_mode_attr VEL [(V8QI "QI") (V16QI "QI") (VNx16QI "QI") - (V4HI "HI") (V8HI "HI") (VNx8HI "HI") - (V2SI "SI") (V4SI "SI") (VNx4SI "SI") - (DI "DI") (V2DI "DI") (VNx2DI "DI") - (V4HF "HF") (V8HF "HF") (VNx8HF "HF") - (V2SF "SF") (V4SF "SF") (VNx4SF "SF") - (DF "DF") (V2DF "DF") (VNx2DF "DF") - (SI "SI") (HI "HI") - (QI "QI")]) +(define_mode_attr VEL [(V8QI "QI") (V16QI "QI") + (V4HI "HI") (V8HI "HI") + (V2SI "SI") (V4SI "SI") + (DI "DI") (V2DI "DI") + (V4HF "HF") (V8HF "HF") + (V2SF "SF") (V4SF "SF") + (DF "DF") (V2DF "DF") + (SI "SI") (HI "HI") + (QI "QI") + (VNx16QI "QI") (VNx8QI "QI") (VNx4QI "QI") (VNx2QI "QI") + (VNx8HI "HI") (VNx4HI "HI") (VNx2HI "HI") + (VNx8HF "HF") (VNx4HF "HF") (VNx2HF "HF") + (VNx4SI "SI") (VNx2SI "SI") + (VNx4SF "SF") (VNx2SF "SF") + (VNx2DI "DI") + (VNx2DF "DF")]) ;; Define element mode for each vector mode (lower case). 
-(define_mode_attr Vel [(V8QI "qi") (V16QI "qi") (VNx16QI "qi") - (V4HI "hi") (V8HI "hi") (VNx8HI "hi") - (V2SI "si") (V4SI "si") (VNx4SI "si") - (DI "di") (V2DI "di") (VNx2DI "di") - (V4HF "hf") (V8HF "hf") (VNx8HF "hf") - (V2SF "sf") (V4SF "sf") (VNx4SF "sf") - (V2DF "df") (DF "df") (VNx2DF "df") - (SI "si") (HI "hi") - (QI "qi")]) +(define_mode_attr Vel [(V8QI "qi") (V16QI "qi") + (V4HI "hi") (V8HI "hi") + (V2SI "si") (V4SI "si") + (DI "di") (V2DI "di") + (V4HF "hf") (V8HF "hf") + (V2SF "sf") (V4SF "sf") + (V2DF "df") (DF "df") + (SI "si") (HI "hi") + (QI "qi") + (VNx16QI "qi") (VNx8QI "qi") (VNx4QI "qi") (VNx2QI "qi") + (VNx8HI "hi") (VNx4HI "hi") (VNx2HI "hi") + (VNx8HF "hf") (VNx4HF "hf") (VNx2HF "hf") + (VNx4SI "si") (VNx2SI "si") + (VNx4SF "sf") (VNx2SF "sf") + (VNx2DI "di") + (VNx2DF "df")]) ;; Element mode with floating-point values replaced by like-sized integers. (define_mode_attr VEL_INT [(VNx16QI "QI") @@ -994,23 +1042,29 @@ (define_mode_attr Vhalftype [(V16QI "8b" (V4SF "2s")]) ;; Define corresponding core/FP element mode for each vector mode. -(define_mode_attr vw [(V8QI "w") (V16QI "w") (VNx16QI "w") - (V4HI "w") (V8HI "w") (VNx8HI "w") - (V2SI "w") (V4SI "w") (VNx4SI "w") - (DI "x") (V2DI "x") (VNx2DI "x") - (VNx8HF "h") - (V2SF "s") (V4SF "s") (VNx4SF "s") - (V2DF "d") (VNx2DF "d")]) +(define_mode_attr vw [(V8QI "w") (V16QI "w") + (V4HI "w") (V8HI "w") + (V2SI "w") (V4SI "w") + (DI "x") (V2DI "x") + (V2SF "s") (V4SF "s") + (V2DF "d")]) ;; Corresponding core element mode for each vector mode. This is a ;; variation on <vw> mapping FP modes to GP regs. -(define_mode_attr vwcore [(V8QI "w") (V16QI "w") (VNx16QI "w") - (V4HI "w") (V8HI "w") (VNx8HI "w") - (V2SI "w") (V4SI "w") (VNx4SI "w") - (DI "x") (V2DI "x") (VNx2DI "x") - (V4HF "w") (V8HF "w") (VNx8HF "w") - (V2SF "w") (V4SF "w") (VNx4SF "w") - (V2DF "x") (VNx2DF "x")]) +(define_mode_attr vwcore [(V8QI "w") (V16QI "w") + (V4HI "w") (V8HI "w") + (V2SI "w") (V4SI "w") + (DI "x") (V2DI "x") + (V4HF "w") (V8HF "w") + (V2SF "w") (V4SF "w") + (V2DF "x") + (VNx16QI "w") (VNx8QI "w") (VNx4QI "w") (VNx2QI "w") + (VNx8HI "w") (VNx4HI "w") (VNx2HI "w") + (VNx8HF "w") (VNx4HF "w") (VNx2HF "w") + (VNx4SI "w") (VNx2SI "w") + (VNx4SF "w") (VNx2SF "w") + (VNx2DI "x") + (VNx2DF "x")]) ;; Double vector types for ALLX. (define_mode_attr Vallxd [(QI "8b") (HI "4h") (SI "2s")]) @@ -1248,10 +1302,14 @@ (define_mode_attr vsingle [(VNx32QI "vnx ;; The predicate mode associated with an SVE data mode. For structure modes ;; this is equivalent to the <VPRED> of the subvector mode. -(define_mode_attr VPRED [(VNx16QI "VNx16BI") - (VNx8HI "VNx8BI") (VNx8HF "VNx8BI") - (VNx4SI "VNx4BI") (VNx4SF "VNx4BI") - (VNx2DI "VNx2BI") (VNx2DF "VNx2BI") +(define_mode_attr VPRED [(VNx16QI "VNx16BI") (VNx8QI "VNx8BI") + (VNx4QI "VNx4BI") (VNx2QI "VNx2BI") + (VNx8HI "VNx8BI") (VNx4HI "VNx4BI") (VNx2HI "VNx2BI") + (VNx8HF "VNx8BI") (VNx4HF "VNx4BI") (VNx2HF "VNx2BI") + (VNx4SI "VNx4BI") (VNx2SI "VNx2BI") + (VNx4SF "VNx4BI") (VNx2SF "VNx2BI") + (VNx2DI "VNx2BI") + (VNx2DF "VNx2BI") (VNx32QI "VNx16BI") (VNx16HI "VNx8BI") (VNx16HF "VNx8BI") (VNx8SI "VNx4BI") (VNx8SF "VNx4BI") @@ -1266,10 +1324,14 @@ (define_mode_attr VPRED [(VNx16QI "VNx16 (VNx8DI "VNx2BI") (VNx8DF "VNx2BI")]) ;; ...and again in lower case. 
-(define_mode_attr vpred [(VNx16QI "vnx16bi") - (VNx8HI "vnx8bi") (VNx8HF "vnx8bi") - (VNx4SI "vnx4bi") (VNx4SF "vnx4bi") - (VNx2DI "vnx2bi") (VNx2DF "vnx2bi") +(define_mode_attr vpred [(VNx16QI "vnx16bi") (VNx8QI "vnx8bi") + (VNx4QI "vnx4bi") (VNx2QI "vnx2bi") + (VNx8HI "vnx8bi") (VNx4HI "vnx4bi") (VNx2HI "vnx2bi") + (VNx8HF "vnx8bi") (VNx4HF "vnx4bi") (VNx2HF "vnx2bi") + (VNx4SI "vnx4bi") (VNx2SI "vnx2bi") + (VNx4SF "vnx4bi") (VNx2SF "vnx2bi") + (VNx2DI "vnx2bi") + (VNx2DF "vnx2bi") (VNx32QI "vnx16bi") (VNx16HI "vnx8bi") (VNx16HF "vnx8bi") (VNx8SI "vnx4bi") (VNx8SF "vnx4bi") Index: gcc/testsuite/gcc.target/aarch64/sve/mixed_size_1.c =================================================================== --- /dev/null 2019-09-17 11:41:18.176664108 +0100 +++ gcc/testsuite/gcc.target/aarch64/sve/mixed_size_1.c 2019-11-16 10:59:20.730514460 +0000 @@ -0,0 +1,39 @@ +/* { dg-options "-O2 -ftree-vectorize -fno-tree-loop-distribute-patterns" } */ + +#include <stdint.h> + +#define TEST_LOOP(TYPE1, TYPE2) \ + void \ + f_##TYPE1##_##TYPE2 (TYPE1 *restrict dst1, TYPE1 *restrict src1, \ + TYPE2 *restrict dst2, TYPE2 *restrict src2, \ + int n) \ + { \ + for (int i = 0; i < n; ++i) \ + { \ + dst1[i] += src1[i]; \ + dst2[i] = src2[i]; \ + } \ + } + +#define TEST_ALL(T) \ + T (uint16_t, uint8_t) \ + T (uint32_t, uint16_t) \ + T (uint32_t, _Float16) \ + T (uint64_t, uint32_t) \ + T (uint64_t, float) + +TEST_ALL (TEST_LOOP) + +/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.h,} 1 } } */ +/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.h,} 1 } } */ +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s,} 2 } } */ +/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.s,} 2 } } */ +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d,} 2 } } */ +/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.d,} 2 } } */ + +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h,} 2 } } */ +/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h,} 1 } } */ +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s,} 4 } } */ +/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s,} 2 } } */ +/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d,} 4 } } */ +/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d,} 2 } } */ Index: gcc/testsuite/gcc.target/aarch64/sve/mixed_size_2.c =================================================================== --- /dev/null 2019-09-17 11:41:18.176664108 +0100 +++ gcc/testsuite/gcc.target/aarch64/sve/mixed_size_2.c 2019-11-16 10:59:20.730514460 +0000 @@ -0,0 +1,41 @@ +/* { dg-options "-O2 -ftree-vectorize -fno-tree-loop-distribute-patterns" } */ + +#include <stdint.h> + +#define TEST_LOOP(TYPE1, TYPE2) \ + void \ + f_##TYPE1##_##TYPE2 (TYPE1 *restrict dst1, TYPE1 *restrict src1, \ + TYPE2 *restrict dst2, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + { \ + dst1[i] += src1[i]; \ + dst2[i] = 1; \ + } \ + } + +#define TEST_ALL(T) \ + T (uint16_t, uint8_t) \ + T (uint32_t, uint16_t) \ + T (uint32_t, _Float16) \ + T (uint64_t, uint32_t) \ + T (uint64_t, float) + +TEST_ALL (TEST_LOOP) + +/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.b, #1\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, #1\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, #1\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #1\.0} 1 } } */ +/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #1\.0} 1 } } */ + +/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.h,} 1 } } */ +/* { dg-final { scan-assembler-times 
{\tst1h\tz[0-9]+\.s,} 2 } } */ +/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.d,} 2 } } */ + +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h,} 2 } } */ +/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h,} 1 } } */ +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s,} 4 } } */ +/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s,} 2 } } */ +/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d,} 4 } } */ +/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d,} 2 } } */ Index: gcc/testsuite/gcc.target/aarch64/sve/mixed_size_3.c =================================================================== --- /dev/null 2019-09-17 11:41:18.176664108 +0100 +++ gcc/testsuite/gcc.target/aarch64/sve/mixed_size_3.c 2019-11-16 10:59:20.730514460 +0000 @@ -0,0 +1,41 @@ +/* { dg-options "-O2 -ftree-vectorize -fno-tree-loop-distribute-patterns" } */ + +#include <stdint.h> + +#define TEST_LOOP(TYPE1, TYPE2) \ + void \ + f_##TYPE1##_##TYPE2 (TYPE1 *restrict dst1, TYPE1 *restrict src1, \ + TYPE2 *restrict dst2, TYPE2 src2, int n) \ + { \ + for (int i = 0; i < n; ++i) \ + { \ + dst1[i] += src1[i]; \ + dst2[i] = src2; \ + } \ + } + +#define TEST_ALL(T) \ + T (uint16_t, uint8_t) \ + T (uint32_t, uint16_t) \ + T (uint32_t, _Float16) \ + T (uint64_t, uint32_t) \ + T (uint64_t, float) + +TEST_ALL (TEST_LOOP) + +/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.b, w3\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, w3\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, w3\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, h0\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, s0\n} 1 } } */ + +/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.h,} 1 } } */ +/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.s,} 2 } } */ +/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.d,} 2 } } */ + +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h,} 2 } } */ +/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h,} 1 } } */ +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s,} 4 } } */ +/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s,} 2 } } */ +/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d,} 4 } } */ +/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d,} 2 } } */ Index: gcc/testsuite/gcc.target/aarch64/sve/mixed_size_4.c =================================================================== --- /dev/null 2019-09-17 11:41:18.176664108 +0100 +++ gcc/testsuite/gcc.target/aarch64/sve/mixed_size_4.c 2019-11-16 10:59:20.730514460 +0000 @@ -0,0 +1,43 @@ +/* { dg-options "-O2 -ftree-vectorize -fno-tree-loop-distribute-patterns" } */ + +#include <stdint.h> + +#define TEST_LOOP(TYPE1, TYPE2) \ + void \ + f_##TYPE1##_##TYPE2 (TYPE1 *restrict dst1, TYPE1 *restrict src1, \ + TYPE2 *restrict dst2, TYPE2 n) \ + { \ + for (TYPE2 i = 0; i < n; ++i) \ + { \ + dst1[i] += src1[i]; \ + dst2[i] = i; \ + } \ + } + +#define TEST_ALL(T) \ + T (uint16_t, uint8_t) \ + T (uint32_t, uint16_t) \ + T (uint64_t, uint32_t) + +TEST_ALL (TEST_LOOP) + +/* { dg-final { scan-assembler-not {\tindex\tz[0-9]+\.b,} } } */ +/* { dg-final { scan-assembler-times {\tindex\tz[0-9]+\.h, #0, #1\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tindex\tz[0-9]+\.s, #0, #1\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tindex\tz[0-9]+\.d, #0, #1\n} 1 } } */ + +/* { dg-final { scan-assembler-not {\tcntb\t} } } */ +/* { dg-final { scan-assembler-times {\tcnth\t} 1 } } */ +/* { dg-final { scan-assembler-times 
{\tcntw\t} 1 } } */ +/* { dg-final { scan-assembler-times {\tcntd\t} 1 } } */ + +/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.h,} 1 } } */ +/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.s,} 1 } } */ +/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.d,} 1 } } */ + +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h,} 2 } } */ +/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h,} 1 } } */ +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s,} 2 } } */ +/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s,} 1 } } */ +/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d,} 2 } } */ +/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d,} 1 } } */ Index: gcc/testsuite/gcc.target/aarch64/sve/mixed_size_5.c =================================================================== --- /dev/null 2019-09-17 11:41:18.176664108 +0100 +++ gcc/testsuite/gcc.target/aarch64/sve/mixed_size_5.c 2019-11-16 10:59:20.730514460 +0000 @@ -0,0 +1,42 @@ +/* { dg-options "-O2 -ftree-vectorize -fno-tree-loop-distribute-patterns -msve-vector-bits=512" } */ + +#include <stdint.h> + +#define TEST_LOOP(TYPE1, TYPE2) \ + void \ + f_##TYPE1##_##TYPE2 (TYPE1 *restrict dst1, TYPE1 *restrict src1, \ + TYPE2 *restrict dst2, TYPE2 *restrict src2, \ + int n) \ + { \ + for (int i = 0; i < n; ++i) \ + { \ + dst1[i * 2] = src1[i * 2] + 1; \ + dst1[i * 2 + 1] = src1[i * 2 + 1] + 1; \ + dst2[i * 2] = 2; \ + dst2[i * 2 + 1] = 3; \ + } \ + } + +#define TEST_ALL(T) \ + T (uint16_t, uint8_t) \ + T (uint32_t, uint16_t) \ + T (uint32_t, _Float16) \ + T (uint64_t, uint32_t) \ + T (uint64_t, float) + +TEST_ALL (TEST_LOOP) + +/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s,} 1 } } */ +/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d,} 2 } } */ +/* { dg-final { scan-assembler-times {\tld1rqw\tz[0-9]+\.s,} 2 } } */ + +/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.h,} 1 } } */ +/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.s,} 2 } } */ +/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.d,} 2 } } */ + +/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h,} 1 } } */ +/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h,} 1 } } */ +/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s,} 2 } } */ +/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s,} 2 } } */ +/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d,} 2 } } */ +/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d,} 2 } } */