https://gcc.gnu.org/g:3027010d8bcc854eb43425cb1da573ff7345a5ac
commit r16-5420-g3027010d8bcc854eb43425cb1da573ff7345a5ac
Author: Tamar Christina <[email protected]>
Date:   Wed Nov 19 14:27:55 2025 +0000

    AArch64: expand extractions of Adv.SIMD registers from SVE as separate insn.

    For this example using the Adv.SIMD/SVE bridge:

        #include <arm_neon.h>
        #include <arm_neon_sve_bridge.h>
        #include <stdint.h>

        svint16_t sub_neon_i16_sve_bridged(svint8_t a, svint8_t b) {
            return svset_neonq_s16(svundef_s16(),
                        vsubq_s16(vmovl_high_s8(svget_neonq(a)),
                                  vmovl_high_s8(svget_neonq(b))));
        }

    we generate:

        sub_neon_i16_sve_bridged(__SVInt8_t, __SVInt8_t):
                sxtl2   v0.8h, v0.16b
                ssubw2  v0.8h, v0.8h, v1.16b
                ret

    instead of just:

        sub_neon_i16_sve_bridged(__SVInt8_t, __SVInt8_t):
                ssubl2  v0.8h, v0.16b, v1.16b
                ret

    Commit g:abf865732a7313cf79ffa325faed3467ed28d8b8 added a framework to
    fold uses of intrinsics combined with lo/hi extractions into the
    appropriate low- or high-part instructions.  However this does not
    trigger here because the Adv.SIMD-from-SVE extraction in
    vmovl_high_s8(svget_neonq(a)) does not have a constant argument, and the
    framework only supports folding 2 insns into 1, not 3.

    The above generates the following RTL:

        (insn 7 4 8 2 (set (reg:V8QI 103 [ _6 ])
                (vec_select:V8QI (subreg:V16QI (reg/v:VNx16QI 109 [ a ]) 0)
                    (parallel:V16QI [
                            (const_int 8 [0x8])
                            (const_int 9 [0x9])
                            (const_int 10 [0xa])
                            (const_int 11 [0xb])
                            (const_int 12 [0xc])
                            (const_int 13 [0xd])
                            (const_int 14 [0xe])
                            (const_int 15 [0xf])
                        ]))) "":3174:43 -1
             (nil))

    Since the SVE and Adv.SIMD modes are tieable this is a valid instruction
    to form, but it is suboptimal in that we cannot fold it into the existing
    instruction patterns.  Eventually early-ra will split the SVE register
    off from the patterns, but by then we are past combine and the insn
    foldings, so we miss all of those optimizations.

    This patch introduces vec_extract optabs for 128-bit and 64-bit Adv.SIMD
    vector extractions from SVE registers and emits an explicit separate
    instruction for the subregs.  This gives combine and RTL folding the
    opportunity to form the combined instructions; if they do not, we arrive
    at the same RTL after early-ra as before.
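
    As a small sketch of the 64-bit path (the function below is illustrative
    and not part of the patch; the equivalent low-half case is exercised by
    the new fold_to_highpart_2.c test), extracting the low halves through the
    bridge now also folds into a single widening instruction:

        svint16_t sub_low_sketch(svint8_t a, svint8_t b) {
            /* Illustrative sketch: vget_low_s8 of the bridged value is
               effectively a 64-bit extraction at offset 0 of the SVE
               register, which the new expander emits as a separate insn
               so combine can fold it.  */
            return svset_neonq_s16(svundef_s16(),
                        vsubq_s16(vmovl_s8(vget_low_s8(svget_neonq(a))),
                                  vmovl_s8(vget_low_s8(svget_neonq(b)))));
        }

    which, per the expected assembly in that test, compiles to:

        ssubl   v0.8h, v0.8b, v1.8b
        ret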

    gcc/ChangeLog:

            * config/aarch64/aarch64-sve.md (vec_extract<mode><v128>,
            vec_extract<mode><v64>): New.
            * config/aarch64/iterators.md (V64, v64): New.
            * config/aarch64/predicates.md (const0_to_1_operand): New.

    gcc/testsuite/ChangeLog:

            * gcc.target/aarch64/simd/fold_to_highpart_6.c: Update codegen.
            * gcc.target/aarch64/sve/fold_to_highpart_1.c: New test.
            * gcc.target/aarch64/sve/fold_to_highpart_2.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64-sve.md                  |  42 +++
 gcc/config/aarch64/iterators.md                    |  12 +
 gcc/config/aarch64/predicates.md                   |   4 +
 .../gcc.target/aarch64/simd/fold_to_highpart_6.c   |   9 +-
 .../gcc.target/aarch64/sve/fold_to_highpart_1.c    |  19 ++
 .../gcc.target/aarch64/sve/fold_to_highpart_2.c    | 295 +++++++++++++++++++++
 6 files changed, 380 insertions(+), 1 deletion(-)

diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 4648aa67e0c3..26c08dbd9208 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -3112,6 +3112,48 @@
 }
 )
 
+;; Don't allow expansions of SVE to Adv. SIMD registers immediately as subregs.
+;; Doing so prevents combine from matching instructions generated by the
+;; SVE/Adv. SIMD bridge as the SVE modes are not valid inside the instructions.
+;; Eventually early-ra or reload will split them but by then we've lost the
+;; combinations.  Instead split them early and allow fwprop or combine to
+;; push them into instructions where they are actually supported as part of the
+;; instruction.
+(define_expand "vec_extract<mode><v128>"
+  [(match_operand:<V128> 0 "register_operand")
+   (match_operand:SVE_FULL 1 "register_operand")
+   (match_operand:SI 2 "const0_operand")]
+  "TARGET_SVE"
+{
+  emit_move_insn (operands[0],
+                  force_lowpart_subreg (<V128>mode, operands[1], <MODE>mode));
+  DONE;
+})
+
+;; Similarly for extractions of 64-bit Adv. SIMD vectors from SVE vectors.  For
+;; these extractions we can support offsets 0 and 1 by first extracting a
+;; 128-bit vector and then selecting the appropriate half.
+(define_expand "vec_extract<mode><v64>"
+  [(match_operand:<V64> 0 "register_operand")
+   (match_operand:SVE_FULL_BHS 1 "register_operand")
+   (match_operand:SI 2 "const0_to_1_operand")]
+  "TARGET_SVE"
+{
+  if (CONST0_RTX (SImode) == operands[2])
+    emit_move_insn (operands[0],
+                    force_lowpart_subreg (<V64>mode, operands[1],
+                                          <MODE>mode));
+  else
+    {
+      rtx tmp = gen_reg_rtx (<V128>mode);
+      emit_move_insn (tmp,
+                      force_lowpart_subreg (<V128>mode, operands[1],
+                                            <MODE>mode));
+      emit_insn (gen_vec_extract<v128><v64> (operands[0], tmp, operands[2]));
+    }
+  DONE;
+})
+
 ;; Extract element zero.  This is a special case because we want to force
 ;; the registers to be the same for the second alternative, and then
 ;; split the instruction into nothing after RA.
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 32e5009a1f96..0c80b7adeaef 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -1820,6 +1820,18 @@
                          (VNx4SI "v4si") (VNx4SF "v4sf")
                          (VNx2DI "v2di") (VNx2DF "v2df")])
 
+;; Gives the mode of the 64-bit lowpart of an SVE vector.
+(define_mode_attr V64 [(VNx16QI "V8QI")
+                       (VNx8HI "V4HI") (VNx8HF "V4HF") (VNx8BF "V4BF")
+                       (VNx4SI "V2SI") (VNx4SF "V2SF")
+                       (VNx2DI "DI") (VNx2DF "DF")])
+
+;; ...and again in lower case.
+(define_mode_attr v64 [(VNx16QI "v8qi")
+                       (VNx8HI "v4hi") (VNx8HF "v4hf") (VNx8BF "v4bf")
+                       (VNx4SI "v2si") (VNx4SF "v2sf")
+                       (VNx2DI "di") (VNx2DF "df")])
+
 (define_mode_attr vnx [(V4SI "vnx4si") (V2DI "vnx2di")])
 
 ;; 64-bit container modes the inner or scalar source mode.
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 3214476497c6..f53591d4b045 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -46,6 +46,10 @@
   (and (match_code "const_int")
        (match_test "op == CONST0_RTX (mode)")))
 
+(define_predicate "const0_to_1_operand"
+  (and (match_code "const_int")
+       (match_test "IN_RANGE (INTVAL (op), 0, 1)")))
+
 (define_predicate "const_0_to_7_operand"
   (and (match_code "const_int")
        (match_test "IN_RANGE (INTVAL (op), 0, 7)")))
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_6.c b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_6.c
index 3570d4da34b5..83ef2148fd84 100644
--- a/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_6.c
+++ b/gcc/testsuite/gcc.target/aarch64/simd/fold_to_highpart_6.c
@@ -1,6 +1,7 @@
 /* { dg-do compile } */
 /* { dg-require-effective-target aarch64_little_endian } */
 /* { dg-options "-O2 -march=armv8-a+sve" } */
+/* { dg-final { check-function-bodies "**" "" } } */
 
 #include <arm_neon_sve_bridge.h>
 
@@ -16,6 +17,11 @@ test_addressable ()
   return vmovl_s8 (vget_high_s8 (z));
 }
 
+/*
+** test_scalable_type:
+**	sxtl2	v0.8h, v0.16b
+**	ret
+*/
 int16x8_t
 test_scalable_type (svint8_t scalable)
 {
@@ -34,4 +40,5 @@ test_256b_type (int16x16_t foo)
   return vmovl_s16 ((int16x4_t) { foo[4], foo[5], foo[6], foo[7] });
 }
 
-/* { dg-final { scan-assembler-not {sxtl2\t} } } */
+/* { dg-final { scan-assembler-times {sxtl2\t} 1 } } */
+/* { dg-final { scan-assembler-times {sxtl\t} 3 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fold_to_highpart_1.c b/gcc/testsuite/gcc.target/aarch64/sve/fold_to_highpart_1.c
new file mode 100644
index 000000000000..a3d59a498bf6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fold_to_highpart_1.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O1" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_neon.h>
+#include <arm_neon_sve_bridge.h>
+#include <stdint.h>
+
+/*
+** sub_neon_i16_sve_bridged:
+**	ssubl2	v0.8h, v0.16b, v1.16b
+**	ret
+*/
+svint16_t sub_neon_i16_sve_bridged(svint8_t a, svint8_t b) {
+  return svset_neonq_s16(svundef_s16(),
+                         vsubq_s16(vmovl_high_s8(svget_neonq(a)),
+                                   vmovl_high_s8(svget_neonq(b))));
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fold_to_highpart_2.c b/gcc/testsuite/gcc.target/aarch64/sve/fold_to_highpart_2.c
new file mode 100644
index 000000000000..6cca4adb8651
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fold_to_highpart_2.c
@@ -0,0 +1,295 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O1" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <arm_neon.h>
+#include <arm_sve.h>
+#include <arm_neon_sve_bridge.h>
+
+// ============================================================================
+// 8 -> 16 : SIGNED
+// ============================================================================
+
+/*
+** add_neon_i16_from_i8_low_sve_bridged:
+**	saddl	v0.8h, v0.8b, v1.8b
+**	ret
+*/
+svint16_t add_neon_i16_from_i8_low_sve_bridged(svint8_t a, svint8_t b) {
+  int16x8_t ar = vmovl_s8(vget_low_s8(svget_neonq(a)));
+  int16x8_t br = vmovl_s8(vget_low_s8(svget_neonq(b)));
+  return svset_neonq_s16(svundef_s16(), vaddq_s16(ar, br));
+}
+
+/*
+** add_neon_i16_from_i8_high_sve_bridged:
+**	saddl2	v0.8h, v0.16b, v1.16b
+**	ret
+*/
+svint16_t add_neon_i16_from_i8_high_sve_bridged(svint8_t a, svint8_t b) {
+  int16x8_t ar = vmovl_s8(vget_high_s8(svget_neonq(a)));
+  int16x8_t br = vmovl_s8(vget_high_s8(svget_neonq(b)));
+  return svset_neonq_s16(svundef_s16(), vaddq_s16(ar, br));
+}
+
+/*
+** sub_neon_i16_from_i8_low_sve_bridged:
+**	ssubl	v0.8h, v0.8b, v1.8b
+**	ret
+*/
+svint16_t sub_neon_i16_from_i8_low_sve_bridged(svint8_t a, svint8_t b) {
+  int16x8_t ar = vmovl_s8(vget_low_s8(svget_neonq(a)));
+  int16x8_t br = vmovl_s8(vget_low_s8(svget_neonq(b)));
+  return svset_neonq_s16(svundef_s16(), vsubq_s16(ar, br));
+}
+
+/*
+** sub_neon_i16_from_i8_high_sve_bridged:
+**	ssubl2	v0.8h, v0.16b, v1.16b
+**	ret
+*/
+svint16_t sub_neon_i16_from_i8_high_sve_bridged(svint8_t a, svint8_t b) {
+  int16x8_t ar = vmovl_s8(vget_high_s8(svget_neonq(a)));
+  int16x8_t br = vmovl_s8(vget_high_s8(svget_neonq(b)));
+  return svset_neonq_s16(svundef_s16(), vsubq_s16(ar, br));
+}
+
+// ============================================================================
+// 8 -> 16 : UNSIGNED
+// ============================================================================
+
+/*
+** add_neon_u16_from_u8_low_sve_bridged:
+**	uaddl	v0.8h, v0.8b, v1.8b
+**	ret
+*/
+svuint16_t add_neon_u16_from_u8_low_sve_bridged(svuint8_t a, svuint8_t b) {
+  uint16x8_t ar = vmovl_u8(vget_low_u8(svget_neonq(a)));
+  uint16x8_t br = vmovl_u8(vget_low_u8(svget_neonq(b)));
+  return svset_neonq_u16(svundef_u16(), vaddq_u16(ar, br));
+}
+
+/*
+** add_neon_u16_from_u8_high_sve_bridged:
+**	uaddl2	v0.8h, v0.16b, v1.16b
+**	ret
+*/
+svuint16_t add_neon_u16_from_u8_high_sve_bridged(svuint8_t a, svuint8_t b) {
+  uint16x8_t ar = vmovl_u8(vget_high_u8(svget_neonq(a)));
+  uint16x8_t br = vmovl_u8(vget_high_u8(svget_neonq(b)));
+  return svset_neonq_u16(svundef_u16(), vaddq_u16(ar, br));
+}
+
+/*
+** sub_neon_u16_from_u8_low_sve_bridged:
+**	usubl	v0.8h, v0.8b, v1.8b
+**	ret
+*/
+svuint16_t sub_neon_u16_from_u8_low_sve_bridged(svuint8_t a, svuint8_t b) {
+  uint16x8_t ar = vmovl_u8(vget_low_u8(svget_neonq(a)));
+  uint16x8_t br = vmovl_u8(vget_low_u8(svget_neonq(b)));
+  return svset_neonq_u16(svundef_u16(), vsubq_u16(ar, br));
+}
+
+/*
+** sub_neon_u16_from_u8_high_sve_bridged:
+**	usubl2	v0.8h, v0.16b, v1.16b
+**	ret
+*/
+svuint16_t sub_neon_u16_from_u8_high_sve_bridged(svuint8_t a, svuint8_t b) {
+  uint16x8_t ar = vmovl_u8(vget_high_u8(svget_neonq(a)));
+  uint16x8_t br = vmovl_u8(vget_high_u8(svget_neonq(b)));
+  return svset_neonq_u16(svundef_u16(), vsubq_u16(ar, br));
+}
+
+// ============================================================================
+// 16 -> 32 : SIGNED
+// ============================================================================
+
+/*
+** add_neon_i32_from_i16_low_sve_bridged:
+**	saddl	v0.4s, v0.4h, v1.4h
+**	ret
+*/
+svint32_t add_neon_i32_from_i16_low_sve_bridged(svint16_t a, svint16_t b) {
+  int32x4_t ar = vmovl_s16(vget_low_s16(svget_neonq(a)));
+  int32x4_t br = vmovl_s16(vget_low_s16(svget_neonq(b)));
+  return svset_neonq_s32(svundef_s32(), vaddq_s32(ar, br));
+}
+
+/*
+** add_neon_i32_from_i16_high_sve_bridged:
+**	saddl2	v0.4s, v0.8h, v1.8h
+**	ret
+*/
+svint32_t add_neon_i32_from_i16_high_sve_bridged(svint16_t a, svint16_t b) {
+  int32x4_t ar = vmovl_s16(vget_high_s16(svget_neonq(a)));
+  int32x4_t br = vmovl_s16(vget_high_s16(svget_neonq(b)));
+  return svset_neonq_s32(svundef_s32(), vaddq_s32(ar, br));
+}
+
+/*
+** sub_neon_i32_from_i16_low_sve_bridged:
+**	ssubl	v0.4s, v0.4h, v1.4h
+**	ret
+*/
+svint32_t sub_neon_i32_from_i16_low_sve_bridged(svint16_t a, svint16_t b) {
+  int32x4_t ar = vmovl_s16(vget_low_s16(svget_neonq(a)));
+  int32x4_t br = vmovl_s16(vget_low_s16(svget_neonq(b)));
+  return svset_neonq_s32(svundef_s32(), vsubq_s32(ar, br));
+}
+
+/*
+** sub_neon_i32_from_i16_high_sve_bridged:
+**	ssubl2	v0.4s, v0.8h, v1.8h
+**	ret
+*/
+svint32_t sub_neon_i32_from_i16_high_sve_bridged(svint16_t a, svint16_t b) {
+  int32x4_t ar = vmovl_s16(vget_high_s16(svget_neonq(a)));
+  int32x4_t br = vmovl_s16(vget_high_s16(svget_neonq(b)));
+  return svset_neonq_s32(svundef_s32(), vsubq_s32(ar, br));
+}
+
+// ============================================================================
+// 16 -> 32 : UNSIGNED
+// ============================================================================
+
+/*
+** add_neon_u32_from_u16_low_sve_bridged:
+**	uaddl	v0.4s, v0.4h, v1.4h
+**	ret
+*/
+svuint32_t add_neon_u32_from_u16_low_sve_bridged(svuint16_t a, svuint16_t b) {
+  uint32x4_t ar = vmovl_u16(vget_low_u16(svget_neonq(a)));
+  uint32x4_t br = vmovl_u16(vget_low_u16(svget_neonq(b)));
+  return svset_neonq_u32(svundef_u32(), vaddq_u32(ar, br));
+}
+
+/*
+** add_neon_u32_from_u16_high_sve_bridged:
+**	uaddl2	v0.4s, v0.8h, v1.8h
+**	ret
+*/
+svuint32_t add_neon_u32_from_u16_high_sve_bridged(svuint16_t a, svuint16_t b) {
+  uint32x4_t ar = vmovl_u16(vget_high_u16(svget_neonq(a)));
+  uint32x4_t br = vmovl_u16(vget_high_u16(svget_neonq(b)));
+  return svset_neonq_u32(svundef_u32(), vaddq_u32(ar, br));
+}
+
+/*
+** sub_neon_u32_from_u16_low_sve_bridged:
+**	usubl	v0.4s, v0.4h, v1.4h
+**	ret
+*/
+svuint32_t sub_neon_u32_from_u16_low_sve_bridged(svuint16_t a, svuint16_t b) {
+  uint32x4_t ar = vmovl_u16(vget_low_u16(svget_neonq(a)));
+  uint32x4_t br = vmovl_u16(vget_low_u16(svget_neonq(b)));
+  return svset_neonq_u32(svundef_u32(), vsubq_u32(ar, br));
+}
+
+/*
+** sub_neon_u32_from_u16_high_sve_bridged:
+**	usubl2	v0.4s, v0.8h, v1.8h
+**	ret
+*/
+svuint32_t sub_neon_u32_from_u16_high_sve_bridged(svuint16_t a, svuint16_t b) {
+  uint32x4_t ar = vmovl_u16(vget_high_u16(svget_neonq(a)));
+  uint32x4_t br = vmovl_u16(vget_high_u16(svget_neonq(b)));
+  return svset_neonq_u32(svundef_u32(), vsubq_u32(ar, br));
+}
+
+// ============================================================================
+// 32 -> 64 : SIGNED
+// ============================================================================
+
+/*
+** add_neon_i64_from_i32_low_sve_bridged:
+**	saddl	v0.2d, v0.2s, v1.2s
+**	ret
+*/
+svint64_t add_neon_i64_from_i32_low_sve_bridged(svint32_t a, svint32_t b) {
+  int64x2_t ar = vmovl_s32(vget_low_s32(svget_neonq(a)));
+  int64x2_t br = vmovl_s32(vget_low_s32(svget_neonq(b)));
+  return svset_neonq_s64(svundef_s64(), vaddq_s64(ar, br));
+}
+
+/*
+** add_neon_i64_from_i32_high_sve_bridged:
+**	saddl2	v0.2d, v0.4s, v1.4s
+**	ret
+*/
+svint64_t add_neon_i64_from_i32_high_sve_bridged(svint32_t a, svint32_t b) {
+  int64x2_t ar = vmovl_s32(vget_high_s32(svget_neonq(a)));
+  int64x2_t br = vmovl_s32(vget_high_s32(svget_neonq(b)));
+  return svset_neonq_s64(svundef_s64(), vaddq_s64(ar, br));
+}
+
+/*
+** sub_neon_i64_from_i32_low_sve_bridged:
+**	ssubl	v0.2d, v0.2s, v1.2s
+**	ret
+*/
+svint64_t sub_neon_i64_from_i32_low_sve_bridged(svint32_t a, svint32_t b) {
+  int64x2_t ar = vmovl_s32(vget_low_s32(svget_neonq(a)));
+  int64x2_t br = vmovl_s32(vget_low_s32(svget_neonq(b)));
+  return svset_neonq_s64(svundef_s64(), vsubq_s64(ar, br));
+}
+
+/*
+** sub_neon_i64_from_i32_high_sve_bridged:
+**	ssubl2	v0.2d, v0.4s, v1.4s
+**	ret
+*/
+svint64_t sub_neon_i64_from_i32_high_sve_bridged(svint32_t a, svint32_t b) {
+  int64x2_t ar = vmovl_s32(vget_high_s32(svget_neonq(a)));
+  int64x2_t br = vmovl_s32(vget_high_s32(svget_neonq(b)));
+  return svset_neonq_s64(svundef_s64(), vsubq_s64(ar, br));
+}
+
+// ============================================================================
+// 32 -> 64 : UNSIGNED
+// ============================================================================
+
+/*
+** add_neon_u64_from_u32_low_sve_bridged:
+**	uaddl	v0.2d, v0.2s, v1.2s
+**	ret
+*/
+svuint64_t add_neon_u64_from_u32_low_sve_bridged(svuint32_t a, svuint32_t b) {
+  uint64x2_t ar = vmovl_u32(vget_low_u32(svget_neonq(a)));
+  uint64x2_t br = vmovl_u32(vget_low_u32(svget_neonq(b)));
+  return svset_neonq_u64(svundef_u64(), vaddq_u64(ar, br));
+}
+
+/*
+** add_neon_u64_from_u32_high_sve_bridged:
+**	uaddl2	v0.2d, v0.4s, v1.4s
+**	ret
+*/
+svuint64_t add_neon_u64_from_u32_high_sve_bridged(svuint32_t a, svuint32_t b) {
+  uint64x2_t ar = vmovl_u32(vget_high_u32(svget_neonq(a)));
+  uint64x2_t br = vmovl_u32(vget_high_u32(svget_neonq(b)));
+  return svset_neonq_u64(svundef_u64(), vaddq_u64(ar, br));
+}
+
+/*
+** sub_neon_u64_from_u32_low_sve_bridged:
+**	usubl	v0.2d, v0.2s, v1.2s
+**	ret
+*/
+svuint64_t sub_neon_u64_from_u32_low_sve_bridged(svuint32_t a, svuint32_t b) {
+  uint64x2_t ar = vmovl_u32(vget_low_u32(svget_neonq(a)));
+  uint64x2_t br = vmovl_u32(vget_low_u32(svget_neonq(b)));
+  return svset_neonq_u64(svundef_u64(), vsubq_u64(ar, br));
+}
+
+/*
+** sub_neon_u64_from_u32_high_sve_bridged:
+**	usubl2	v0.2d, v0.4s, v1.4s
+**	ret
+*/
+svuint64_t sub_neon_u64_from_u32_high_sve_bridged(svuint32_t a, svuint32_t b) {
+  uint64x2_t ar = vmovl_u32(vget_high_u32(svget_neonq(a)));
+  uint64x2_t br = vmovl_u32(vget_high_u32(svget_neonq(b)));
+  return svset_neonq_u64(svundef_u64(), vsubq_u64(ar, br));
+}
