Hi All,

This patch adds implementations of the optabs for complex operations.  With
it, the following C code:
void f90 (int _Complex a[restrict N], int _Complex b[restrict N],
          int _Complex c[restrict N])
{
  for (int i = 0; i < N; i++)
    c[i] = a[i] + (b[i] * I);
}

generates

.L3:
        mov     r3, r0
        vldrw.32        q2, [r3]
        mov     r3, r1
        vldrw.32        q1, [r3]
        mov     r3, r2
        vcadd.i32       q3, q2, q1, #90
        adds    r0, r0, #16
        vstrw.32        q3, [r3]
        adds    r1, r1, #16
        adds    r2, r2, #16
        le      lr, .L3
        pop     {r4, r5, r6, r7, r8, pc}

which is not ideal due to register allocation and addressing-mode issues
with MVE in general.  However, -frename-registers cleans up the register
allocation:

.L3:
        mov     r5, r0
        mov     r6, r1
        vldrw.32        q2, [r5]
        vldrw.32        q1, [r6]
        mov     r7, r2
        vcadd.i32       q3, q2, q1, #90
        adds    r0, r0, #16
        vstrw.32        q3, [r7]
        adds    r1, r1, #16
        adds    r2, r2, #16
        le      lr, .L3
        pop     {r4, r5, r6, r7, r8, pc}

but leaves the addressing-mode problems.  Before this patch the loop was not
vectorized and generated scalar code:

.L2:
        ldr     r7, [r0, r3, lsl #2]
        ldr     r5, [r6, r3, lsl #2]
        ldr     r4, [r1, r3, lsl #2]
        subs    r5, r7, r5
        ldr     r7, [lr, r3, lsl #2]
        add     r4, r4, r7
        str     r5, [r2, r3, lsl #2]
        str     r4, [ip, r3, lsl #2]
        adds    r3, r3, #2
        cmp     r3, #200
        bne     .L2
        pop     {r4, r5, r6, r7, pc}

Bootstrapped and regtested on arm-none-linux-gnueabihf with no issues.
Cross-compiled for arm-none-eabi and ran regression tests with
-march=armv8.1-m.main+mve.fp -mfloat-abi=hard -mfpu=auto; that run is still
ongoing.

Unfortunately MVE does not currently support auto-vectorization of
floating-point types, so I cannot test the floating-point patterns directly.
But since they share 90% of their code with NEON they should just work
whenever that support is added, so I would still like to commit them.

To support this I had to refactor the MVE bits a bit.  The patch now uses
the same unspecs for both NEON and MVE, and removes the unneeded separate
signed and unsigned unspecs, since both map to the same (signed) instruction.
I have tried multiple approaches to cleaning this up, but I think this is the
nicest it can get given the slight ISA differences.
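To illustrate the kind of code the floating-point patterns are meant to
catch once that support lands: a complex multiply-accumulate loop of the
shape below should expand through the new cml<fcmac1><rot_op><mode>4 optab
into two vcmla instructions (rotation #0 followed by #90), each doing half
the computation.  This is only a sketch of mine (the function name, N, and
loop shape are illustrative, not a testcase from this patch), and as noted
above it will not vectorize on MVE today:

#define N 200

/* Illustrative only: a complex fused multiply-accumulate.  Once FP
   auto-vectorization works on MVE this should map to the vcmla #0 /
   vcmla #90 pair emitted by the cml expander in vec-common.md.  */
void fma0 (float _Complex a[restrict N], float _Complex b[restrict N],
           float _Complex c[restrict N])
{
  for (int i = 0; i < N; i++)
    c[i] += a[i] * b[i];
}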
Ok for master if no issues?

Thanks,
Tamar

gcc/ChangeLog:

	* config/arm/arm_mve.h (__arm_vcaddq_rot90_u8,
	__arm_vcaddq_rot270_u8, __arm_vcaddq_rot90_s8,
	__arm_vcaddq_rot270_s8, __arm_vcaddq_rot90_u16,
	__arm_vcaddq_rot270_u16, __arm_vcaddq_rot90_s16,
	__arm_vcaddq_rot270_s16, __arm_vcaddq_rot90_u32,
	__arm_vcaddq_rot270_u32, __arm_vcaddq_rot90_s32,
	__arm_vcaddq_rot270_s32, __arm_vcmulq_rot90_f16,
	__arm_vcmulq_rot270_f16, __arm_vcmulq_rot180_f16,
	__arm_vcmulq_f16, __arm_vcaddq_rot90_f16, __arm_vcaddq_rot270_f16,
	__arm_vcmulq_rot90_f32, __arm_vcmulq_rot270_f32,
	__arm_vcmulq_rot180_f32, __arm_vcmulq_f32, __arm_vcaddq_rot90_f32,
	__arm_vcaddq_rot270_f32, __arm_vcmlaq_f16, __arm_vcmlaq_rot180_f16,
	__arm_vcmlaq_rot270_f16, __arm_vcmlaq_rot90_f16, __arm_vcmlaq_f32,
	__arm_vcmlaq_rot180_f32, __arm_vcmlaq_rot270_f32,
	__arm_vcmlaq_rot90_f32): Update builtin calls.
	* config/arm/arm_mve_builtins.def (vcaddq_rot90_u, vcaddq_rot270_u,
	vcaddq_rot90_s, vcaddq_rot270_s, vcaddq_rot90_f, vcaddq_rot270_f,
	vcmulq_f, vcmulq_rot90_f, vcmulq_rot180_f, vcmulq_rot270_f,
	vcmlaq_f, vcmlaq_rot90_f, vcmlaq_rot180_f, vcmlaq_rot270_f):
	Removed.
	(vcaddq_rot90, vcaddq_rot270, vcmulq, vcmulq_rot90, vcmulq_rot180,
	vcmulq_rot270, vcmlaq, vcmlaq_rot90, vcmlaq_rot180,
	vcmlaq_rot270): New.
	* config/arm/constraints.md (Dz): Include MVE.
	* config/arm/iterators.md (mve_rotsplit1, mve_rotsplit2): New.
	* config/arm/mve.md (VCADDQ_ROT270_S, VCADDQ_ROT90_S,
	VCADDQ_ROT270_U, VCADDQ_ROT90_U, VCADDQ_ROT270_F, VCADDQ_ROT90_F,
	VCMULQ_F, VCMULQ_ROT180_F, VCMULQ_ROT270_F, VCMULQ_ROT90_F,
	VCMLAQ_F, VCMLAQ_ROT180_F, VCMLAQ_ROT90_F, VCMLAQ_ROT270_F,
	VCADDQ_ROT270, VCADDQ_ROT90): Removed.
	(mve_rot, VCMUL): New.
	(mve_vcaddq_rot270_<supf><mode>, mve_vcaddq_rot90_<supf><mode>,
	mve_vcaddq_rot270_f<mode>, mve_vcaddq_rot90_f<mode>,
	mve_vcmulq_f<mode>, mve_vcmulq_rot180_f<mode>,
	mve_vcmulq_rot270_f<mode>, mve_vcmulq_rot90_f<mode>,
	mve_vcmlaq_f<mode>, mve_vcmlaq_rot180_f<mode>,
	mve_vcmlaq_rot270_f<mode>, mve_vcmlaq_rot90_f<mode>): Removed.
	(mve_vcmlaq<mve_rot><mode>, mve_vcmulq<mve_rot><mode>,
	mve_vcaddq<mve_rot><mode>, cadd<rot><mode>3,
	mve_vcaddq<mve_rot><mode>): New.
	* config/arm/neon.md (cadd<rot><mode>3, cml<fcmac1><rot_op><mode>4):
	Moved.
	(cmul<rot_op><mode>3): Exclude MVE types.
	* config/arm/unspecs.md (UNSPEC_VCMUL90, UNSPEC_VCMUL270): New.
	* config/arm/vec-common.md (cadd<rot><mode>3, cmul<rot_op><mode>3,
	arm_vcmla<rot><mode>, cml<fcmac1><rot_op><mode>4): New.

---
diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h
index a801705ced582105df60ccdc79a7500b320e12d4..bd379a5d915ad1d682f2d92554f4bd03c2762733 100644
--- a/gcc/config/arm/arm_mve.h
+++ b/gcc/config/arm/arm_mve.h
@@ -3983,14 +3983,14 @@ __extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcaddq_rot90_u8 (uint8x16_t __a, uint8x16_t __b)
 {
-  return __builtin_mve_vcaddq_rot90_uv16qi (__a, __b);
+  return __builtin_mve_vcaddq_rot90v16qi (__a, __b);
 }
 
 __extension__ extern __inline uint8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcaddq_rot270_u8 (uint8x16_t __a, uint8x16_t __b)
 {
-  return __builtin_mve_vcaddq_rot270_uv16qi (__a, __b);
+  return __builtin_mve_vcaddq_rot270v16qi (__a, __b);
 }
 
 __extension__ extern __inline uint8x16_t
@@ -4522,14 +4522,14 @@ __extension__ extern __inline int8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcaddq_rot90_s8 (int8x16_t __a, int8x16_t __b)
 {
-  return __builtin_mve_vcaddq_rot90_sv16qi (__a, __b);
+  return __builtin_mve_vcaddq_rot90v16qi (__a, __b);
 }
 
 __extension__ extern __inline int8x16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcaddq_rot270_s8 (int8x16_t __a, int8x16_t __b)
 {
-  return __builtin_mve_vcaddq_rot270_sv16qi (__a, __b);
+  return __builtin_mve_vcaddq_rot270v16qi (__a, __b);
 }
 
 __extension__ extern __inline int8x16_t
@@ -4823,14 +4823,14 @@ __extension__ extern __inline uint16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcaddq_rot90_u16 (uint16x8_t __a, uint16x8_t __b)
 {
-  return __builtin_mve_vcaddq_rot90_uv8hi (__a, __b);
+  return __builtin_mve_vcaddq_rot90v8hi (__a, __b);
 }
 
 __extension__ extern __inline uint16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcaddq_rot270_u16 (uint16x8_t __a, uint16x8_t __b)
 {
-  return __builtin_mve_vcaddq_rot270_uv8hi (__a, __b);
+  return __builtin_mve_vcaddq_rot270v8hi (__a, __b);
 }
 
 __extension__ extern __inline uint16x8_t
@@ -5362,14 +5362,14 @@ __extension__ extern __inline int16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcaddq_rot90_s16 (int16x8_t __a, int16x8_t __b)
 {
-  return __builtin_mve_vcaddq_rot90_sv8hi (__a, __b);
+  return __builtin_mve_vcaddq_rot90v8hi (__a, __b);
 }
 
 __extension__ extern __inline int16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcaddq_rot270_s16 (int16x8_t __a, int16x8_t __b)
 {
-  return __builtin_mve_vcaddq_rot270_sv8hi (__a, __b);
+  return __builtin_mve_vcaddq_rot270v8hi (__a, __b);
 }
 
 __extension__ extern __inline int16x8_t
@@ -5663,14 +5663,14 @@ __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcaddq_rot90_u32 (uint32x4_t __a, uint32x4_t __b)
 {
-  return __builtin_mve_vcaddq_rot90_uv4si (__a, __b);
+  return __builtin_mve_vcaddq_rot90v4si (__a, __b);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcaddq_rot270_u32 (uint32x4_t __a, uint32x4_t __b)
 {
-  return __builtin_mve_vcaddq_rot270_uv4si (__a, __b);
+  return __builtin_mve_vcaddq_rot270v4si (__a, __b);
 }
 
 __extension__ extern __inline uint32x4_t
@@ -6202,14 +6202,14 @@ __extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcaddq_rot90_s32 (int32x4_t __a, int32x4_t __b)
 {
-  return __builtin_mve_vcaddq_rot90_sv4si (__a, __b);
+  return __builtin_mve_vcaddq_rot90v4si (__a, __b);
 }
 
 __extension__ extern __inline int32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcaddq_rot270_s32 (int32x4_t __a, int32x4_t __b)
 {
-  return __builtin_mve_vcaddq_rot270_sv4si (__a, __b);
+  return __builtin_mve_vcaddq_rot270v4si (__a, __b);
 }
 
 __extension__ extern __inline int32x4_t
@@ -17380,42 +17380,42 @@ __extension__ extern __inline float16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmulq_rot90_f16 (float16x8_t __a, float16x8_t __b)
 {
-  return __builtin_mve_vcmulq_rot90_fv8hf (__a, __b);
+  return __builtin_mve_vcmulq_rot90v8hf (__a, __b);
 }
 
 __extension__ extern __inline float16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmulq_rot270_f16 (float16x8_t __a, float16x8_t __b)
 {
-  return __builtin_mve_vcmulq_rot270_fv8hf (__a, __b);
+  return __builtin_mve_vcmulq_rot270v8hf (__a, __b);
 }
 
 __extension__ extern __inline float16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmulq_rot180_f16 (float16x8_t __a, float16x8_t __b)
 {
-  return __builtin_mve_vcmulq_rot180_fv8hf (__a, __b);
+  return __builtin_mve_vcmulq_rot180v8hf (__a, __b);
 }
 
 __extension__ extern __inline float16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmulq_f16 (float16x8_t __a, float16x8_t __b)
 {
-  return __builtin_mve_vcmulq_fv8hf (__a, __b);
+  return __builtin_mve_vcmulqv8hf (__a, __b);
 }
 
 __extension__ extern __inline float16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcaddq_rot90_f16 (float16x8_t __a, float16x8_t __b)
 {
-  return __builtin_mve_vcaddq_rot90_fv8hf (__a, __b);
+  return __builtin_mve_vcaddq_rot90v8hf (__a, __b);
 }
 
 __extension__ extern __inline float16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcaddq_rot270_f16 (float16x8_t __a, float16x8_t __b)
 {
-  return __builtin_mve_vcaddq_rot270_fv8hf (__a, __b);
+  return __builtin_mve_vcaddq_rot270v8hf (__a, __b);
 }
 
 __extension__ extern __inline float16x8_t
@@ -17632,42 +17632,42 @@ __extension__ extern __inline float32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmulq_rot90_f32 (float32x4_t __a, float32x4_t __b)
 {
-  return __builtin_mve_vcmulq_rot90_fv4sf (__a, __b);
+  return __builtin_mve_vcmulq_rot90v4sf (__a, __b);
 }
 
 __extension__ extern __inline float32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmulq_rot270_f32 (float32x4_t __a, float32x4_t __b)
 {
-  return __builtin_mve_vcmulq_rot270_fv4sf (__a, __b);
+  return __builtin_mve_vcmulq_rot270v4sf (__a, __b);
 }
 
 __extension__ extern __inline float32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmulq_rot180_f32 (float32x4_t __a, float32x4_t __b)
 {
-  return __builtin_mve_vcmulq_rot180_fv4sf (__a, __b);
+  return __builtin_mve_vcmulq_rot180v4sf (__a, __b);
 }
 
 __extension__ extern __inline float32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmulq_f32 (float32x4_t __a, float32x4_t __b)
 {
-  return __builtin_mve_vcmulq_fv4sf (__a, __b);
+  return __builtin_mve_vcmulqv4sf (__a, __b);
 }
 
 __extension__ extern __inline float32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcaddq_rot90_f32 (float32x4_t __a, float32x4_t __b)
 {
-  return __builtin_mve_vcaddq_rot90_fv4sf (__a, __b);
+  return __builtin_mve_vcaddq_rot90v4sf (__a, __b);
 }
 
 __extension__ extern __inline float32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcaddq_rot270_f32 (float32x4_t __a, float32x4_t __b)
 {
-  return __builtin_mve_vcaddq_rot270_fv4sf (__a, __b);
+  return __builtin_mve_vcaddq_rot270v4sf (__a, __b);
 }
 
 __extension__ extern __inline float32x4_t
@@ -17822,28 +17822,28 @@ __extension__ extern __inline float16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmlaq_f16 (float16x8_t __a, float16x8_t __b, float16x8_t __c)
 {
-  return __builtin_mve_vcmlaq_fv8hf (__a, __b, __c);
+  return __builtin_mve_vcmlaqv8hf (__a, __b, __c);
 }
 
 __extension__ extern __inline float16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmlaq_rot180_f16 (float16x8_t __a, float16x8_t __b, float16x8_t __c)
 {
-  return __builtin_mve_vcmlaq_rot180_fv8hf (__a, __b, __c);
+  return __builtin_mve_vcmlaq_rot180v8hf (__a, __b, __c);
 }
 
 __extension__ extern __inline float16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmlaq_rot270_f16 (float16x8_t __a, float16x8_t __b, float16x8_t __c)
 {
-  return __builtin_mve_vcmlaq_rot270_fv8hf (__a, __b, __c);
+  return __builtin_mve_vcmlaq_rot270v8hf (__a, __b, __c);
 }
 
 __extension__ extern __inline float16x8_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmlaq_rot90_f16 (float16x8_t __a, float16x8_t __b, float16x8_t __c)
 {
-  return __builtin_mve_vcmlaq_rot90_fv8hf (__a, __b, __c);
+  return __builtin_mve_vcmlaq_rot90v8hf (__a, __b, __c);
 }
 
 __extension__ extern __inline float16x8_t
@@ -18130,28 +18130,28 @@ __extension__ extern __inline float32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmlaq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
 {
-  return __builtin_mve_vcmlaq_fv4sf (__a, __b, __c);
+  return __builtin_mve_vcmlaqv4sf (__a, __b, __c);
 }
 
 __extension__ extern __inline float32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmlaq_rot180_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
 {
-  return __builtin_mve_vcmlaq_rot180_fv4sf (__a, __b, __c);
+  return __builtin_mve_vcmlaq_rot180v4sf (__a, __b, __c);
 }
 
 __extension__ extern __inline float32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmlaq_rot270_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
 {
-  return __builtin_mve_vcmlaq_rot270_fv4sf (__a, __b, __c);
+  return __builtin_mve_vcmlaq_rot270v4sf (__a, __b, __c);
 }
 
 __extension__ extern __inline float32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 __arm_vcmlaq_rot90_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
 {
-  return __builtin_mve_vcmlaq_rot90_fv4sf (__a, __b, __c);
+  return __builtin_mve_vcmlaq_rot90v4sf (__a, __b, __c);
 }
 
 __extension__ extern __inline float32x4_t
diff --git a/gcc/config/arm/arm_mve_builtins.def b/gcc/config/arm/arm_mve_builtins.def
index 753e40a951d071c1ab77476a1cc4779e91689178..f31248595b7bef4eed4963dbbbc72371b15b8af8 100644
--- a/gcc/config/arm/arm_mve_builtins.def
+++ b/gcc/config/arm/arm_mve_builtins.def
@@ -125,8 +125,6 @@ VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpeqq_u, v16qi, v8hi, v4si)
 VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpeqq_n_u, v16qi, v8hi, v4si)
 VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpcsq_u, v16qi, v8hi, v4si)
 VAR3 (BINOP_UNONE_UNONE_UNONE, vcmpcsq_n_u, v16qi, v8hi, v4si)
-VAR3 (BINOP_UNONE_UNONE_UNONE, vcaddq_rot90_u, v16qi, v8hi, v4si)
-VAR3 (BINOP_UNONE_UNONE_UNONE, vcaddq_rot270_u, v16qi, v8hi, v4si)
 VAR3 (BINOP_UNONE_UNONE_UNONE, vbicq_u, v16qi, v8hi, v4si)
 VAR3 (BINOP_UNONE_UNONE_UNONE, vandq_u, v16qi, v8hi, v4si)
 VAR3 (BINOP_UNONE_UNONE_UNONE, vaddvq_p_u, v16qi, v8hi, v4si)
@@ -202,8 +200,6 @@ VAR3 (BINOP_NONE_NONE_NONE, vhcaddq_rot270_s, v16qi, v8hi, v4si)
 VAR3 (BINOP_NONE_NONE_NONE, vhaddq_s, v16qi, v8hi, v4si)
 VAR3 (BINOP_NONE_NONE_NONE, vhaddq_n_s, v16qi, v8hi, v4si)
 VAR3 (BINOP_NONE_NONE_NONE, veorq_s, v16qi, v8hi, v4si)
-VAR3 (BINOP_NONE_NONE_NONE, vcaddq_rot90_s, v16qi, v8hi, v4si)
-VAR3 (BINOP_NONE_NONE_NONE, vcaddq_rot270_s, v16qi, v8hi, v4si)
 VAR3 (BINOP_NONE_NONE_NONE, vbrsrq_n_s, v16qi, v8hi, v4si)
 VAR3 (BINOP_NONE_NONE_NONE, vbicq_s, v16qi, v8hi, v4si)
 VAR3 (BINOP_NONE_NONE_NONE, vandq_s, v16qi, v8hi, v4si)
@@ -264,12 +260,6 @@ VAR2 (BINOP_NONE_NONE_NONE, vmaxnmq_f, v8hf, v4sf)
 VAR2 (BINOP_NONE_NONE_NONE, vmaxnmavq_f, v8hf, v4sf)
 VAR2 (BINOP_NONE_NONE_NONE, vmaxnmaq_f, v8hf, v4sf)
 VAR2 (BINOP_NONE_NONE_NONE, veorq_f, v8hf, v4sf)
-VAR2 (BINOP_NONE_NONE_NONE, vcmulq_rot90_f, v8hf, v4sf)
-VAR2 (BINOP_NONE_NONE_NONE, vcmulq_rot270_f, v8hf, v4sf)
-VAR2 (BINOP_NONE_NONE_NONE, vcmulq_rot180_f, v8hf, v4sf)
-VAR2 (BINOP_NONE_NONE_NONE, vcmulq_f, v8hf, v4sf)
-VAR2 (BINOP_NONE_NONE_NONE, vcaddq_rot90_f, v8hf, v4sf)
-VAR2 (BINOP_NONE_NONE_NONE, vcaddq_rot270_f, v8hf, v4sf)
 VAR2 (BINOP_NONE_NONE_NONE, vbicq_f, v8hf, v4sf)
 VAR2 (BINOP_NONE_NONE_NONE, vandq_f, v8hf, v4sf)
 VAR2 (BINOP_NONE_NONE_NONE, vaddq_n_f, v8hf, v4sf)
@@ -472,10 +462,6 @@ VAR2 (TERNOP_NONE_NONE_NONE_NONE, vfmsq_f, v8hf, v4sf)
 VAR2 (TERNOP_NONE_NONE_NONE_NONE, vfmasq_n_f, v8hf, v4sf)
 VAR2 (TERNOP_NONE_NONE_NONE_NONE, vfmaq_n_f, v8hf, v4sf)
 VAR2 (TERNOP_NONE_NONE_NONE_NONE, vfmaq_f, v8hf, v4sf)
-VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq_rot90_f, v8hf, v4sf)
-VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq_rot270_f, v8hf, v4sf)
-VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq_rot180_f, v8hf, v4sf)
-VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq_f, v8hf, v4sf)
 VAR2 (TERNOP_NONE_NONE_NONE_IMM, vshrntq_n_s, v8hi, v4si)
 VAR2 (TERNOP_NONE_NONE_NONE_IMM, vshrnbq_n_s, v8hi, v4si)
 VAR2 (TERNOP_NONE_NONE_NONE_IMM, vrshrntq_n_s, v8hi, v4si)
@@ -904,3 +890,15 @@ VAR3 (QUADOP_NONE_NONE_UNONE_IMM_UNONE, vshlcq_m_vec_s, v16qi, v8hi, v4si)
 VAR3 (QUADOP_NONE_NONE_UNONE_IMM_UNONE, vshlcq_m_carry_s, v16qi, v8hi, v4si)
 VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vshlcq_m_vec_u, v16qi, v8hi, v4si)
 VAR3 (QUADOP_UNONE_UNONE_UNONE_IMM_UNONE, vshlcq_m_carry_u, v16qi, v8hi, v4si)
+
+/* optabs without any suffixes.  */
+VAR5 (BINOP_NONE_NONE_NONE, vcaddq_rot90, v16qi, v8hi, v4si, v8hf, v4sf)
+VAR5 (BINOP_NONE_NONE_NONE, vcaddq_rot270, v16qi, v8hi, v4si, v8hf, v4sf)
+VAR2 (BINOP_NONE_NONE_NONE, vcmulq_rot90, v8hf, v4sf)
+VAR2 (BINOP_NONE_NONE_NONE, vcmulq_rot270, v8hf, v4sf)
+VAR2 (BINOP_NONE_NONE_NONE, vcmulq_rot180, v8hf, v4sf)
+VAR2 (BINOP_NONE_NONE_NONE, vcmulq, v8hf, v4sf)
+VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq_rot90, v8hf, v4sf)
+VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq_rot270, v8hf, v4sf)
+VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq_rot180, v8hf, v4sf)
+VAR2 (TERNOP_NONE_NONE_NONE_NONE, vcmlaq, v8hf, v4sf)
diff --git a/gcc/config/arm/constraints.md b/gcc/config/arm/constraints.md
index ff229aa98455e05470801d7110b1aaf5ab3e0d25..e166c11a4c78e8dbcdd25d5cfda14cc36daad5b2 100644
--- a/gcc/config/arm/constraints.md
+++ b/gcc/config/arm/constraints.md
@@ -310,7 +310,7 @@ (define_constraint "Dz"
  "@internal
   In ARM/Thumb-2 state a vector of constant zeros."
  (and (match_code "const_vector")
-      (match_test "TARGET_NEON && op == CONST0_RTX (mode)")))
+      (match_test "(TARGET_NEON || TARGET_HAVE_MVE) && op == CONST0_RTX (mode)")))
 
 (define_constraint "Da"
  "@internal
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index a4da379670ce3428253664d44d2e18415a8f49ab..63d4ebe786c5f1262851e462942127adc4a5e92c 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -1168,6 +1168,21 @@ (define_int_attr rotsplit2 [(UNSPEC_VCMLA "90")
			    (UNSPEC_VCMLS "180")
			    (UNSPEC_VCMLS180 "180")])
 
+(define_int_attr mve_rotsplit1 [(UNSPEC_VCMLA "")
+				(UNSPEC_VCMLA180 "")
+				(UNSPEC_VCMUL "")
+				(UNSPEC_VCMUL180 "")
+				(UNSPEC_VCMLS "_rot270")
+				(UNSPEC_VCMLS180 "_rot90")])
+
+(define_int_attr mve_rotsplit2 [(UNSPEC_VCMLA "_rot90")
+				(UNSPEC_VCMLA180 "_rot270")
+				(UNSPEC_VCMUL "_rot90")
+				(UNSPEC_VCMUL180 "_rot270")
+				(UNSPEC_VCMLS "_rot180")
+				(UNSPEC_VCMLS180 "_rot180")])
+
+
 (define_int_attr fcmac1 [(UNSPEC_VCMLA "a") (UNSPEC_VCMLA180 "a")
			 (UNSPEC_VCMLS "s") (UNSPEC_VCMLS180 "s")])
 
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 465b39a51b3a258295ed764f0e742932e5d59225..b8cd74176a41572008d86e6a074a626ccf35a69c 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -42,7 +42,7 @@ (define_c_enum "unspec" [VST4Q VRNDXQ_F VRNDQ_F VRNDPQ_F VRNDNQ_F VRNDMQ_F
	 VCVTQ_N_FROM_F_S VCVTQ_N_FROM_F_U VADDLVQ_P_S
	 VADDLVQ_P_U VCMPNEQ_U VCMPNEQ_S VSHLQ_S VSHLQ_U VABDQ_S
	 VADDQ_N_S VADDVAQ_S VADDVQ_P_S VANDQ_S VBICQ_S
-	 VBRSRQ_N_S VCADDQ_ROT270_S VCADDQ_ROT90_S VCMPEQQ_S
+	 VBRSRQ_N_S VCMPEQQ_S
	 VCMPEQQ_N_S VCMPNEQ_N_S VEORQ_S VHADDQ_S VHADDQ_N_S
	 VHSUBQ_S VHSUBQ_N_S VMAXQ_S VMAXVQ_S VMINQ_S VMINVQ_S
	 VMLADAVQ_S VMULHQ_S VMULLBQ_INT_S VMULLTQ_INT_S VMULQ_S
@@ -51,7 +51,7 @@ (define_c_enum "unspec" [VST4Q VRNDXQ_F VRNDQ_F VRNDPQ_F VRNDNQ_F VRNDMQ_F
	 VQSUBQ_N_S VRHADDQ_S VRMULHQ_S VRSHLQ_S VRSHLQ_N_S
	 VRSHRQ_N_S VSHLQ_N_S VSHLQ_R_S VSUBQ_S VSUBQ_N_S
	 VABDQ_U VADDQ_N_U VADDVAQ_U VADDVQ_P_U VANDQ_U VBICQ_U
-	 VBRSRQ_N_U VCADDQ_ROT270_U VCADDQ_ROT90_U VCMPEQQ_U
+	 VBRSRQ_N_U VCMPEQQ_U
	 VCMPEQQ_N_U VCMPNEQ_N_U VEORQ_U VHADDQ_U VHADDQ_N_U
	 VHSUBQ_U VHSUBQ_N_U VMAXQ_U VMAXVQ_U VMINQ_U VMINVQ_U
	 VMLADAVQ_U VMULHQ_U VMULLBQ_INT_U VMULLTQ_INT_U VMULQ_U
@@ -66,10 +66,9 @@ (define_c_enum "unspec" [VST4Q VRNDXQ_F VRNDQ_F VRNDPQ_F VRNDNQ_F VRNDMQ_F
	 VQDMULHQ_S VQRDMULHQ_N_S VQRDMULHQ_S VQSHLUQ_N_S
	 VCMPCSQ_N_U VCMPCSQ_U VCMPHIQ_N_U VCMPHIQ_U VABDQ_M_S
	 VABDQ_M_U VABDQ_F VADDQ_N_F VANDQ_F VBICQ_F
-	 VCADDQ_ROT270_F VCADDQ_ROT90_F VCMPEQQ_F VCMPEQQ_N_F
+	 VCMPEQQ_F VCMPEQQ_N_F
	 VCMPGEQ_F VCMPGEQ_N_F VCMPGTQ_F VCMPGTQ_N_F VCMPLEQ_F
	 VCMPLEQ_N_F VCMPLTQ_F VCMPLTQ_N_F VCMPNEQ_F VCMPNEQ_N_F
-	 VCMULQ_F VCMULQ_ROT180_F VCMULQ_ROT270_F VCMULQ_ROT90_F
	 VEORQ_F VMAXNMAQ_F VMAXNMAVQ_F VMAXNMQ_F VMAXNMVQ_F
	 VMINNMAQ_F VMINNMAVQ_F VMINNMQ_F VMINNMVQ_F VMULQ_F
	 VMULQ_N_F VORNQ_F VORRQ_F VSUBQ_F VADDLVAQ_U
@@ -112,18 +111,18 @@ (define_c_enum "unspec" [VST4Q VRNDXQ_F VRNDQ_F VRNDPQ_F VRNDNQ_F VRNDMQ_F
	 VMLSDAVAXQ_S VMLSDAVAQ_S VMLADAVAXQ_S
	 VCMPGEQ_M_F VCMPGTQ_M_N_F VMLSLDAVQ_P_S VRMLALDAVHAXQ_S
	 VMLSLDAVXQ_P_S VFMAQ_F VMLSLDAVAQ_S VQSHRUNBQ_N_S
-	 VQRSHRUNTQ_N_S VCMLAQ_F VMINNMAQ_M_F VFMASQ_N_F
+	 VQRSHRUNTQ_N_S VMINNMAQ_M_F VFMASQ_N_F
	 VDUPQ_M_N_F VCMPGTQ_M_F VCMPLTQ_M_F VRMLSLDAVHQ_P_S
	 VQSHRUNTQ_N_S VABSQ_M_F VMAXNMAVQ_P_F VFMAQ_N_F
	 VRMLSLDAVHXQ_P_S VREV32Q_M_F VRMLSLDAVHAQ_S
	 VRMLSLDAVHAXQ_S VCMPLTQ_M_N_F VCMPNEQ_M_F VRNDAQ_M_F
	 VRNDPQ_M_F VADDLVAQ_P_S VQMOVUNBQ_M_S VCMPLEQ_M_F
-	 VCMLAQ_ROT180_F VMLSLDAVAXQ_S VRNDXQ_M_F VFMSQ_F
-	 VMINNMVQ_P_F VMAXNMVQ_P_F VPSELQ_F VCMLAQ_ROT90_F
+	 VMLSLDAVAXQ_S VRNDXQ_M_F VFMSQ_F
+	 VMINNMVQ_P_F VMAXNMVQ_P_F VPSELQ_F
	 VQMOVUNTQ_M_S VREV64Q_M_F VNEGQ_M_F VRNDMQ_M_F
	 VCMPLEQ_M_N_F VCMPGEQ_M_N_F VRNDNQ_M_F VMINNMAVQ_P_F
	 VCMPNEQ_M_N_F VRMLALDAVHQ_P_S VRMLALDAVHXQ_P_S
-	 VCMPEQQ_M_N_F VCMLAQ_ROT270_F VMAXNMAQ_M_F VRNDQ_M_F
+	 VCMPEQQ_M_N_F VMAXNMAQ_M_F VRNDQ_M_F
	 VMLALDAVQ_P_U VMLALDAVQ_P_S VQMOVNBQ_M_S VQMOVNBQ_M_U
	 VMOVLTQ_M_U VMOVLTQ_M_S VMOVNBQ_M_U VMOVNBQ_M_S
	 VRSHRNTQ_N_U VRSHRNTQ_N_S VORRQ_M_N_S VORRQ_M_N_U
@@ -240,9 +239,8 @@ (define_int_attr supf [(VCVTQ_TO_F_S "s") (VCVTQ_TO_F_U "u") (VREV16Q_S "s")
		       (VABDQ_U "u") (VADDQ_N_S "s") (VADDQ_N_U "u")
		       (VADDVQ_P_S "s") (VADDVQ_P_U "u") (VANDQ_S "s")
		       (VANDQ_U "u") (VBICQ_S "s") (VBICQ_U "u")
-		       (VBRSRQ_N_S "s") (VBRSRQ_N_U "u") (VCADDQ_ROT270_S "s")
-		       (VCADDQ_ROT270_U "u") (VCADDQ_ROT90_S "s")
-		       (VCMPEQQ_S "s") (VCMPEQQ_U "u") (VCADDQ_ROT90_U "u")
+		       (VBRSRQ_N_S "s") (VBRSRQ_N_U "u")
+		       (VCMPEQQ_S "s") (VCMPEQQ_U "u")
		       (VCMPEQQ_N_S "s") (VCMPEQQ_N_U "u") (VCMPNEQ_N_S "s")
		       (VCMPNEQ_N_U "u") (VEORQ_S "s") (VEORQ_U "u")
		       (VHADDQ_N_S "s") (VHADDQ_N_U "u") (VHADDQ_S "s")
@@ -421,6 +419,19 @@ (define_mode_attr V_extr_elem [(V16QI "u8") (V8HI "u16") (V4SI "32")
 (define_mode_attr earlyclobber_32 [(V16QI "=w") (V8HI "=w") (V4SI "=&w")
				   (V8HF "=w") (V4SF "=&w")])
 
+(define_int_attr mve_rot [(UNSPEC_VCADD90 "_rot90")
+			  (UNSPEC_VCADD270 "_rot270")
+			  (UNSPEC_VCMLA "")
+			  (UNSPEC_VCMLA90 "_rot90")
+			  (UNSPEC_VCMLA180 "_rot180")
+			  (UNSPEC_VCMLA270 "_rot270")
+			  (UNSPEC_VCMUL "")
+			  (UNSPEC_VCMUL90 "_rot90")
+			  (UNSPEC_VCMUL180 "_rot180")
+			  (UNSPEC_VCMUL270 "_rot270")])
+
+(define_int_iterator VCMUL [UNSPEC_VCMUL UNSPEC_VCMUL90 UNSPEC_VCMUL180 UNSPEC_VCMUL270])
+
 (define_int_iterator VCVTQ_TO_F [VCVTQ_TO_F_S VCVTQ_TO_F_U])
 (define_int_iterator VMVNQ_N [VMVNQ_N_U VMVNQ_N_S])
 (define_int_iterator VREV64Q [VREV64Q_S VREV64Q_U])
@@ -454,8 +465,6 @@ (define_int_iterator VADDVQ_P [VADDVQ_P_U VADDVQ_P_S])
 (define_int_iterator VANDQ [VANDQ_U VANDQ_S])
 (define_int_iterator VBICQ [VBICQ_S VBICQ_U])
 (define_int_iterator VBRSRQ_N [VBRSRQ_N_U VBRSRQ_N_S])
-(define_int_iterator VCADDQ_ROT270 [VCADDQ_ROT270_S VCADDQ_ROT270_U])
-(define_int_iterator VCADDQ_ROT90 [VCADDQ_ROT90_U VCADDQ_ROT90_S])
 (define_int_iterator VCMPEQQ [VCMPEQQ_U VCMPEQQ_S])
 (define_int_iterator VCMPEQQ_N [VCMPEQQ_N_S VCMPEQQ_N_U])
 (define_int_iterator VCMPNEQ_N [VCMPNEQ_N_U VCMPNEQ_N_S])
@@ -1585,34 +1594,28 @@ (define_insn "mve_vbrsrq_n_<supf><mode>"
 ])
 
 ;;
-;; [vcaddq_rot270_s, vcaddq_rot270_u])
+;; [vcaddq, vcaddq_rot90, vcadd_rot180, vcadd_rot270])
 ;;
-(define_insn "mve_vcaddq_rot270_<supf><mode>"
+(define_insn "mve_vcaddq<mve_rot><mode>"
  [
   (set (match_operand:MVE_2 0 "s_register_operand" "<earlyclobber_32>")
	(unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "w")
		       (match_operand:MVE_2 2 "s_register_operand" "w")]
-	 VCADDQ_ROT270))
+	 VCADD))
  ]
  "TARGET_HAVE_MVE"
-  "vcadd.i%#<V_sz_elem> %q0, %q1, %q2, #270"
+  "vcadd.i%#<V_sz_elem> %q0, %q1, %q2, #<rot>"
  [(set_attr "type" "mve_move")
 ])
 
-;;
-;; [vcaddq_rot90_u, vcaddq_rot90_s])
-;;
-(define_insn "mve_vcaddq_rot90_<supf><mode>"
- [
-  (set (match_operand:MVE_2 0 "s_register_operand" "<earlyclobber_32>")
-	(unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "w")
-		       (match_operand:MVE_2 2 "s_register_operand" "w")]
-	 VCADDQ_ROT90))
- ]
+;; Auto vectorizer pattern for int vcadd
+(define_expand "cadd<rot><mode>3"
+  [(set (match_operand:MVE_2 0 "register_operand")
+	(unspec:MVE_2 [(match_operand:MVE_2 1 "register_operand")
+		       (match_operand:MVE_2 2 "register_operand")]
+	 VCADD))]
  "TARGET_HAVE_MVE"
-  "vcadd.i%#<V_sz_elem> %q0, %q1, %q2, #90"
-  [(set_attr "type" "mve_move")
-])
+)
 
 ;;
 ;; [vcmpcsq_n_u])
@@ -2665,32 +2668,17 @@ (define_insn "mve_vbicq_n_<supf><mode>"
 ])
 
 ;;
-;; [vcaddq_rot270_f])
+;; [vcaddq, vcaddq_rot90, vcadd_rot180, vcadd_rot270])
 ;;
-(define_insn "mve_vcaddq_rot270_f<mode>"
+(define_insn "mve_vcaddq<mve_rot><mode>"
  [
   (set (match_operand:MVE_0 0 "s_register_operand" "<earlyclobber_32>")
	(unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w")
		       (match_operand:MVE_0 2 "s_register_operand" "w")]
-	 VCADDQ_ROT270_F))
+	 VCADD))
  ]
  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
-  "vcadd.f%#<V_sz_elem> %q0, %q1, %q2, #270"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vcaddq_rot90_f])
-;;
-(define_insn "mve_vcaddq_rot90_f<mode>"
- [
-  (set (match_operand:MVE_0 0 "s_register_operand" "<earlyclobber_32>")
-	(unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w")
-		       (match_operand:MVE_0 2 "s_register_operand" "w")]
-	 VCADDQ_ROT90_F))
- ]
-  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
-  "vcadd.f%#<V_sz_elem> %q0, %q1, %q2, #90"
+  "vcadd.f%#<V_sz_elem> %q0, %q1, %q2, #<rot>"
  [(set_attr "type" "mve_move")
 ])
 
@@ -2875,62 +2863,17 @@ (define_insn "mve_vcmpneq_n_f<mode>"
 ])
 
 ;;
-;; [vcmulq_f])
-;;
-(define_insn "mve_vcmulq_f<mode>"
- [
-  (set (match_operand:MVE_0 0 "s_register_operand" "<earlyclobber_32>")
-	(unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w")
-		       (match_operand:MVE_0 2 "s_register_operand" "w")]
-	 VCMULQ_F))
- ]
-  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
-  "vcmul.f%#<V_sz_elem> %q0, %q1, %q2, #0"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vcmulq_rot180_f])
-;;
-(define_insn "mve_vcmulq_rot180_f<mode>"
- [
-  (set (match_operand:MVE_0 0 "s_register_operand" "<earlyclobber_32>")
-	(unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w")
-		       (match_operand:MVE_0 2 "s_register_operand" "w")]
-	 VCMULQ_ROT180_F))
- ]
-  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
-  "vcmul.f%#<V_sz_elem> %q0, %q1, %q2, #180"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vcmulq_rot270_f])
-;;
-(define_insn "mve_vcmulq_rot270_f<mode>"
- [
-  (set (match_operand:MVE_0 0 "s_register_operand" "<earlyclobber_32>")
-	(unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w")
-		       (match_operand:MVE_0 2 "s_register_operand" "w")]
-	 VCMULQ_ROT270_F))
- ]
-  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
-  "vcmul.f%#<V_sz_elem> %q0, %q1, %q2, #270"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vcmulq_rot90_f])
+;; [vcmulq, vcmulq_rot90, vcmulq_rot180, vcmulq_rot270])
 ;;
-(define_insn "mve_vcmulq_rot90_f<mode>"
+(define_insn "mve_vcmulq<mve_rot><mode>"
  [
   (set (match_operand:MVE_0 0 "s_register_operand" "<earlyclobber_32>")
	(unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "w")
		       (match_operand:MVE_0 2 "s_register_operand" "w")]
-	 VCMULQ_ROT90_F))
+	 VCMUL))
  ]
  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
-  "vcmul.f%#<V_sz_elem> %q0, %q1, %q2, #90"
+  "vcmul.f%#<V_sz_elem> %q0, %q1, %q2, #<rot>"
  [(set_attr "type" "mve_move")
 ])
 
@@ -4692,66 +4635,20 @@ (define_insn "mve_vaddlvaq_p_<supf>v4si"
  [(set_attr "type" "mve_move")
   (set_attr "length""8")])
 ;;
-;; [vcmlaq_f])
-;;
-(define_insn "mve_vcmlaq_f<mode>"
- [
-  (set (match_operand:MVE_0 0 "s_register_operand" "=w")
-	(unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0")
-		       (match_operand:MVE_0 2 "s_register_operand" "w")
-		       (match_operand:MVE_0 3 "s_register_operand" "w")]
-	 VCMLAQ_F))
- ]
-  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
-  "vcmla.f%#<V_sz_elem> %q0, %q2, %q3, #0"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vcmlaq_rot180_f])
-;;
-(define_insn "mve_vcmlaq_rot180_f<mode>"
- [
-  (set (match_operand:MVE_0 0 "s_register_operand" "=w")
-	(unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0")
-		       (match_operand:MVE_0 2 "s_register_operand" "w")
-		       (match_operand:MVE_0 3 "s_register_operand" "w")]
-	 VCMLAQ_ROT180_F))
- ]
-  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
-  "vcmla.f%#<V_sz_elem> %q0, %q2, %q3, #180"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vcmlaq_rot270_f])
-;;
-(define_insn "mve_vcmlaq_rot270_f<mode>"
- [
-  (set (match_operand:MVE_0 0 "s_register_operand" "=w")
-	(unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0")
-		       (match_operand:MVE_0 2 "s_register_operand" "w")
-		       (match_operand:MVE_0 3 "s_register_operand" "w")]
-	 VCMLAQ_ROT270_F))
- ]
-  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
-  "vcmla.f%#<V_sz_elem> %q0, %q2, %q3, #270"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vcmlaq_rot90_f])
+;; [vcmlaq, vcmlaq_rot90, vcmlaq_rot180, vcmlaq_rot270])
 ;;
-(define_insn "mve_vcmlaq_rot90_f<mode>"
+(define_insn "mve_vcmlaq<mve_rot><mode>"
  [
-  (set (match_operand:MVE_0 0 "s_register_operand" "=w")
-	(unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0")
-		       (match_operand:MVE_0 2 "s_register_operand" "w")
-		       (match_operand:MVE_0 3 "s_register_operand" "w")]
-	 VCMLAQ_ROT90_F))
+  (set (match_operand:MVE_0 0 "s_register_operand" "=w,w")
+	(unspec:MVE_0 [(match_operand:MVE_0 1 "s_register_operand" "0,Dz")
+		       (match_operand:MVE_0 2 "s_register_operand" "w,w")
+		       (match_operand:MVE_0 3 "s_register_operand" "w,w")]
+	 VCMLA))
  ]
  "TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT"
-  "vcmla.f%#<V_sz_elem> %q0, %q2, %q3, #90"
+  "@
+   vcmla.f%#<V_sz_elem> %q0, %q2, %q3, #<rot>
+   vcmul.f%#<V_sz_elem> %q0, %q2, %q3, #<rot>"
  [(set_attr "type" "mve_move")
 ])
 
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 2ccbf99883ec6a4808f16453e3d07454f3e3077e..d87ae8adbf1292bebbb9046e67038fcaa5f23040 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -3174,14 +3174,6 @@ (define_insn "neon_vcadd<rot><mode>"
  [(set_attr "type" "neon_fcadd")]
 )
 
-(define_expand "cadd<rot><mode>3"
-  [(set (match_operand:VF 0 "register_operand")
-	(unspec:VF [(match_operand:VF 1 "register_operand")
-		    (match_operand:VF 2 "register_operand")]
-	  VCADD))]
-  "TARGET_COMPLEX"
-)
-
 (define_insn "neon_vcmla<rot><mode>"
   [(set (match_operand:VF 0 "register_operand" "=w")
	(plus:VF (match_operand:VF 1 "register_operand" "0")
@@ -3238,32 +3230,13 @@ (define_insn "neon_vcmlaq_lane<rot><mode>"
  [(set_attr "type" "neon_fcmla")]
 )
 
-
-;; The complex mla/mls operations always need to expand to two instructions.
-;; The first operation does half the computation and the second does the
-;; remainder.  Because of this, expand early.
-(define_expand "cml<fcmac1><rot_op><mode>4"
-  [(set (match_operand:VF 0 "register_operand")
-	(plus:VF (match_operand:VF 1 "register_operand")
-		 (unspec:VF [(match_operand:VF 2 "register_operand")
-			     (match_operand:VF 3 "register_operand")]
-			    VCMLA_OP)))]
-  "TARGET_COMPLEX"
-{
-  emit_insn (gen_neon_vcmla<rotsplit1><mode> (operands[0], operands[1],
-					      operands[2], operands[3]));
-  emit_insn (gen_neon_vcmla<rotsplit2><mode> (operands[0], operands[0],
-					      operands[2], operands[3]));
-  DONE;
-})
-
 ;; The complex mul operations always need to expand to two instructions.
 ;; The first operation does half the computation and the second does the
 ;; remainder.  Because of this, expand early.
 (define_expand "cmul<rot_op><mode>3"
-  [(set (match_operand:VF 0 "register_operand")
-	(unspec:VF [(match_operand:VF 1 "register_operand")
-		    (match_operand:VF 2 "register_operand")]
+  [(set (match_operand:VDF 0 "register_operand")
+	(unspec:VDF [(match_operand:VDF 1 "register_operand")
+		     (match_operand:VDF 2 "register_operand")]
	  VCMUL_OP))]
  "TARGET_COMPLEX"
 {
@@ -3276,6 +3249,7 @@ (define_expand "cmul<rot_op><mode>3"
   DONE;
 })
 
+
 ;; These instructions map to the __builtins for the Dot Product operations.
 (define_insn "neon_<sup>dot<vsi2qi>"
   [(set (match_operand:VCVTI 0 "register_operand" "=w")
diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
index d1b2824a0fe76f62d69c18dcec2f47dfb75b586e..1251aace01b42d393a40467906c610c45de0412a 100644
--- a/gcc/config/arm/unspecs.md
+++ b/gcc/config/arm/unspecs.md
@@ -511,7 +511,9 @@ (define_c_enum "unspec" [
   UNSPEC_VCMLA180
   UNSPEC_VCMLA270
   UNSPEC_VCMUL
+  UNSPEC_VCMUL90
   UNSPEC_VCMUL180
+  UNSPEC_VCMUL270
   UNSPEC_VCMLS
   UNSPEC_VCMLS180
   UNSPEC_MATMUL_S
diff --git a/gcc/config/arm/vec-common.md b/gcc/config/arm/vec-common.md
index c3c86c46355e6ace6c90e189b4160dbe4cd9caf3..8affffb68e6928bc4210656134677ad5f2915426 100644
--- a/gcc/config/arm/vec-common.md
+++ b/gcc/config/arm/vec-common.md
@@ -191,3 +191,70 @@ (define_expand "vec_set<mode>"
				       GEN_INT (elem), operands[0]));
   DONE;
 })
+
+(define_expand "cadd<rot><mode>3"
+  [(set (match_operand:VF 0 "register_operand")
+	(unspec:VF [(match_operand:VF 1 "register_operand")
+		    (match_operand:VF 2 "register_operand")]
+	  VCADD))]
+  "TARGET_COMPLEX || (TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT
+		      && ARM_HAVE_NEON_<MODE>_ARITH)"
+)
+
+;; The complex mul operations always need to expand to two instructions.
+;; The first operation does half the computation and the second does the
+;; remainder.  Because of this, expand early.
+(define_expand "cmul<rot_op><mode>3"
+  [(set (match_operand:VQ_HSF 0 "register_operand")
+	(unspec:VQ_HSF [(match_operand:VQ_HSF 1 "register_operand")
+			(match_operand:VQ_HSF 2 "register_operand")]
+	  VCMUL_OP))]
+  "TARGET_COMPLEX || (TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT)"
+{
+  if (TARGET_COMPLEX)
+    {
+      rtx tmp = gen_reg_rtx (<MODE>mode);
+      emit_move_insn (tmp, CONST0_RTX (<MODE>mode));
+      emit_insn (gen_neon_vcmla<rotsplit1><mode> (operands[0], tmp,
+						  operands[1], operands[2]));
+      emit_insn (gen_neon_vcmla<rotsplit2><mode> (operands[0], operands[0],
+						  operands[1], operands[2]));
+    }
+  else
+    {
+      emit_insn (gen_mve_vcmulq<mve_rotsplit1><mode> (operands[0], operands[1],
+						      operands[2]));
+      emit_insn (gen_mve_vcmulq<mve_rotsplit2><mode> (operands[0], operands[1],
+						      operands[2]));
+    }
+  DONE;
+})
+
+(define_expand "arm_vcmla<rot><mode>"
+  [(set (match_operand:VF 0 "register_operand")
+	(plus:VF (match_operand:VF 1 "register_operand")
+		 (unspec:VF [(match_operand:VF 2 "register_operand")
+			     (match_operand:VF 3 "register_operand")]
+			    VCMLA)))]
+  "TARGET_COMPLEX || (TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT
+		      && ARM_HAVE_NEON_<MODE>_ARITH)"
+)
+
+;; The complex mla/mls operations always need to expand to two instructions.
+;; The first operation does half the computation and the second does the
+;; remainder.  Because of this, expand early.
+(define_expand "cml<fcmac1><rot_op><mode>4"
+  [(set (match_operand:VF 0 "register_operand")
+	(plus:VF (match_operand:VF 1 "register_operand")
+		 (unspec:VF [(match_operand:VF 2 "register_operand")
+			     (match_operand:VF 3 "register_operand")]
+			    VCMLA_OP)))]
+  "TARGET_COMPLEX || (TARGET_HAVE_MVE && TARGET_HAVE_MVE_FLOAT
+		      && ARM_HAVE_NEON_<MODE>_ARITH)"
+{
+  emit_insn (gen_arm_vcmla<rotsplit1><mode> (operands[0], operands[1],
+					     operands[2], operands[3]));
+  emit_insn (gen_arm_vcmla<rotsplit2><mode> (operands[0], operands[0],
+					     operands[2], operands[3]));
+  DONE;
+})