Hi,
On Thu, 27 Feb 2020 at 18:03, Kyrill Tkachov <kyrylo.tkac...@foss.arm.com> wrote: > > Hi Mihail, > > On 2/27/20 2:44 PM, Mihail Ionescu wrote: > > Hi Kyrill, > > > > On 02/27/2020 11:09 AM, Kyrill Tkachov wrote: > >> Hi Mihail, > >> > >> On 2/27/20 10:27 AM, Mihail Ionescu wrote: > >>> Hi, > >>> > >>> This patch adds support for the bf16 vector create, get, set, > >>> duplicate and reinterpret intrinsics. > >>> ACLE documents are at https://developer.arm.com/docs/101028/latest > >>> ISA documents are at https://developer.arm.com/docs/ddi0596/latest > >>> > >>> Regression tested on arm-none-eabi. > >>> > >>> > >>> gcc/ChangeLog: > >>> > >>> 2020-02-27 Mihail Ionescu <mihail.ione...@arm.com> > >>> > >>> * (__ARM_NUM_LANES, __arm_lane, __arm_lane_q): Move to the > >>> beginning of the file. > >>> (vcreate_bf16, vcombine_bf16): New. > >>> (vdup_n_bf16, vdupq_n_bf16): New. > >>> (vdup_lane_bf16, vdup_laneq_bf16): New. > >>> (vdupq_lane_bf16, vdupq_laneq_bf16): New. > >>> (vduph_lane_bf16, vduph_laneq_bf16): New. > >>> (vset_lane_bf16, vsetq_lane_bf16): New. > >>> (vget_lane_bf16, vgetq_lane_bf16): New. > >>> (vget_high_bf16, vget_low_bf16): New. > >>> (vreinterpret_bf16_u8, vreinterpretq_bf16_u8): New. > >>> (vreinterpret_bf16_u16, vreinterpretq_bf16_u16): New. > >>> (vreinterpret_bf16_u32, vreinterpretq_bf16_u32): New. > >>> (vreinterpret_bf16_u64, vreinterpretq_bf16_u64): New. > >>> (vreinterpret_bf16_s8, vreinterpretq_bf16_s8): New. > >>> (vreinterpret_bf16_s16, vreinterpretq_bf16_s16): New. > >>> (vreinterpret_bf16_s32, vreinterpretq_bf16_s32): New. > >>> (vreinterpret_bf16_s64, vreinterpretq_bf16_s64): New. > >>> (vreinterpret_bf16_p8, vreinterpretq_bf16_p8): New. > >>> (vreinterpret_bf16_p16, vreinterpretq_bf16_p16): New. > >>> (vreinterpret_bf16_p64, vreinterpretq_bf16_p64): New. > >>> (vreinterpret_bf16_f32, vreinterpretq_bf16_f32): New. > >>> (vreinterpret_bf16_f64, vreinterpretq_bf16_f64): New. > >>> (vreinterpretq_bf16_p128): New. 
> >>> (vreinterpret_s8_bf16, vreinterpretq_s8_bf16): New. > >>> (vreinterpret_s16_bf16, vreinterpretq_s16_bf16): New. > >>> (vreinterpret_s32_bf16, vreinterpretq_s32_bf16): New. > >>> (vreinterpret_s64_bf16, vreinterpretq_s64_bf16): New. > >>> (vreinterpret_u8_bf16, vreinterpretq_u8_bf16): New. > >>> (vreinterpret_u16_bf16, vreinterpretq_u16_bf16): New. > >>> (vreinterpret_u32_bf16, vreinterpretq_u32_bf16): New. > >>> (vreinterpret_u64_bf16, vreinterpretq_u64_bf16): New. > >>> (vreinterpret_p8_bf16, vreinterpretq_p8_bf16): New. > >>> (vreinterpret_p16_bf16, vreinterpretq_p16_bf16): New. > >>> (vreinterpret_p64_bf16, vreinterpretq_p64_bf16): New. > >>> (vreinterpret_f32_bf16, vreinterpretq_f32_bf16): New. > >>> (vreinterpretq_p128_bf16): New. > >>> * config/arm/arm_neon_builtins.def (VDX): Add V4BF. > >>> (V_elem): Likewise. > >>> (V_elem_l): Likewise. > >>> (VD_LANE): Likewise. > >>> (VQX) Add V8BF. > >>> (V_DOUBLE): Likewise. > >>> (VDQX): Add V4BF and V8BF. > >>> (V_two_elem, V_three_elem, V_four_elem): Likewise. > >>> (V_reg): Likewise. > >>> (V_HALF): Likewise. > >>> (V_double_vector_mode): Likewise. > >>> (V_cmp_result): Likewise. > >>> (V_uf_sclr): Likewise. > >>> (V_sz_elem): Likewise. > >>> (Is_d_reg): Likewise. > >>> (V_mode_nunits): Likewise. > >>> * config/arm/neon.md (neon_vdup_lane): Enable for BFloat. > >>> > >>> gcc/testsuite/ChangeLog: > >>> > >>> 2020-02-27 Mihail Ionescu <mihail.ione...@arm.com> > >>> > >>> * gcc.target/arm/bf16_dup.c: New test. > >>> * gcc.target/arm/bf16_reinterpret.c: Likewise. > >>> > >>> Is it ok for trunk? > >> > >> This looks mostly ok with a few nits... 
> >> > >> > >>> > >>> Regards, > >>> Mihail > >>> > >>> > >>> ############### Attachment also inlined for ease of reply > >>> ############### > >>> > >>> > >>> diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h > >>> index > >>> 09297831cdcd6e695843c17b7724c114f3a129fe..5901a8f1fb84f204ae95f0ccc97bf5ae944c482c > >>> 100644 > >>> --- a/gcc/config/arm/arm_neon.h > >>> +++ b/gcc/config/arm/arm_neon.h > >>> @@ -42,6 +42,15 @@ extern "C" { > >>> #include <arm_bf16.h> > >>> #include <stdint.h> > >>> > >>> +#ifdef __ARM_BIG_ENDIAN > >>> +#define __ARM_NUM_LANES(__v) (sizeof (__v) / sizeof (__v[0])) > >>> +#define __arm_lane(__vec, __idx) (__idx ^ (__ARM_NUM_LANES(__vec) - > >>> 1)) > >>> +#define __arm_laneq(__vec, __idx) (__idx ^ > >>> (__ARM_NUM_LANES(__vec)/2 - 1)) > >>> +#else > >>> +#define __arm_lane(__vec, __idx) __idx > >>> +#define __arm_laneq(__vec, __idx) __idx > >>> +#endif > >>> + > >>> typedef __simd64_int8_t int8x8_t; > >>> typedef __simd64_int16_t int16x4_t; > >>> typedef __simd64_int32_t int32x2_t; > >>> @@ -6147,14 +6156,6 @@ vget_lane_s32 (int32x2_t __a, const int __b) > >>> /* For big-endian, GCC's vector indices are reversed within each 64 > >>> bits compared to the architectural lane indices used by Neon > >>> intrinsics. */ > >> > >> > >> Please move this comment as well. 
> >> > >> > >>> -#ifdef __ARM_BIG_ENDIAN > >>> -#define __ARM_NUM_LANES(__v) (sizeof (__v) / sizeof (__v[0])) > >>> -#define __arm_lane(__vec, __idx) (__idx ^ (__ARM_NUM_LANES(__vec) - > >>> 1)) > >>> -#define __arm_laneq(__vec, __idx) (__idx ^ > >>> (__ARM_NUM_LANES(__vec)/2 - 1)) > >>> -#else > >>> -#define __arm_lane(__vec, __idx) __idx > >>> -#define __arm_laneq(__vec, __idx) __idx > >>> -#endif > >>> > >>> #define vget_lane_f16(__v, __idx) \ > >>> __extension__ \ > >>> @@ -14476,6 +14477,15 @@ vreinterpret_p16_u32 (uint32x2_t __a) > >>> #if defined (__ARM_FP16_FORMAT_IEEE) || defined > >>> (__ARM_FP16_FORMAT_ALTERNATIVE) > >>> __extension__ extern __inline float16x4_t > >>> __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpret_f16_bf16 (bfloat16x4_t __a) > >>> +{ > >>> + return (float16x4_t) __a; > >>> +} > >>> +#endif > >>> + > >>> +#if defined (__ARM_FP16_FORMAT_IEEE) || defined > >>> (__ARM_FP16_FORMAT_ALTERNATIVE) > >>> +__extension__ extern __inline float16x4_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> vreinterpret_f16_p8 (poly8x8_t __a) > >>> { > >>> return (float16x4_t) __a; > >>> @@ -15688,6 +15698,15 @@ vreinterpretq_f16_p16 (poly16x8_t __a) > >>> #if defined (__ARM_FP16_FORMAT_IEEE) || defined > >>> (__ARM_FP16_FORMAT_ALTERNATIVE) > >>> __extension__ extern __inline float16x8_t > >>> __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpretq_f16_bf16 (bfloat16x8_t __a) > >>> +{ > >>> + return (float16x8_t) __a; > >>> +} > >>> +#endif > >>> + > >>> +#if defined (__ARM_FP16_FORMAT_IEEE) || defined > >>> (__ARM_FP16_FORMAT_ALTERNATIVE) > >>> +__extension__ extern __inline float16x8_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> vreinterpretq_f16_f32 (float32x4_t __a) > >>> { > >>> return (float16x8_t) __a; > >>> @@ -18750,6 +18769,492 @@ vcmlaq_rot270_laneq_f32 (float32x4_t __r, > >>> float32x4_t __a, float32x4_t 
__b, > >>> #pragma GCC push_options > >>> #pragma GCC target ("arch=armv8.2-a+bf16") > >>> > >>> +__extension__ extern __inline bfloat16x4_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vcreate_bf16 (uint64_t __a) > >>> +{ > >>> + return (bfloat16x4_t) __a; > >>> +} > >>> + > >>> +__extension__ extern __inline bfloat16x4_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vdup_n_bf16 (bfloat16_t __a) > >>> +{ > >>> + return __builtin_neon_vdup_nv4bf (__a); > >>> +} > >>> + > >>> +__extension__ extern __inline bfloat16x8_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vdupq_n_bf16 (bfloat16_t __a) > >>> +{ > >>> + return __builtin_neon_vdup_nv8bf (__a); > >>> +} > >>> + > >>> +__extension__ extern __inline bfloat16x4_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vdup_lane_bf16 (bfloat16x4_t __a, const int __b) > >>> +{ > >>> + return __builtin_neon_vdup_lanev4bf (__a, __b); > >>> +} > >>> + > >>> +__extension__ extern __inline bfloat16x8_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vdupq_lane_bf16 (bfloat16x4_t __a, const int __b) > >>> +{ > >>> + return __builtin_neon_vdup_lanev8bf (__a, __b); > >>> +} > >>> + > >>> +#define vset_lane_bf16(__e, __v, __idx) \ > >>> + __extension__ \ > >>> + ({ \ > >>> + bfloat16_t __elem = (__e); \ > >>> + bfloat16x4_t __vec = (__v); \ > >>> + __builtin_arm_lane_check (4, __idx); \ > >>> + __vec[__arm_lane(__vec, __idx)] = __elem; \ > >>> + __vec; \ > >>> + }) > >>> + > >>> +#define vsetq_lane_bf16(__e, __v, __idx) \ > >>> + __extension__ \ > >>> + ({ \ > >>> + bfloat16_t __elem = (__e); \ > >>> + bfloat16x8_t __vec = (__v); \ > >>> + __builtin_arm_lane_check (8, __idx); \ > >>> + __vec[__arm_laneq(__vec, __idx)] = __elem; \ > >>> + __vec; \ > >>> + }) > >>> + > >>> +#define vget_lane_bf16(__v, __idx) \ > >>> + __extension__ \ > >>> + ({ \ > >>> + 
bfloat16x4_t __vec = (__v); \ > >>> + __builtin_arm_lane_check (4, __idx); \ > >>> + bfloat16_t __res = __vec[__arm_lane(__vec, __idx)]; \ > >>> + __res; \ > >>> + }) > >>> + > >>> +#define vgetq_lane_bf16(__v, __idx) \ > >>> + __extension__ \ > >>> + ({ \ > >>> + bfloat16x8_t __vec = (__v); \ > >>> + __builtin_arm_lane_check (8, __idx); \ > >>> + bfloat16_t __res = __vec[__arm_laneq(__vec, __idx)]; \ > >>> + __res; \ > >>> + }) > >>> + > >>> +__extension__ extern __inline bfloat16x4_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vdup_laneq_bf16 (bfloat16x8_t __a, const int __b) > >>> +{ > >>> + return vdup_n_bf16( vgetq_lane_bf16 (__a, __b)); > >>> +} > >>> + > >>> +__extension__ extern __inline bfloat16x8_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vdupq_laneq_bf16 (bfloat16x8_t __a, const int __b) > >>> +{ > >>> + return vdupq_n_bf16( vgetq_lane_bf16 (__a, __b)); > >>> +} > >>> + > >>> +__extension__ extern __inline bfloat16_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vduph_lane_bf16 (bfloat16x4_t __a, const int __b) > >>> +{ > >>> + return vget_lane_bf16 (__a, __b); > >>> +} > >>> + > >>> +__extension__ extern __inline bfloat16_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vduph_laneq_bf16 (bfloat16x8_t __a, const int __b) > >>> +{ > >>> + return vgetq_lane_bf16 (__a, __b); > >>> +} > >>> + > >>> +__extension__ extern __inline bfloat16x4_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vget_high_bf16 (bfloat16x8_t __a) > >>> +{ > >>> + return __builtin_neon_vget_highv8bf (__a); > >>> +} > >>> + > >>> +__extension__ extern __inline bfloat16x4_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vget_low_bf16 (bfloat16x8_t __a) > >>> +{ > >>> + return __builtin_neon_vget_lowv8bf (__a); > >>> +} > >>> + > >>> +__extension__ extern __inline 
bfloat16x8_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vcombine_bf16 (bfloat16x4_t __a, bfloat16x4_t __b) > >>> +{ > >>> + return __builtin_neon_vcombinev4bf (__a, __b); > >>> +} > >>> + > >>> +__extension__ extern __inline bfloat16x4_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpret_bf16_u8 (uint8x8_t __a) > >>> +{ > >>> + return (bfloat16x4_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline bfloat16x4_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpret_bf16_u16 (uint16x4_t __a) > >>> +{ > >>> + return (bfloat16x4_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline bfloat16x4_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpret_bf16_u32 (uint32x2_t __a) > >>> +{ > >>> + return (bfloat16x4_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline bfloat16x4_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpret_bf16_u64 (uint64x1_t __a) > >>> +{ > >>> + return (bfloat16x4_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline bfloat16x4_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpret_bf16_s8 (int8x8_t __a) > >>> +{ > >>> + return (bfloat16x4_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline bfloat16x4_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpret_bf16_s16 (int16x4_t __a) > >>> +{ > >>> + return (bfloat16x4_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline bfloat16x4_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpret_bf16_s32 (int32x2_t __a) > >>> +{ > >>> + return (bfloat16x4_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline bfloat16x4_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpret_bf16_s64 
(int64x1_t __a) > >>> +{ > >>> + return (bfloat16x4_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline bfloat16x4_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpret_bf16_p8 (poly8x8_t __a) > >>> +{ > >>> + return (bfloat16x4_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline bfloat16x4_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpret_bf16_p16 (poly16x4_t __a) > >>> +{ > >>> + return (bfloat16x4_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline bfloat16x4_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpret_bf16_p64 (poly64x1_t __a) > >>> +{ > >>> + return (bfloat16x4_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline bfloat16x4_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpret_bf16_f16 (float16x4_t __a) > >>> +{ > >>> + return (bfloat16x4_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline bfloat16x4_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpret_bf16_f32 (float32x2_t __a) > >>> +{ > >>> + return (bfloat16x4_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline bfloat16x8_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpretq_bf16_u8 (uint8x16_t __a) > >>> +{ > >>> + return (bfloat16x8_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline bfloat16x8_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpretq_bf16_u16 (uint16x8_t __a) > >>> +{ > >>> + return (bfloat16x8_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline bfloat16x8_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpretq_bf16_u32 (uint32x4_t __a) > >>> +{ > >>> + return (bfloat16x8_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline bfloat16x8_t > >>> +__attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpretq_bf16_u64 (uint64x2_t __a) > >>> +{ > >>> + return (bfloat16x8_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline bfloat16x8_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpretq_bf16_s8 (int8x16_t __a) > >>> +{ > >>> + return (bfloat16x8_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline bfloat16x8_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpretq_bf16_s16 (int16x8_t __a) > >>> +{ > >>> + return (bfloat16x8_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline bfloat16x8_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpretq_bf16_s32 (int32x4_t __a) > >>> +{ > >>> + return (bfloat16x8_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline bfloat16x8_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpretq_bf16_s64 (int64x2_t __a) > >>> +{ > >>> + return (bfloat16x8_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline bfloat16x8_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpretq_bf16_p8 (poly8x16_t __a) > >>> +{ > >>> + return (bfloat16x8_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline bfloat16x8_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpretq_bf16_p16 (poly16x8_t __a) > >>> +{ > >>> + return (bfloat16x8_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline bfloat16x8_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpretq_bf16_p64 (poly64x2_t __a) > >>> +{ > >>> + return (bfloat16x8_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline bfloat16x8_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpretq_bf16_p128 (poly128_t __a) > >>> +{ > >>> + return (bfloat16x8_t)__a; > >>> 
+} > >>> + > >>> +__extension__ extern __inline bfloat16x8_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpretq_bf16_f16 (float16x8_t __a) > >>> +{ > >>> + return (bfloat16x8_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline bfloat16x8_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpretq_bf16_f32 (float32x4_t __a) > >>> +{ > >>> + return (bfloat16x8_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline int8x8_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpret_s8_bf16 (bfloat16x4_t __a) > >>> +{ > >>> + return (int8x8_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline int16x4_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpret_s16_bf16 (bfloat16x4_t __a) > >>> +{ > >>> + return (int16x4_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline int32x2_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpret_s32_bf16 (bfloat16x4_t __a) > >>> +{ > >>> + return (int32x2_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline int64x1_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpret_s64_bf16 (bfloat16x4_t __a) > >>> +{ > >>> + return (int64x1_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline uint8x8_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpret_u8_bf16 (bfloat16x4_t __a) > >>> +{ > >>> + return (uint8x8_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline uint16x4_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpret_u16_bf16 (bfloat16x4_t __a) > >>> +{ > >>> + return (uint16x4_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline uint32x2_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpret_u32_bf16 
(bfloat16x4_t __a) > >>> +{ > >>> + return (uint32x2_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline uint64x1_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpret_u64_bf16 (bfloat16x4_t __a) > >>> +{ > >>> + return (uint64x1_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline float32x2_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpret_f32_bf16 (bfloat16x4_t __a) > >>> +{ > >>> + return (float32x2_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline poly8x8_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpret_p8_bf16 (bfloat16x4_t __a) > >>> +{ > >>> + return (poly8x8_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline poly16x4_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpret_p16_bf16 (bfloat16x4_t __a) > >>> +{ > >>> + return (poly16x4_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline poly64x1_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpret_p64_bf16 (bfloat16x4_t __a) > >>> +{ > >>> + return (poly64x1_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline int8x16_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpretq_s8_bf16 (bfloat16x8_t __a) > >>> +{ > >>> + return (int8x16_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline int16x8_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpretq_s16_bf16 (bfloat16x8_t __a) > >>> +{ > >>> + return (int16x8_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline int32x4_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpretq_s32_bf16 (bfloat16x8_t __a) > >>> +{ > >>> + return (int32x4_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline int64x2_t > >>> +__attribute__ ((__always_inline__, 
__gnu_inline__, __artificial__)) > >>> +vreinterpretq_s64_bf16 (bfloat16x8_t __a) > >>> +{ > >>> + return (int64x2_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline uint8x16_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpretq_u8_bf16 (bfloat16x8_t __a) > >>> +{ > >>> + return (uint8x16_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline uint16x8_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpretq_u16_bf16 (bfloat16x8_t __a) > >>> +{ > >>> + return (uint16x8_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline uint32x4_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpretq_u32_bf16 (bfloat16x8_t __a) > >>> +{ > >>> + return (uint32x4_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline uint64x2_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpretq_u64_bf16 (bfloat16x8_t __a) > >>> +{ > >>> + return (uint64x2_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline float32x4_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpretq_f32_bf16 (bfloat16x8_t __a) > >>> +{ > >>> + return (float32x4_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline poly8x16_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpretq_p8_bf16 (bfloat16x8_t __a) > >>> +{ > >>> + return (poly8x16_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline poly16x8_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpretq_p16_bf16 (bfloat16x8_t __a) > >>> +{ > >>> + return (poly16x8_t)__a; > >>> +} > >>> + > >>> +__extension__ extern __inline poly64x2_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpretq_p64_bf16 (bfloat16x8_t __a) > >>> +{ > >>> + return (poly64x2_t)__a; > >>> +} > >>> + > >>> +__extension__ 
extern __inline poly128_t > >>> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> +vreinterpretq_p128_bf16 (bfloat16x8_t __a) > >>> +{ > >>> + return (poly128_t)__a; > >>> +} > >>> + > >>> __extension__ extern __inline float32x2_t > >>> __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) > >>> vbfdot_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b) > >>> diff --git a/gcc/config/arm/arm_neon_builtins.def > >>> b/gcc/config/arm/arm_neon_builtins.def > >>> index > >>> 85aeaecf0dc7579f511d0979708635ed65399614..bf28b24b108a081a023aa76f70d4da8bc0cc2d7e > >>> 100644 > >>> --- a/gcc/config/arm/arm_neon_builtins.def > >>> +++ b/gcc/config/arm/arm_neon_builtins.def > >>> @@ -221,13 +221,13 @@ VAR10 (SETLANE, vset_lane, > >>> VAR5 (UNOP, vcreate, v8qi, v4hi, v2si, v2sf, di) > >>> VAR10 (UNOP, vdup_n, > >>> v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di) > >>> -VAR2 (UNOP, vdup_n, v8hf, v4hf) > >>> +VAR4 (UNOP, vdup_n, v8hf, v4hf, v8bf, v4bf) > >>> VAR10 (GETLANE, vdup_lane, > >>> v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di) > >>> -VAR2 (GETLANE, vdup_lane, v8hf, v4hf) > >>> -VAR6 (COMBINE, vcombine, v8qi, v4hi, v4hf, v2si, v2sf, di) > >>> -VAR6 (UNOP, vget_high, v16qi, v8hi, v8hf, v4si, v4sf, v2di) > >>> -VAR6 (UNOP, vget_low, v16qi, v8hi, v8hf, v4si, v4sf, v2di) > >>> +VAR4 (GETLANE, vdup_lane, v8hf, v4hf, v8bf, v4bf) > >>> +VAR7 (COMBINE, vcombine, v8qi, v4hi, v4hf, v2si, v2sf, di, v4bf) > >>> +VAR7 (UNOP, vget_high, v16qi, v8hi, v8hf, v8bf, v4si, v4sf, v2di) > >>> +VAR7 (UNOP, vget_low, v16qi, v8hi, v8hf, v8bf, v4si, v4sf, v2di) > >>> VAR3 (UNOP, vmovn, v8hi, v4si, v2di) > >>> VAR3 (UNOP, vqmovns, v8hi, v4si, v2di) > >>> VAR3 (UNOP, vqmovnu, v8hi, v4si, v2di) > >>> @@ -376,4 +376,4 @@ VAR2 (MAC_LANE_PAIR, vcmlaq_lane270, v4sf, v8hf) > >>> > >>> VAR2 (TERNOP, vbfdot, v2sf, v4sf) > >>> VAR2 (MAC_LANE_PAIR, vbfdot_lanev4bf, v2sf, v4sf) > >>> -VAR2 (MAC_LANE_PAIR, vbfdot_lanev8bf, v2sf, v4sf) > >>> \ No 
newline at end of file > >>> +VAR2 (MAC_LANE_PAIR, vbfdot_lanev8bf, v2sf, v4sf) > >>> diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md > >>> index > >>> cf5bfb4c77a7be0400bada8c517b877537f4d2c6..1b6aada0d0879a7f521bf868ad2c19166962fff2 > >>> 100644 > >>> --- a/gcc/config/arm/iterators.md > >>> +++ b/gcc/config/arm/iterators.md > >>> @@ -82,14 +82,14 @@ > >>> (define_mode_iterator VD_RE [V8QI V4HI V2SI V2SF DI]) > >>> > >>> ;; Double-width vector modes plus 64-bit elements. > >>> -(define_mode_iterator VDX [V8QI V4HI V4HF V2SI V2SF DI]) > >>> +(define_mode_iterator VDX [V8QI V4HI V4HF V4BF V2SI V2SF DI]) > >>> > >>> ;; Double-width vector modes plus 64-bit elements, > >>> ;; with V4BFmode added, suitable for moves. > >>> (define_mode_iterator VDXMOV [V8QI V4HI V4HF V4BF V2SI V2SF DI]) > >>> > >>> ;; Double-width vector modes, with V4HF - for vldN_lane and vstN_lane. > >>> -(define_mode_iterator VD_LANE [V8QI V4HI V4HF V2SI V2SF]) > >>> +(define_mode_iterator VD_LANE [V8QI V4HI V4HF V4BF V2SI V2SF]) > >>> > >>> ;; Double-width vector modes without floating-point elements. > >>> (define_mode_iterator VDI [V8QI V4HI V2SI]) > >>> @@ -104,7 +104,7 @@ > >>> (define_mode_iterator VQ_HS [V8HI V8HF V4SI V4SF]) > >>> > >>> ;; Quad-width vector modes plus 64-bit elements. > >>> -(define_mode_iterator VQX [V16QI V8HI V8HF V4SI V4SF V2DI]) > >>> +(define_mode_iterator VQX [V16QI V8HI V8HF V8BF V4SI V4SF V2DI]) > >>> > >>> ;; Quad-width vector modes without floating-point elements. > >>> (define_mode_iterator VQI [V16QI V8HI V4SI]) > >>> @@ -153,7 +153,7 @@ > >>> > >>> ;; Vector modes, including 64-bit integer elements. > >>> (define_mode_iterator VDQX [V8QI V16QI V4HI V8HI V2SI V4SI > >>> - V4HF V8HF V2SF V4SF DI V2DI]) > >>> + V4HF V8HF V4BF V8BF V2SF V4SF DI V2DI]) > >>> > >>> ;; Vector modes including 64-bit integer elements, but no floats. 
> >>> (define_mode_iterator VDQIX [V8QI V16QI V4HI V8HI V2SI V4SI DI V2DI]) > >>> @@ -518,6 +518,7 @@ > >>> (define_mode_attr V_elem [(V8QI "QI") (V16QI "QI") > >>> (V4HI "HI") (V8HI "HI") > >>> (V4HF "HF") (V8HF "HF") > >>> + (V4BF "BF") (V8BF "BF") > >>> (V2SI "SI") (V4SI "SI") > >>> (V2SF "SF") (V4SF "SF") > >>> (DI "DI") (V2DI "DI")]) > >>> @@ -526,6 +527,7 @@ > >>> (define_mode_attr V_elem_l [(V8QI "qi") (V16QI "qi") > >>> (V4HI "hi") (V8HI "hi") > >>> (V4HF "hf") (V8HF "hf") > >>> + (V4BF "bf") (V8BF "bf") > >>> (V2SI "si") (V4SI "si") > >>> (V2SF "sf") (V4SF "sf") > >>> (DI "di") (V2DI "di")]) > >>> @@ -543,6 +545,7 @@ > >>> (define_mode_attr V_two_elem [(V8QI "HI") (V16QI "HI") > >>> (V4HI "SI") (V8HI "SI") > >>> (V4HF "SF") (V8HF "SF") > >>> + (V4BF "BF") (V8BF "BF") > >>> (V2SI "V2SI") (V4SI "V2SI") > >>> (V2SF "V2SF") (V4SF "V2SF") > >>> (DI "V2DI") (V2DI "V2DI")]) > >>> @@ -563,6 +566,7 @@ > >>> (define_mode_attr V_three_elem [(V8QI "BLK") (V16QI "BLK") > >>> (V4HI "BLK") (V8HI "BLK") > >>> (V4HF "BLK") (V8HF "BLK") > >>> + (V4BF "BLK") (V8BF "BLK") > >>> (V2SI "BLK") (V4SI "BLK") > >>> (V2SF "BLK") (V4SF "BLK") > >>> (DI "EI") (V2DI "EI")]) > >>> @@ -571,6 +575,7 @@ > >>> (define_mode_attr V_four_elem [(V8QI "SI") (V16QI "SI") > >>> (V4HI "V4HI") (V8HI "V4HI") > >>> (V4HF "V4HF") (V8HF "V4HF") > >>> + (V4BF "V4BF") (V8BF "V4BF") > >>> (V2SI "V4SI") (V4SI "V4SI") > >>> (V2SF "V4SF") (V4SF "V4SF") > >>> (DI "OI") (V2DI "OI")]) > >>> @@ -579,6 +584,7 @@ > >>> (define_mode_attr V_reg [(V8QI "P") (V16QI "q") > >>> (V4HI "P") (V8HI "q") > >>> (V4HF "P") (V8HF "q") > >>> + (V4BF "P") (V8BF "q") > >>> (V2SI "P") (V4SI "q") > >>> (V2SF "P") (V4SF "q") > >>> (DI "P") (V2DI "q") > >>> @@ -609,7 +615,8 @@ > >>> (define_mode_attr V_HALF [(V16QI "V8QI") (V8HI "V4HI") > >>> (V8HF "V4HF") (V4SI "V2SI") > >>> (V4SF "V2SF") (V2DF "DF") > >>> - (V2DI "DI") (V4HF "HF")]) > >>> + (V2DI "DI") (V4HF "HF") > >>> + (V4BF "BF") (V8BF "V4BF")]) > >>> > >>> ;; Same, but 
>>> lower-case. > >>> (define_mode_attr V_half [(V16QI "v8qi") (V8HI "v4hi") > >>> @@ -620,7 +627,7 @@ > >>> (define_mode_attr V_DOUBLE [(V8QI "V16QI") (V4HI "V8HI") > >>> (V2SI "V4SI") (V4HF "V8HF") > >>> (V2SF "V4SF") (DF "V2DF") > >>> - (DI "V2DI")]) > >>> + (DI "V2DI") (V4BF "V8BF")]) > >>> > >>> ;; Same, but lower-case. > >>> (define_mode_attr V_double [(V8QI "v16qi") (V4HI "v8hi") > >>> @@ -639,6 +646,7 @@ > >>> (V4SI "V2SI") (V4SF "V2SF") > >>> (V8QI "V8QI") (V4HI "V4HI") > >>> (V2SI "V2SI") (V2SF "V2SF") > >>> + (V8BF "V4BF") (V4BF "V4BF") > >>> (V8HF "V4HF") (V4HF "V4HF")]) > >>> > >>> ;; Mode of result of comparison operations (and bit-select operand 1). > >>> @@ -646,6 +654,7 @@ > >>> (V4HI "V4HI") (V8HI "V8HI") > >>> (V2SI "V2SI") (V4SI "V4SI") > >>> (V4HF "V4HI") (V8HF "V8HI") > >>> + (V4BF "V4HI") (V8BF "V8HI") > >>> (V2SF "V2SI") (V4SF "V4SI") > >>> (DI "DI") (V2DI "V2DI")]) > >>> > >>> @@ -687,6 +696,7 @@ > >>> (V4HI "u16") (V8HI "u16") > >>> (V2SI "32") (V4SI "32") > >>> (V4HF "u16") (V8HF "u16") > >>> + (V4BF "u16") (V8BF "u16") > >>> (V2SF "32") (V4SF "32")]) > >>> > >>> (define_mode_attr V_sz_elem [(V8QI "8") (V16QI "8") > >>> @@ -694,6 +704,7 @@ > >>> (V2SI "32") (V4SI "32") > >>> (DI "64") (V2DI "64") > >>> (V4HF "16") (V8HF "16") > >>> + (V4BF "16") (V8BF "16") > >>> (V2SF "32") (V4SF "32")]) > >>> > >>> (define_mode_attr V_elem_ch [(V8QI "b") (V16QI "b") > >>> @@ -764,10 +775,12 @@ > >>> (V2SI "true") (V4SI "false") > >>> (V2SF "true") (V4SF "false") > >>> (DI "true") (V2DI "false") > >>> + (V4BF "true") (V8BF "false") > >>> (V4HF "true") (V8HF "false")]) > >>> > >>> (define_mode_attr V_mode_nunits [(V8QI "8") (V16QI "16") > >>> (V4HF "4") (V8HF "8") > >>> + (V4BF "4") (V8BF "8") > >>> (V4HI "4") (V8HI "8") > >>> (V2SI "2") (V4SI "4") > >>> (V2SF "2") (V4SF "4") > >>> diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md > >>> index > >>> 21701b34fcd2c86bfd310904c7ceca0ce9fb047e..e732600719e2c0df35e1ec0a4ed1cb235dc25726 > >>> 100644 > 
>>> --- a/gcc/config/arm/neon.md > >>> +++ b/gcc/config/arm/neon.md > >>> @@ -3704,6 +3704,22 @@ if (BYTES_BIG_ENDIAN) > >>> [(set_attr "type" "neon_from_gp_q")] > >>> ) > >>> > >>> +(define_insn "neon_vdup_nv4bf" > >>> + [(set (match_operand:V4BF 0 "s_register_operand" "=w") > >>> + (vec_duplicate:V4BF (match_operand:BF 1 > >>> "s_register_operand" "r")))] > >>> + "TARGET_NEON" > >>> + "vdup.16\t%P0, %1" > >>> + [(set_attr "type" "neon_from_gp")] > >>> +) > >>> + > >>> +(define_insn "neon_vdup_nv8bf" > >>> + [(set (match_operand:V8BF 0 "s_register_operand" "=w") > >>> + (vec_duplicate:V8BF (match_operand:BF 1 > >>> "s_register_operand" "r")))] > >>> + "TARGET_NEON" > >>> + "vdup.16\t%q0, %1" > >>> + [(set_attr "type" "neon_from_gp_q")] > >>> +) > >>> + > >>> (define_insn "neon_vdup_n<mode>" > >>> [(set (match_operand:V32 0 "s_register_operand" "=w,w") > >>> (vec_duplicate:V32 (match_operand:<V_elem> 1 > >>> "s_register_operand" "r,t")))] > >>> @@ -3737,7 +3753,7 @@ if (BYTES_BIG_ENDIAN) > >>> > >>> (define_insn "neon_vdup_lane<mode>_internal" > >>> [(set (match_operand:VDQW 0 "s_register_operand" "=w") > >>> - (vec_duplicate:VDQW > >>> + (vec_duplicate:VDQW > >>> (vec_select:<V_elem> > >>> (match_operand:<V_double_vector_mode> 1 > >>> "s_register_operand" "w") > >>> (parallel [(match_operand:SI 2 "immediate_operand" > >>> "i")]))))] > >>> @@ -3758,12 +3774,12 @@ if (BYTES_BIG_ENDIAN) > >>> ) > >>> > >>> (define_insn "neon_vdup_lane<mode>_internal" > >>> - [(set (match_operand:VH 0 "s_register_operand" "=w") > >>> - (vec_duplicate:VH > >>> + [(set (match_operand:VHFBF 0 "s_register_operand" "=w") > >>> + (vec_duplicate:VHFBF > >>> (vec_select:<V_elem> > >>> (match_operand:<V_double_vector_mode> 1 "s_register_operand" "w") > >>> (parallel [(match_operand:SI 2 "immediate_operand" "i")]))))] > >>> - "TARGET_NEON && TARGET_FP16" > >>> + "TARGET_NEON && (TARGET_FP16 || TARGET_BF16_SIMD)" > >>> { > >>> if (BYTES_BIG_ENDIAN) > >>> { > >>> @@ -3799,10 +3815,10 @@ if 
(BYTES_BIG_ENDIAN) > >>> }) > >>> > >>> (define_expand "neon_vdup_lane<mode>" > >>> - [(match_operand:VH 0 "s_register_operand") > >>> + [(match_operand:VHFBF 0 "s_register_operand") > >>> (match_operand:<V_double_vector_mode> 1 "s_register_operand") > >>> (match_operand:SI 2 "immediate_operand")] > >>> - "TARGET_NEON && TARGET_FP16" > >>> + "TARGET_NEON && (TARGET_FP16 || TARGET_BF16_SIMD)" > >>> { > >>> if (BYTES_BIG_ENDIAN) > >>> { > >>> @@ -6599,4 +6615,4 @@ if (BYTES_BIG_ENDIAN) > >>> } > >>> } > >>> [(set_attr "type" "neon_dot<q>")] > >>> -) > >>> \ No newline at end of file > >>> +) > >>> diff --git a/gcc/testsuite/gcc.target/arm/bf16_dup.c > >>> b/gcc/testsuite/gcc.target/arm/bf16_dup.c > >>> new file mode 100644 > >>> index > >>> 0000000000000000000000000000000000000000..82dff25fc6e244a1d930375a1e3505e9173e53dc > >>> > >>> --- /dev/null > >>> +++ b/gcc/testsuite/gcc.target/arm/bf16_dup.c > >>> @@ -0,0 +1,94 @@ > >>> +/* { dg-do compile } */ > >>> +/* { dg-additional-options "-march=armv8.2-a+bf16+fp16 > >>> -mfloat-abi=softfp" } */ > >>> + > >> > >> Doesn't this need something like > >> > >> /* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ > >> ? > >> > >> We wouldn't want to try it on M-profile targets, for example. 
> >> > >> > >>> +#include "arm_neon.h" > >>> + > >>> +float32x2_t > >>> +test_vbfdot_vcreate (float32x2_t r, uint64_t a, uint64_t b) > >>> +{ > >>> + bfloat16x4_t _a = vcreate_bf16(a); > >>> + bfloat16x4_t _b = vcreate_bf16(b); > >>> + > >>> + return vbfdot_f32 (r, _a, _b); > >>> +} > >>> +/* { dg-final { scan-assembler {vdot.bf16\td[0-9]+, d[0-9]+, > >>> d[0-9]+} } } */ > >>> + > >>> +bfloat16x8_t test_vcombine_bf16 (bfloat16x4_t a, bfloat16x4_t b) > >>> +{ > >>> + return vcombine_bf16 (a, b); > >>> +} > >>> + > >>> +bfloat16x4_t test_vget_high_bf16 (bfloat16x8_t a) > >>> +{ > >>> + return vget_high_bf16 (a); > >>> +} > >>> + > >>> +bfloat16x4_t test_vget_low_bf16 (bfloat16x8_t a) > >>> +{ > >>> + return vget_low_bf16 (a); > >>> +} > >>> + > >>> +bfloat16_t test_vget_lane_bf16 (bfloat16x4_t a) > >>> +{ > >>> + return vget_lane_bf16 (a, 1); > >>> +} > >>> + > >>> +bfloat16_t test_vgetq_lane_bf16 (bfloat16x8_t a) > >>> +{ > >>> + return vgetq_lane_bf16 (a, 7); > >>> +} > >>> + > >>> +bfloat16x4_t test_vset_lane_bf16 (bfloat16_t a, bfloat16x4_t b) > >>> +{ > >>> + return vset_lane_bf16 (a, b, 1); > >>> +} > >>> + > >>> +bfloat16x8_t test_vsetq_lane_bf16 (bfloat16_t a, bfloat16x8_t b) > >>> +{ > >>> + return vsetq_lane_bf16 (a, b, 7); > >>> +} > >>> + > >>> +bfloat16x4_t vdup_test (bfloat16_t a) > >>> +{ > >>> + return vdup_n_bf16 (a); > >>> +} > >>> +/* { dg-final { scan-assembler {vdup\.16\td[0-9]+, r[0-9]+} } } */ > >>> + > >>> +bfloat16x8_t vdupq_test (bfloat16_t a) > >>> +{ > >>> + return vdupq_n_bf16 (a); > >>> +} > >>> +/* { dg-final { scan-assembler {vdup\.16\tq[0-9]+, r[0-9]+} } } */ > >>> + > >>> + > >>> +bfloat16x4_t test_vdup_lane_bf16 (bfloat16x4_t a) > >>> +{ > >>> + return vdup_lane_bf16 (a, 1); > >>> +} > >>> +/* { dg-final { scan-assembler-times {vdup\.16\td[0-9]+, > >>> d[0-9]+\[1\]} 1 } } */ > >>> + > >>> +bfloat16x8_t test_vdupq_lane_bf16 (bfloat16x4_t a) > >>> +{ > >>> + return vdupq_lane_bf16 (a, 1); > >>> +} > >>> +/* { dg-final { 
scan-assembler-times {vdup\.16\tq[0-9]+, > >>> d[0-9]+\[1\]} 1 } } */ > >>> + > >>> +bfloat16x4_t test_vdup_laneq_bf16 (bfloat16x8_t a) > >>> +{ > >>> + return vdup_laneq_bf16 (a, 3); > >>> +} > >>> + > >>> +bfloat16x8_t test_vdupq_laneq_bf16 (bfloat16x8_t a) > >>> +{ > >>> + return vdupq_laneq_bf16 (a, 3); > >>> +} > >>> + > >>> +bfloat16_t test_vduph_lane_bf16 (bfloat16x4_t a) > >>> +{ > >>> + return vduph_lane_bf16 (a, 1); > >>> +} > >>> + > >>> +bfloat16_t test_vduph_laneq_bf16 (bfloat16x8_t a) > >>> +{ > >>> + return vduph_laneq_bf16 (a, 7); > >>> +} > >>> diff --git a/gcc/testsuite/gcc.target/arm/bf16_reinterpret.c > >>> b/gcc/testsuite/gcc.target/arm/bf16_reinterpret.c > >>> new file mode 100644 > >>> index > >>> 0000000000000000000000000000000000000000..e7d30a95fbc3ceaf4a92057a10e6be4a34e1957c > >>> > >>> --- /dev/null > >>> +++ b/gcc/testsuite/gcc.target/arm/bf16_reinterpret.c > >>> @@ -0,0 +1,435 @@ > >>> +/* { dg-do assemble { target { arm*-*-* } } } */ > >>> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ > >>> +/* { dg-add-options arm_v8_2a_bf16_neon } */ > >>> +/* { dg-additional-options "-save-temps -march=armv8.2-a+fp16+bf16 > >>> -mfloat-abi=hard -mfpu=crypto-neon-fp-armv8" } */ > >>> + > >> > >> > >> Why the dg-additional-options ? Doesn't dg-add-options do the right > >> thing here? > >> > >> Thanks, > >> > >> Kyrill > >> > > Thanks for the review. > > The dg-additional-options is needed here in order to add the other > > required extensions. A new target check could be added to do all of > > this, but I think it will be too specific to these test cases and it > > wouldn't have any other use. > > I decided to use the bf16 target check because that one passes, then > > fp16 will also pass, and then I overwrite the options with the > > additional ones to ensure that the other extensions are added. > Sorry to comment after this has been committed... 
So far these tests were unsupported because I use binutils 2.33.1; I just tried 2.34 and noticed problems with this patch. Why do the two new testcases use different dg-additional-options ? Why does one force -mfloat-abi=softfp and the other -mfloat-abi=hard? Anyway, both are wrong: softfp won't work on arm-linux-gnueabihf, and "hard" won't work on arm-linux-gnueabi because either gnu/stubs-hard.h or gnu/stubs-soft.h is missing. Why do you need -mfpu=crypto-neon-fp-armv8 in bf16_reinterpret? dg-add-options arm_v8_2a_bf16_neon already brings -mfloat-abi=hard -mfpu=neon-fp-armv8 for me. Christophe > > Ok, thanks. > > Kyrill > > > > > > I've addressed the other two comments in the attached diff. > > > > > > Regards, > > Mihail > > > > > >