> On 28 Jan 2026, at 12:37, Karl Meakin <[email protected]> wrote: > > On 28/01/2026 09:13, Kyrylo Tkachov wrote: >> Hi Karl, >> >>> On 27 Jan 2026, at 18:09, Karl Meakin <[email protected]> wrote: >>> >>> Add support for the `SVE_BFSCALE` architecture extension. >>> >>> gcc/ChangeLog: >>> >>> * doc/invoke.texi: Document `+sve-bfscale` flag. >>> * config/aarch64/aarch64.h (TARGET_SVE_BFSCALE): New macro. >>> * config/aarch64/aarch64-c.cc (aarch64_update_cpp_builtins): >>> Define `__AARCH64_FEATURE_SVE_BFSCALE`. >>> * config/aarch64/aarch64-sve-builtins-base.cc: Skip constant >>> folding for floating-point or unpredicated multiplications. >>> * config/aarch64/aarch64-sve-builtins-sve2.def: New `SVE_FUNCTION`s. >>> * config/aarch64/aarch64-sve.md: Modify insns for >>> `SVE_COND_FP_BINARY_INT` to handle BF16 modes. >>> * config/aarch64/aarch64-sve2.md >>> (@aarch64_sve_<optab><mode>, @aarch64_sve_<optab><mode>_single): New insn >>> for `BFSCALE`. >>> Modify insns for `UNSPEC_FSCALE` to handle BF16 modes. >>> * config/aarch64/iterators.md (SVE_FULL_F_SCALAR): Add `VNx8BF` to iterator. >>> (SVE_FULL_F_BFSCALE): New iterator. >>> (SVE_Fx24_BFSCALE): New iterator. >>> (SVE_BFx24): New iterator. >>> (UNSPEC_FMUL): New unspec. >>> (V_INT_EQUIV): Add entries for BF16 modes. >>> (b): Add entries for scalar float modes. >>> (is_bf16): Add entries for BF16 modes and reformat. >>> (SVSCALE_SINGLE_INTARG): Likewise. >>> (SVSCALE_INTARG): Likewise. >>> (SVE_FP_MULL): New iterator. >>> >>> gcc/testsuite/ChangeLog: >>> >>> * lib/target-supports.exp: Add `sve-bfscale` to `sve_exts`. >>> * gcc.target/aarch64/pragma_cpp_predefs_4.c: Add test for >>> `__ARM_SVE_FEATURE_BFSCALE`. >>> * gcc.target/aarch64/sme2/acle-asm/mul_bf16_x2.c: New test. >>> * gcc.target/aarch64/sme2/acle-asm/mul_bf16_x4.c: New test. >>> * gcc.target/aarch64/sme2/acle-asm/scale_bf16_x2.c: New test. >>> * gcc.target/aarch64/sme2/acle-asm/scale_bf16_x4.c: New test. >>> * gcc.target/aarch64/sve/acle/asm/scale_bf16.c: New test. >>> * gcc.target/aarch64/sve/acle/general-c/bfscale.c: New test. 
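For readers following the thread, a minimal usage sketch of the new intrinsics. The function names below are illustrative only; the intrinsic prototypes are the ones quoted from aarch64-sve-builtins-sve2.def further down. The predicated form assumes something like -march=armv9-a+sve2+sve-bfscale, and the multi-vector form additionally assumes +sme2 with streaming mode:

  #include <arm_sve.h>

  /* Predicated BFSCALE: scale each bf16 lane of ZDN by 2^ZM,
     with inactive lanes taken from ZDN (SVE2 + SVE_BFSCALE).  */
  svbfloat16_t
  scale_lanes (svbool_t pg, svbfloat16_t zdn, svint16_t zm)
  {
    return svscale_bf16_m (pg, zdn, zm);
  }

  /* Multi-vector BFSCALE, streaming mode only (SME2 + SVE_BFSCALE).  */
  svbfloat16x2_t
  scale_pair (svbfloat16x2_t zdn, svint16x2_t zm) __arm_streaming
  {
    return svscale_bf16_x2 (zdn, zm);
  }

The new svmul_bf16_x2/_x4 and the _single_ variants follow the same streaming-only pattern, as described in the .def comments below.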
>>> --- >>> gcc/config/aarch64/aarch64-c.cc | 2 + >>> .../aarch64/aarch64-sve-builtins-base.cc | 9 +- >>> .../aarch64/aarch64-sve-builtins-sve2.def | 43 +++ >>> gcc/config/aarch64/aarch64-sve.md | 87 ++--- >>> gcc/config/aarch64/aarch64-sve2.md | 70 +++- >>> gcc/config/aarch64/aarch64.h | 1 + >>> gcc/config/aarch64/iterators.md | 63 +++- >>> gcc/doc/invoke.texi | 3 +- >>> .../gcc.target/aarch64/pragma_cpp_predefs_4.c | 5 + >>> .../aarch64/sme2/acle-asm/mul_bf16_x2.c | 193 ++++++++++ >>> .../aarch64/sme2/acle-asm/mul_bf16_x4.c | 227 ++++++++++++ >>> .../aarch64/sme2/acle-asm/scale_bf16_x2.c | 194 ++++++++++ >>> .../aarch64/sme2/acle-asm/scale_bf16_x4.c | 231 ++++++++++++ >>> .../aarch64/sve/acle/asm/scale_bf16.c | 337 ++++++++++++++++++ >>> .../aarch64/sve/acle/general-c/bfscale.c | 114 ++++++ >>> gcc/testsuite/lib/target-supports.exp | 2 +- >>> 16 files changed, 1506 insertions(+), 75 deletions(-) >>> create mode 100644 >>> gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/mul_bf16_x2.c >>> create mode 100644 >>> gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/mul_bf16_x4.c >>> create mode 100644 >>> gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/scale_bf16_x2.c >>> create mode 100644 >>> gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/scale_bf16_x4.c >>> create mode 100644 >>> gcc/testsuite/gcc.target/aarch64/sve/acle/asm/scale_bf16.c >>> create mode 100644 >>> gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/bfscale.c >>> >>> diff --git a/gcc/config/aarch64/aarch64-c.cc >>> b/gcc/config/aarch64/aarch64-c.cc >>> index b52ea7649f9..f8be998da16 100644 >>> --- a/gcc/config/aarch64/aarch64-c.cc >>> +++ b/gcc/config/aarch64/aarch64-c.cc >>> @@ -275,6 +275,8 @@ aarch64_update_cpp_builtins (cpp_reader *pfile) >>> "__ARM_FEATURE_BF16", pfile); >>> aarch64_def_or_undef (TARGET_SVE_BF16, >>> "__ARM_FEATURE_SVE_BF16", pfile); >>> + aarch64_def_or_undef (TARGET_SVE_BFSCALE, >>> + "__ARM_FEATURE_SVE_BFSCALE", pfile); >>> >>> aarch64_def_or_undef (TARGET_LUT, "__ARM_FEATURE_LUT", pfile); >>> aarch64_def_or_undef (TARGET_SME_LUTv2, "__ARM_FEATURE_SME_LUTv2", pfile); >>> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc >>> b/gcc/config/aarch64/aarch64-sve-builtins-base.cc >>> index e3d0f9b909a..b296f5c259f 100644 >>> --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc >>> +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc >>> @@ -2315,11 +2315,18 @@ class svmul_impl : public rtx_code_function >>> { >>> public: >>> CONSTEXPR svmul_impl () >>> - : rtx_code_function (MULT, MULT, UNSPEC_COND_FMUL) {} >>> + : rtx_code_function (MULT, MULT, UNSPEC_COND_FMUL, UNSPEC_FMUL) {} >>> >>> gimple * >>> fold (gimple_folder &f) const override >>> { >>> + /* The code below assumes that the function has 3 arguments (pg, rn, >>> rm). >>> + * Unpredicated functions have only 2 arguments (rn, rm) so will cause >>> the >>> + * code below to crash. Also skip if it does not operatoe on integers, >> Typo: “operate”. Also, the house style is to not have the asterisks at the >> beginning of comment lines. > Fixed, thanks. >> >>> + * since all the optimizations below are for integer multiplication. 
>>> */ >>> + if (!f.type_suffix (0).integer_p || f.pred == aarch64_sve::PRED_none) >>> + return nullptr; >>> + >>> if (auto *res = f.fold_const_binary (MULT_EXPR)) >>> return res; >>> >>> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-sve2.def >>> b/gcc/config/aarch64/aarch64-sve-builtins-sve2.def >>> index 19249a58875..62a80a7b320 100644 >>> --- a/gcc/config/aarch64/aarch64-sve-builtins-sve2.def >>> +++ b/gcc/config/aarch64/aarch64-sve-builtins-sve2.def >>> @@ -447,3 +447,46 @@ DEF_SVE_FUNCTION_GS_FPM (svmmla, mmla, s_float_mf8, >>> none, none, set) >>> #define REQUIRED_EXTENSIONS nonstreaming_sve (AARCH64_FL_SVE_F16F32MM) >>> DEF_SVE_FUNCTION (svmmla, mmla, cvt_f32_f16, none) >>> #undef REQUIRED_EXTENSIONS >>> + >>> +/* >>> +- BFSCALE (predicated) >>> + // Only if __ARM_FEATURE_SVE_BFSCALE != 0 && __ARM_FEATURE_SVE2 != 0 >>> + svbfloat16_t svscale[_bf16]_m (svbool_t pg, svbfloat16_t zdn, svint16_t >>> zm); >>> + svbfloat16_t svscale[_bf16]_x (svbool_t pg, svbfloat16_t zdn, svint16_t >>> zm); >>> + svbfloat16_t svscale[_bf16]_z (svbool_t pg, svbfloat16_t zdn, svint16_t >>> zm); >>> + svbfloat16_t svscale[_n_bf16]_m (svbool_t pg, svbfloat16_t zdn, int16_t >>> zm); >>> + svbfloat16_t svscale[_n_bf16]_x (svbool_t pg, svbfloat16_t zdn, int16_t >>> zm); >>> + svbfloat16_t svscale[_n_bf16]_z (svbool_t pg, svbfloat16_t zdn, int16_t >>> zm); */ >>> +#define REQUIRED_EXTENSIONS \ >>> + sve_and_sme (AARCH64_FL_SVE2 | AARCH64_FL_SVE_BFSCALE, \ >>> + AARCH64_FL_SME2 | AARCH64_FL_SVE_BFSCALE) >>> +DEF_SVE_FUNCTION (svscale, binary_int_opt_n, h_bfloat, mxz) >>> +#undef REQUIRED_EXTENSIONS >>> + >>> +/* >>> +- BFSCALE (multiple vectors) >>> + // Only if __ARM_FEATURE_SVE_BFSCALE != 0 && __ARM_FEATURE_SME2 != 0 >>> + svbfloat16x2_t svscale[_bf16_x2] (svbfloat16x2_t zdn, svint16x2_t zm) >>> __arm_streaming; >>> + svbfloat16x4_t svscale[_bf16_x4] (svbfloat16x4_t zdn, svint16x4_t zm) >>> __arm_streaming; >>> + >>> +- BFSCALE (multiple and single vector) >>> + // Only if __ARM_FEATURE_SVE_BFSCALE != 0 && __ARM_FEATURE_SME2 != 0 >>> + svbfloat16x2_t svscale[_single_bf16_x2] (svbfloat16x2_t zn, svint16_t >>> zm) __arm_streaming; >>> + svbfloat16x4_t svscale[_single_bf16_x4] (svbfloat16x4_t zn, svint16_t >>> zm) __arm_streaming; */ >>> +#define REQUIRED_EXTENSIONS streaming_only (AARCH64_FL_SVE_BFSCALE | >>> AARCH64_FL_SME2) >>> +DEF_SVE_FUNCTION_GS (svscale, binary_int_opt_single_n, h_bfloat, x24, none) >>> +#undef REQUIRED_EXTENSIONS >>> + >>> +/* >>> +- BFMUL (multiple vectors) >>> + // Only if __ARM_FEATURE_SVE_BFSCALE != 0 && __ARM_FEATURE_SME2 != 0 >>> + svbfloat16x2_t svmul[_bf16_x2] (svbfloat16x2_t zdn, svbfloat16x2_t zm) >>> __arm_streaming; >>> + svbfloat16x4_t svmul[_bf16_x4] (svbfloat16x4_t zdn, svbfloat16x4_t zm) >>> __arm_streaming; >>> + >>> +- BFMUL (multiple and single vector) >>> + // Only if __ARM_FEATURE_SVE_BFSCALE != 0 && __ARM_FEATURE_SME2 != 0 >>> + svbfloat16x2_t svmul[_single_bf16_x2] (svbfloat16x2_t zn, svbfloat16x2_t >>> zm) __arm_streaming; >>> + svbfloat16x4_t svmul[_single_bf16_x4] (svbfloat16x4_t zn, svbfloat16x4_t >>> zm) __arm_streaming; */ >>> +#define REQUIRED_EXTENSIONS streaming_only (AARCH64_FL_SVE_BFSCALE | >>> AARCH64_FL_SME2) >>> +DEF_SVE_FUNCTION_GS (svmul, binary_opt_single_n, h_bfloat, x24, none) >>> +#undef REQUIRED_EXTENSIONS >>> diff --git a/gcc/config/aarch64/aarch64-sve.md >>> b/gcc/config/aarch64/aarch64-sve.md >>> index 846ed5c65d2..97fd9516959 100644 >>> --- a/gcc/config/aarch64/aarch64-sve.md >>> +++ b/gcc/config/aarch64/aarch64-sve.md >>> @@ -5529,6 
+5529,7 @@ (define_insn_and_rewrite >>> "*cond_<sve_int_op><mode>_any" >>> ;; ------------------------------------------------------------------------- >>> ;; Includes: >>> ;; - FSCALE >>> +;; - BFSCALE (SVE_BFSCALE) >>> ;; - FTSMUL >>> ;; - FTSSEL >>> ;; ------------------------------------------------------------------------- >>> @@ -5566,15 +5567,15 @@ (define_insn "@aarch64_sve_<optab><mode>" >>> (define_insn "@aarch64_pred_<optab><mode>" >>> [(set (match_operand:SVE_FULL_F_SCALAR 0 "register_operand") >>> (unspec:SVE_FULL_F_SCALAR >>> - [(match_operand:<VPRED> 1 "register_operand") >>> - (match_operand:SI 4 "aarch64_sve_gp_strictness") >>> + [(match_operand:<VPRED> 1 "register_operand") >>> + (match_operand:SI 4 "aarch64_sve_gp_strictness") >>> (match_operand:SVE_FULL_F_SCALAR 2 "register_operand") >>> - (match_operand:<V_INT_EQUIV> 3 "register_operand")] >>> + (match_operand:<V_INT_EQUIV> 3 "register_operand")] >>> SVE_COND_FP_BINARY_INT))] >>> "TARGET_SVE" >>> - {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ] >>> - [ w , Upl , 0 , w ; * ] >>> <sve_fp_op>\t%Z0.<Vetype>, %1/m, %Z0.<Vetype>, %Z3.<Vetype> >>> - [ ?&w , Upl , w , w ; yes ] movprfx\t%Z0, >>> %Z2\;<sve_fp_op>\t%Z0.<Vetype>, %1/m, %Z0.<Vetype>, %Z3.<Vetype> >>> + {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ] >>> + [ w , Upl , 0 , w ; * ] <b><sve_fp_op>\t%Z0.<Vetype>, %1/m, >>> %Z0.<Vetype>, %Z3.<Vetype> >>> + [ ?&w , Upl , w , w ; yes ] movprfx\t%Z0, >>> %Z2\;<b><sve_fp_op>\t%Z0.<Vetype>, %1/m, %Z0.<Vetype>, %Z3.<Vetype> >>> } >>> [(set_attr "sve_type" "sve_fp_mul")] >>> ) >>> @@ -5582,16 +5583,16 @@ (define_insn "@aarch64_pred_<optab><mode>" >>> ;; Predicated floating-point binary operations with merging, taking an >>> ;; integer as their second operand. >>> (define_expand "@cond_<optab><mode>" >>> - [(set (match_operand:SVE_FULL_F 0 "register_operand") >>> - (unspec:SVE_FULL_F >>> + [(set (match_operand:SVE_FULL_F_BFSCALE 0 "register_operand") >>> + (unspec:SVE_FULL_F_BFSCALE >>> [(match_operand:<VPRED> 1 "register_operand") >>> - (unspec:SVE_FULL_F >>> + (unspec:SVE_FULL_F_BFSCALE >>> [(match_dup 1) >>> (const_int SVE_STRICT_GP) >>> - (match_operand:SVE_FULL_F 2 "register_operand") >>> - (match_operand:<V_INT_EQUIV> 3 "register_operand")] >>> + (match_operand:SVE_FULL_F_BFSCALE 2 "register_operand") >>> + (match_operand:<V_INT_EQUIV> 3 "register_operand")] >>> SVE_COND_FP_BINARY_INT) >>> - (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")] >>> + (match_operand:SVE_FULL_F_BFSCALE 4 "aarch64_simd_reg_or_zero")] >>> UNSPEC_SEL))] >>> "TARGET_SVE" >>> ) >>> @@ -5599,21 +5600,21 @@ (define_expand "@cond_<optab><mode>" >>> ;; Predicated floating-point binary operations that take an integer as their >>> ;; second operand, with inactive lanes coming from the first operand. 
>>> (define_insn_and_rewrite "*cond_<optab><mode>_2_relaxed" >>> - [(set (match_operand:SVE_FULL_F 0 "register_operand") >>> - (unspec:SVE_FULL_F >>> + [(set (match_operand:SVE_FULL_F_BFSCALE 0 "register_operand") >>> + (unspec:SVE_FULL_F_BFSCALE >>> [(match_operand:<VPRED> 1 "register_operand") >>> - (unspec:SVE_FULL_F >>> + (unspec:SVE_FULL_F_BFSCALE >>> [(match_operand 4) >>> (const_int SVE_RELAXED_GP) >>> - (match_operand:SVE_FULL_F 2 "register_operand") >>> + (match_operand:SVE_FULL_F_BFSCALE 2 "register_operand") >>> (match_operand:<V_INT_EQUIV> 3 "register_operand")] >>> SVE_COND_FP_BINARY_INT) >>> (match_dup 2)] >>> UNSPEC_SEL))] >>> "TARGET_SVE" >>> - {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ] >>> - [ w , Upl , 0 , w ; * ] <sve_fp_op>\t%0.<Vetype>, >>> %1/m, %0.<Vetype>, %3.<Vetype> >>> - [ ?&w , Upl , w , w ; yes ] movprfx\t%0, >>> %2\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> >>> + {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ] >>> + [ w , Upl , 0 , w ; * ] <b><sve_fp_op>\t%0.<Vetype>, %1/m, >>> %0.<Vetype>, %3.<Vetype> >>> + [ ?&w , Upl , w , w ; yes ] movprfx\t%0, >>> %2\;<b><sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> >>> } >>> "&& !rtx_equal_p (operands[1], operands[4])" >>> { >>> @@ -5623,21 +5624,21 @@ (define_insn_and_rewrite >>> "*cond_<optab><mode>_2_relaxed" >>> ) >>> >>> (define_insn "*cond_<optab><mode>_2_strict" >>> - [(set (match_operand:SVE_FULL_F 0 "register_operand") >>> - (unspec:SVE_FULL_F >>> + [(set (match_operand:SVE_FULL_F_BFSCALE 0 "register_operand") >>> + (unspec:SVE_FULL_F_BFSCALE >>> [(match_operand:<VPRED> 1 "register_operand") >>> - (unspec:SVE_FULL_F >>> + (unspec:SVE_FULL_F_BFSCALE >>> [(match_dup 1) >>> (const_int SVE_STRICT_GP) >>> - (match_operand:SVE_FULL_F 2 "register_operand") >>> + (match_operand:SVE_FULL_F_BFSCALE 2 "register_operand") >>> (match_operand:<V_INT_EQUIV> 3 "register_operand")] >>> SVE_COND_FP_BINARY_INT) >>> (match_dup 2)] >>> UNSPEC_SEL))] >>> "TARGET_SVE" >>> - {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ] >>> - [ w , Upl , 0 , w ; * ] <sve_fp_op>\t%0.<Vetype>, >>> %1/m, %0.<Vetype>, %3.<Vetype> >>> - [ ?&w , Upl , w , w ; yes ] movprfx\t%0, >>> %2\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> >>> + {@ [ cons: =0 , 1 , 2 , 3 ; attrs: movprfx ] >>> + [ w , Upl , 0 , w ; * ] <b><sve_fp_op>\t%0.<Vetype>, %1/m, >>> %0.<Vetype>, %3.<Vetype> >>> + [ ?&w , Upl , w , w ; yes ] movprfx\t%0, >>> %2\;<b><sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> >>> } >>> [(set_attr "sve_type" "sve_fp_mul")] >>> ) >>> @@ -5646,22 +5647,22 @@ (define_insn "*cond_<optab><mode>_2_strict" >>> ;; their second operand, with the values of inactive lanes being distinct >>> ;; from the other inputs. 
>>> (define_insn_and_rewrite "*cond_<optab><mode>_any_relaxed" >>> - [(set (match_operand:SVE_FULL_F 0 "register_operand") >>> - (unspec:SVE_FULL_F >>> + [(set (match_operand:SVE_FULL_F_BFSCALE 0 "register_operand") >>> + (unspec:SVE_FULL_F_BFSCALE >>> [(match_operand:<VPRED> 1 "register_operand") >>> - (unspec:SVE_FULL_F >>> + (unspec:SVE_FULL_F_BFSCALE >>> [(match_operand 5) >>> (const_int SVE_RELAXED_GP) >>> - (match_operand:SVE_FULL_F 2 "register_operand") >>> + (match_operand:SVE_FULL_F_BFSCALE 2 "register_operand") >>> (match_operand:<V_INT_EQUIV> 3 "register_operand")] >>> SVE_COND_FP_BINARY_INT) >>> - (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")] >>> + (match_operand:SVE_FULL_F_BFSCALE 4 "aarch64_simd_reg_or_zero")] >>> UNSPEC_SEL))] >>> "TARGET_SVE && !rtx_equal_p (operands[2], operands[4])" >>> {@ [ cons: =0 , 1 , 2 , 3 , 4 ] >>> - [ &w , Upl , 0 , w , Dz ] movprfx\t%0.<Vetype>, %1/z, >>> %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> >>> - [ &w , Upl , w , w , Dz ] movprfx\t%0.<Vetype>, %1/z, >>> %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> >>> - [ &w , Upl , w , w , 0 ] movprfx\t%0.<Vetype>, %1/m, >>> %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> >>> + [ &w , Upl , 0 , w , Dz ] movprfx\t%0.<Vetype>, %1/z, >>> %2.<Vetype>\;<b><sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> >>> + [ &w , Upl , w , w , Dz ] movprfx\t%0.<Vetype>, %1/z, >>> %2.<Vetype>\;<b><sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> >>> + [ &w , Upl , w , w , 0 ] movprfx\t%0.<Vetype>, %1/m, >>> %2.<Vetype>\;<b><sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> >>> [ ?&w , Upl , w , w , w ] # >>> } >>> "&& 1" >>> @@ -5684,22 +5685,22 @@ (define_insn_and_rewrite >>> "*cond_<optab><mode>_any_relaxed" >>> ) >>> >>> (define_insn_and_rewrite "*cond_<optab><mode>_any_strict" >>> - [(set (match_operand:SVE_FULL_F 0 "register_operand") >>> - (unspec:SVE_FULL_F >>> + [(set (match_operand:SVE_FULL_F_BFSCALE 0 "register_operand") >>> + (unspec:SVE_FULL_F_BFSCALE >>> [(match_operand:<VPRED> 1 "register_operand") >>> - (unspec:SVE_FULL_F >>> + (unspec:SVE_FULL_F_BFSCALE >>> [(match_dup 1) >>> (const_int SVE_STRICT_GP) >>> - (match_operand:SVE_FULL_F 2 "register_operand") >>> - (match_operand:<V_INT_EQUIV> 3 "register_operand")] >>> + (match_operand:SVE_FULL_F_BFSCALE 2 "register_operand") >>> + (match_operand:<V_INT_EQUIV> 3 "register_operand")] >>> SVE_COND_FP_BINARY_INT) >>> - (match_operand:SVE_FULL_F 4 "aarch64_simd_reg_or_zero")] >>> + (match_operand:SVE_FULL_F_BFSCALE 4 "aarch64_simd_reg_or_zero")] >>> UNSPEC_SEL))] >>> "TARGET_SVE && !rtx_equal_p (operands[2], operands[4])" >>> {@ [ cons: =0 , 1 , 2 , 3 , 4 ] >>> - [ &w , Upl , 0 , w , Dz ] movprfx\t%0.<Vetype>, %1/z, >>> %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> >>> - [ &w , Upl , w , w , Dz ] movprfx\t%0.<Vetype>, %1/z, >>> %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> >>> - [ &w , Upl , w , w , 0 ] movprfx\t%0.<Vetype>, %1/m, >>> %2.<Vetype>\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> >>> + [ &w , Upl , 0 , w , Dz ] movprfx\t%0.<Vetype>, %1/z, >>> %2.<Vetype>\;<b><sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> >>> + [ &w , Upl , w , w , Dz ] movprfx\t%0.<Vetype>, %1/z, >>> %2.<Vetype>\;<b><sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> >>> + [ &w , Upl , w , w , 0 ] movprfx\t%0.<Vetype>, %1/m, >>> %2.<Vetype>\;<b><sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype> >>> [ ?&w 
, Upl , w , w , w ] # >>> } >>> "&& reload_completed >>> diff --git a/gcc/config/aarch64/aarch64-sve2.md >>> b/gcc/config/aarch64/aarch64-sve2.md >>> index 1278b9081cd..89e9bf9690c 100644 >>> --- a/gcc/config/aarch64/aarch64-sve2.md >>> +++ b/gcc/config/aarch64/aarch64-sve2.md >>> @@ -61,6 +61,7 @@ >>> ;; ---- [FP] Non-widening bfloat16 arithmetic >>> ;; ---- [FP] Clamp to minimum/maximum >>> ;; ---- [FP] Scaling by powers of two >>> +;; ---- [FP] Multiplication >>> ;; >>> ;; == Uniform ternary arithmnetic >>> ;; ---- [INT] General ternary arithmetic that maps to unspecs >>> @@ -1492,26 +1493,74 @@ (define_insn "@aarch64_sve_fclamp_single<mode>" >>> ;; ------------------------------------------------------------------------- >>> ;; Includes the multiple and single vector and multiple vectors forms of >>> ;; - FSCALE >>> +;; - BFSCALE >> It would be good to have the (SVE_BFSCALE) in the comment here to keep >> consistent with the practice of highlighting non-default ISA extensions >> needed for various instructions. > Fixed, thanks. >> >>> ;; ------------------------------------------------------------------------- >>> >>> +;; FSCALE / BFSCALE (multiple vectors) >>> +;; sv{b}floatNx2_t svscale[_{b}fN_x2] (sv{b}floatNx2_t zdn, svintNx2_t zm) >>> __arm_streaming; >>> +;; sv{b}floatNx4_t svscale[_{b}fN_x4] (sv{b}floatNx4_t zdn, svintNx4_t zm) >>> __arm_streaming; >>> +;; {B}FSCALE { <Zdn1>.T-<Zdn2>.T }, { <Zdn1>.T-<Zdn2>.T }, { >>> <Zm1>.H-<Zm2>.T } >>> +;; {B}FSCALE { <Zdn1>.T-<Zdn4>.T }, { <Zdn1>.T-<Zdn4>.T }, { >>> <Zm1>.H-<Zm4>.T } >>> (define_insn "@aarch64_sve_fscale<mode>" >>> - [(set (match_operand:SVE_Fx24_NOBF 0 "register_operand" >>> "=Uw<vector_count>") >>> - (unspec:SVE_Fx24_NOBF >>> - [(match_operand:SVE_Fx24_NOBF 1 "register_operand" "0") >>> + [(set (match_operand:SVE_Fx24_BFSCALE 0 "register_operand" >>> "=Uw<vector_count>") >>> + (unspec:SVE_Fx24_BFSCALE >>> + [(match_operand:SVE_Fx24_BFSCALE 1 "register_operand" "0") >>> (match_operand:<SVSCALE_INTARG> 2 "register_operand" "Uw<vector_count>")] >>> UNSPEC_FSCALE))] >>> - "TARGET_STREAMING_SME2 && TARGET_FP8" >>> - "fscale\t%0, %1, %2" >>> + "TARGET_STREAMING_SME2 && (<is_bf16> ? TARGET_SVE_BFSCALE : TARGET_FP8)" >> Is this instruction supposed to work outside of SME2/SSVE? I think this >> condition doesn’t allow the instruction for non-SME targets. Or am I >> misremembering the TARGET_* definitions here? >> Otherwise looks ok to me. >> Thanks, >> Kyrill > > The documentation for the instructions > (https://armv8.arm.com/latest_builds/v9A/isa64/fscale_mz_zzv.xml) state they > are only available when SME2 is enabled,
That link is internal btw. > and the documentation for the intrinsics > (https://arm-software.github.io/acle/main/acle.html#fscale) says that > streaming mode must be enabled. Ah ok, so FEAT_SVE_BFSCALE introduces SVE-only BFSCALE instructions, but also streaming mode variants, which is what this pattern is implementing: https://developer.arm.com/documentation/ddi0602/2024-12/SME-Instructions/BFSCALE--multiple-vectors---Multi-vector-BFloat16-adjust-exponent-?lang=en Ok, no objections from me then. Thanks, Kyrill > >>> + "<b>fscale\t%0, %1, %2" >>> ) >>> >>> +;; FSCALE / BFSCALE (multiple and single vector) >>> +;; sv{b}floatNx2_t svscale[_single_{b}fN_x2] (sv{b}floatNx2_t zdn, >>> svintN_t zm) __arm_streaming; >>> +;; sv{b}floatNx4_t svscale[_single_{b}fN_x4] (sv{b}floatNx4_t zdn, >>> svintN_t zm) __arm_streaming; >>> +;; {B}FSCALE { <Zdn1>.T-<Zdn2>.T }, { <Zdn1>.T-<Zdn2>.T }, <Zm1>.T >>> +;; {B}FSCALE { <Zdn1>.T-<Zdn4>.T }, { <Zdn1>.T-<Zdn4>.T }, <Zm1>.T >>> (define_insn "@aarch64_sve_single_fscale<mode>" >>> - [(set (match_operand:SVE_Fx24_NOBF 0 "register_operand" >>> "=Uw<vector_count>") >>> - (unspec:SVE_Fx24_NOBF >>> - [(match_operand:SVE_Fx24_NOBF 1 "register_operand" "0") >>> + [(set (match_operand:SVE_Fx24_BFSCALE 0 "register_operand" >>> "=Uw<vector_count>") >>> + (unspec:SVE_Fx24_BFSCALE >>> + [(match_operand:SVE_Fx24_BFSCALE 1 "register_operand" "0") >>> (match_operand:<SVSCALE_SINGLE_INTARG> 2 "register_operand" "x")] >>> UNSPEC_FSCALE))] >>> - "TARGET_STREAMING_SME2 && TARGET_FP8" >>> - "fscale\t%0, %1, %2.<Vetype>" >>> + "TARGET_STREAMING_SME2 && (<is_bf16> ? TARGET_SVE_BFSCALE : TARGET_FP8)" >>> + "<b>fscale\t%0, %1, %2.<Vetype>" >>> +) >>> + >>> +;; >>> ------------------------------------------------------------------------- >>> +;; ---- [FP] Multiplication >>> +;; >>> ------------------------------------------------------------------------- >>> +;; Includes the multiple and single vector and multiple vectors forms of >>> +;; - BFMUL >>> +;; >>> ------------------------------------------------------------------------- >>> + >>> +;; BFMUL (multiple vectors) >>> +;; svbfloat16x2_t svmul[_bf16_x2](svbfloat16x2_t zd, svbfloat16x2_t zm) >>> __arm_streaming; >>> +;; svbfloat16x4_t svmul[_bf16_x4](svbfloat16x4_t zd, svbfloat16x4_t zm) >>> __arm_streaming; >>> +;; BFMUL { <Zd1>.H-<Zd2>.H }, { <Zn1>.H-<Zn2>.H }, { <Zm1>.H-<Zm2>.H } >>> +;; BFMUL { <Zd1>.H-<Zd4>.H }, { <Zn1>.H-<Zn4>.H }, { <Zm1>.H-<Zm4>.H } >>> +(define_insn "@aarch64_sve_<optab><mode>" >>> + [(set (match_operand:SVE_BFx24 0 "register_operand" "=Uw<vector_count>") >>> + (unspec:SVE_BFx24 >>> + [(match_operand:SVE_BFx24 1 "register_operand" "Uw<vector_count>") >>> + (match_operand:SVE_BFx24 2 "register_operand" "Uw<vector_count>")] >>> + SVE_FP_MUL))] >>> + "TARGET_STREAMING_SME2 && TARGET_SVE_BFSCALE" >>> + "bfmul\t%0, %1, %2" >>> +) >>> + >>> +;; BFMUL (multiple and single vector) >>> +;; svbfloat16x2_t svmul[_single_bf16_x2](svbfloat16x2_t zd, svbfloat16_t >>> zm) __arm_streaming; >>> +;; svbfloat16x4_t svmul[_single_bf16_x4](svbfloat16x4_t zd, svbfloat16_t >>> zm) __arm_streaming; >>> +;; BFMUL { <Zd1>.H-<Zd2>.H }, { <Zn1>.H-<Zn2>.H }, <Zm>.H >>> +;; BFMUL { <Zd1>.H-<Zd4>.H }, { <Zn1>.H-<Zn4>.H }, <Zm>.H >>> +(define_insn "@aarch64_sve_<optab><mode>_single" >>> + [(set (match_operand:SVE_BFx24 0 "register_operand" "=Uw<vector_count>") >>> + (unspec:SVE_BFx24 >>> + [(match_operand:SVE_BFx24 1 "register_operand" "Uw<vector_count>") >>> + (match_operand:<VSINGLE> 2 "register_operand" "x")] >>> + SVE_FP_MUL))] >>> + 
"TARGET_STREAMING_SME2 && TARGET_SVE_BFSCALE" >>> + "bfmul\t%0, %1, %2.h" >>> ) >>> >>> ;; ========================================================================= >>> @@ -4704,4 +4753,3 @@ (define_insn >>> "@aarch64_sve2_<sve_fp_op><VNx4SF_ONLY:mode><VNx8HF_ONLY:mode>" >>> } >>> [(set_attr "sve_type" "sve_fp_mul")] >>> ) >>> - >>> diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h >>> index 1dd942f377f..21f9682ef97 100644 >>> --- a/gcc/config/aarch64/aarch64.h >>> +++ b/gcc/config/aarch64/aarch64.h >>> @@ -385,6 +385,7 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE >>> ATTRIBUTE_UNUSED >>> #define TARGET_BF16_FP AARCH64_HAVE_ISA (BF16) >>> #define TARGET_BF16_SIMD (TARGET_BF16_FP && TARGET_SIMD) >>> #define TARGET_SVE_BF16 (TARGET_BF16_FP && TARGET_SVE) >>> +#define TARGET_SVE_BFSCALE (AARCH64_HAVE_ISA (SVE_BFSCALE)) >>> >>> /* PAUTH instructions are enabled through +pauth. */ >>> #define TARGET_PAUTH AARCH64_HAVE_ISA (PAUTH) >>> diff --git a/gcc/config/aarch64/iterators.md >>> b/gcc/config/aarch64/iterators.md >>> index b425b0ed2ca..39b1e84edcc 100644 >>> --- a/gcc/config/aarch64/iterators.md >>> +++ b/gcc/config/aarch64/iterators.md >>> @@ -501,10 +501,13 @@ (define_mode_iterator SVE_PARTIAL_F [VNx2HF VNx4HF >>> VNx2SF]) >>> (define_mode_iterator SVE_F [SVE_PARTIAL_F SVE_FULL_F]) >>> >>> ;; Fully-packed SVE floating-point vector modes and their scalar >>> equivalents. >>> -(define_mode_iterator SVE_FULL_F_SCALAR [SVE_FULL_F GPF_HF]) >>> +(define_mode_iterator SVE_FULL_F_SCALAR [SVE_FULL_F GPF_HF (VNx8BF >>> "TARGET_SVE_BFSCALE")]) >>> >>> (define_mode_iterator SVE_FULL_F_B16B16 [(VNx8BF "TARGET_SSVE_B16B16") >>> SVE_FULL_F]) >>> >>> +(define_mode_iterator SVE_FULL_F_BFSCALE [SVE_FULL_F >>> + (VNx8BF "TARGET_SVE_BFSCALE")]) >>> + >>> (define_mode_iterator SVE_PARTIAL_F_B16B16 [(VNx2BF "TARGET_SSVE_B16B16") >>> (VNx4BF "TARGET_SSVE_B16B16") >>> SVE_PARTIAL_F]) >>> @@ -746,10 +749,18 @@ (define_mode_iterator SVE_Ix24 [VNx32QI VNx16HI >>> VNx8SI VNx4DI >>> (define_mode_iterator SVE_Fx24_NOBF [VNx16HF VNx8SF VNx4DF >>> VNx32HF VNx16SF VNx8DF]) >>> >>> +(define_mode_iterator SVE_Fx24_BFSCALE [ >>> + SVE_Fx24_NOBF >>> + (VNx16BF "TARGET_SVE_BFSCALE") ;; bf16x2 >>> + (VNx32BF "TARGET_SVE_BFSCALE") ;; bf16x4 >>> +]) >>> + >>> (define_mode_iterator SVE_Fx24 [(VNx16BF "TARGET_SSVE_B16B16") >>> (VNx32BF "TARGET_SSVE_B16B16") >>> SVE_Fx24_NOBF]) >>> >>> +(define_mode_iterator SVE_BFx24 [VNx16BF VNx32BF]) >>> + >>> (define_mode_iterator SVE_SFx24 [VNx8SF VNx16SF]) >>> >>> ;; The modes used to represent different ZA access sizes. >>> @@ -824,6 +835,7 @@ (define_c_enum "unspec" >>> UNSPEC_FMAX ; Used in aarch64-simd.md. >>> UNSPEC_FMAXNMV ; Used in aarch64-simd.md. >>> UNSPEC_FMAXV ; Used in aarch64-simd.md. >>> + UNSPEC_FMUL ; Used in aarch64-sve2.md. >>> UNSPEC_FMIN ; Used in aarch64-simd.md. >>> UNSPEC_FMINNMV ; Used in aarch64-simd.md. >>> UNSPEC_FMINV ; Used in aarch64-simd.md. 
>>> @@ -2211,6 +2223,8 @@ (define_mode_attr V_INT_EQUIV [(V8QI "V8QI") (V16QI >>> "V16QI") >>> (VNx16QI "VNx16QI") >>> (VNx8HI "VNx8HI") (VNx8HF "VNx8HI") >>> (VNx8BF "VNx8HI") >>> + (VNx16BF "VNx16HI") >>> + (VNx32BF "VNx32HI") >>> (VNx4SI "VNx4SI") (VNx4SF "VNx4SI") >>> (VNx2DI "VNx2DI") (VNx2DF "VNx2DI") >>> (VNx8SF "VNx8SI") (VNx16SF "VNx16SI") >>> @@ -2792,17 +2806,20 @@ (define_mode_attr vec_or_offset [(V8QI "vec") >>> (V16QI "vec") (V4HI "vec") >>> (V8HI "vec") (V2SI "vec") (V4SI "vec") >>> (V2DI "vec") (DI "offset")]) >>> >>> -(define_mode_attr b [(V4BF "b") (V4HF "") (V8BF "b") (V8HF "") >>> +(define_mode_attr b [(BF "b") (HF "") (SF "") (DF "") >>> + (V4BF "b") (V4HF "") (V8BF "b") (V8HF "") >>> (VNx2BF "b") (VNx2HF "") (VNx2SF "") >>> (VNx4BF "b") (VNx4HF "") (VNx4SF "") >>> (VNx8BF "b") (VNx8HF "") (VNx2DF "") >>> (VNx16BF "b") (VNx16HF "") (VNx8SF "") (VNx4DF "") >>> (VNx32BF "b") (VNx32HF "") (VNx16SF "") (VNx8DF "")]) >>> >>> -(define_mode_attr is_bf16 [(VNx2BF "true") (VNx4BF "true") (VNx8BF "true") >>> - (VNx2HF "false") (VNx4HF "false") (VNx8HF "false") >>> - (VNx2SF "false") (VNx4SF "false") >>> - (VNx2DF "false")]) >>> +(define_mode_attr is_bf16 [ >>> + (VNx2BF "true") (VNx4BF "true") (VNx8BF "true") (VNx16BF "true") >>> (VNx32BF "true") >>> + (VNx2HF "false") (VNx4HF "false") (VNx8HF "false") (VNx16HF "false") >>> (VNx32HF "false") >>> + (VNx2SF "false") (VNx4SF "false") (VNx8SF "false") (VNx16SF "false") >>> + (VNx2DF "false") (VNx4DF "false") (VNx8DF "false") >>> +]) >>> >>> (define_mode_attr aligned_operand [(VNx16QI "register_operand") >>> (VNx8HI "register_operand") >>> @@ -2829,22 +2846,29 @@ (define_mode_attr LD1_EXTENDQ_MEM [(VNx4SI >>> "VNx1SI") (VNx4SF "VNx1SI") >>> >>> ;; Maps the output type of svscale to the corresponding int vector type in >>> the >>> ;; second argument. 
>>> -(define_mode_attr SVSCALE_SINGLE_INTARG [(VNx16HF "VNx8HI") ;; f16_x2 -> >>> s16 >>> - (VNx32HF "VNx8HI") ;; f16_x4 -> s16 >>> - (VNx8SF "VNx4SI") ;; f32_x2 -> s32 >>> - (VNx16SF "VNx4SI") ;; f32_x4 -> s32 >>> - (VNx4DF "VNx2DI") ;; f64_x2 -> s64 >>> - (VNx8DF "VNx2DI") ;; f64_x4 -> s64 >>> +(define_mode_attr SVSCALE_SINGLE_INTARG [ >>> + (VNx16HF "VNx8HI") ;; f16_x2 -> s16 >>> + (VNx32HF "VNx8HI") ;; f16_x4 -> s16 >>> + (VNx16BF "VNx8HI") ;; bf16_x2 -> s16 >>> + (VNx32BF "VNx8HI") ;; bf16_x4 -> s16 >>> + (VNx8SF "VNx4SI") ;; f32_x2 -> s32 >>> + (VNx16SF "VNx4SI") ;; f32_x4 -> s32 >>> + (VNx4DF "VNx2DI") ;; f64_x2 -> s64 >>> + (VNx8DF "VNx2DI") ;; f64_x4 -> s64 >>> ]) >>> >>> -(define_mode_attr SVSCALE_INTARG [(VNx16HF "VNx16HI") ;; f16_x2 -> s16x2 >>> - (VNx32HF "VNx32HI") ;; f16_x4 -> s16x4 >>> - (VNx8SF "VNx8SI") ;; f32_x2 -> s32_x2 >>> - (VNx16SF "VNx16SI") ;; f32_x4 -> s32_x4 >>> - (VNx4DF "VNx4DI") ;; f64_x2 -> s64_x2 >>> - (VNx8DF "VNx8DI") ;; f64_x4 -> s64_x4 >>> +(define_mode_attr SVSCALE_INTARG [ >>> + (VNx16HF "VNx16HI") ;; f16_x2 -> s16x2 >>> + (VNx32HF "VNx32HI") ;; f16_x4 -> s16x4 >>> + (VNx16BF "VNx16HI") ;; bf16_x2 -> s16x2 >>> + (VNx32BF "VNx32HI") ;; bf16_x4 -> s16x4 >>> + (VNx8SF "VNx8SI") ;; f32_x2 -> s32_x2 >>> + (VNx16SF "VNx16SI") ;; f32_x4 -> s32_x4 >>> + (VNx4DF "VNx4DI") ;; f64_x2 -> s64_x2 >>> + (VNx8DF "VNx8DI") ;; f64_x4 -> s64_x4 >>> ]) >>> >>> + >>> ;; ------------------------------------------------------------------- >>> ;; Code Iterators >>> ;; ------------------------------------------------------------------- >>> @@ -3644,6 +3668,8 @@ (define_int_iterator SVE_COND_FP_ADD >>> [UNSPEC_COND_FADD]) >>> (define_int_iterator SVE_COND_FP_SUB [UNSPEC_COND_FSUB]) >>> (define_int_iterator SVE_COND_FP_MUL [UNSPEC_COND_FMUL]) >>> >>> +(define_int_iterator SVE_FP_MUL [UNSPEC_FMUL]) >>> + >>> (define_int_iterator SVE_COND_FP_BINARY_I1 [UNSPEC_COND_FMAX >>> UNSPEC_COND_FMAXNM >>> UNSPEC_COND_FMIN >>> @@ -4205,6 +4231,7 @@ (define_int_attr optab [(UNSPEC_ANDF "and") >>> (UNSPEC_FMINNMQV "fminnmqv") >>> (UNSPEC_FMINNMV "smin") >>> (UNSPEC_FMINV "smin_nan") >>> + (UNSPEC_FMUL "fmul") >>> (UNSPEC_SMUL_HIGHPART "smulh") >>> (UNSPEC_UMUL_HIGHPART "umulh") >>> (UNSPEC_FMLA "fma") >>> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi >>> index 87c0470c3db..bb43bf27658 100644 >>> --- a/gcc/doc/invoke.texi >>> +++ b/gcc/doc/invoke.texi >>> @@ -23621,7 +23621,8 @@ Enable the Checked Pointer Arithmetic instructions. >>> @item sve-b16b16 >>> Enable the SVE non-widening brain floating-point (@code{bf16}) extension. >>> This only has an effect when @code{sve2} or @code{sme2} are also enabled. >>> - >>> +@item sve-bfscale >>> +Enable the SVE_BFSCALE extension. 
>>> @end table >>> >>> Feature @option{crypto} implies @option{aes}, @option{sha2}, and >>> @option{simd}, >>> diff --git a/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_4.c >>> b/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_4.c >>> index 284c2a23252..70f59b47aee 100644 >>> --- a/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_4.c >>> +++ b/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_4.c >>> @@ -111,6 +111,11 @@ >>> #error Foo >>> #endif >>> >>> +#pragma GCC target "+nothing+sve-bfscale" >>> +#ifndef __ARM_FEATURE_SVE_BFSCALE >>> +#error "__ARM_FEATURE_SVE_BFSCALE should be defined but isn't" >>> +#endif >>> + >>> #pragma GCC target "+nothing+sve2+sme-f8f16" >>> #ifndef __ARM_FEATURE_SME_F8F16 >>> #error Foo >>> diff --git a/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/mul_bf16_x2.c >>> b/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/mul_bf16_x2.c >>> new file mode 100644 >>> index 00000000000..d9e6ad1bb4f >>> --- /dev/null >>> +++ b/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/mul_bf16_x2.c >>> @@ -0,0 +1,193 @@ >>> +/* { dg-do assemble { target aarch64_asm_sve-bfscale_ok } } */ >>> +/* { dg-do compile { target { ! aarch64_asm_sve-bfscale_ok } } } */ >>> +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ >>> + >>> +#include "test_sme2_acle.h" >>> +#pragma GCC target "+sve-bfscale" >>> + >>> +/* >>> +** mul_z0_z0_z4: >>> +** bfmul {z0\.h - z1\.h}, {z0\.h - z1\.h}, {z4\.h - z5\.h} >>> +** ret >>> +*/ >>> +TEST_XN (mul_z0_z0_z4, svbfloat16x2_t, z0, >>> + svmul_bf16_x2 (z0, z4), >>> + svmul (z0, z4)) >>> + >>> +/* >>> +** mul_z0_z4_z0: >>> +** bfmul {z0\.h - z1\.h}, {z4\.h - z5\.h}, {z0\.h - z1\.h} >>> +** ret >>> +*/ >>> +TEST_XN (mul_z0_z4_z0, svbfloat16x2_t, z0, >>> + svmul_bf16_x2 (z4, z0), >>> + svmul (z4, z0)) >>> + >>> +/* >>> +** mul_z0_z4_z28: >>> +** bfmul {z0\.h - z1\.h}, {z4\.h - z5\.h}, {z28\.h - z29\.h} >>> +** ret >>> +*/ >>> +TEST_XN (mul_z0_z4_z28, svbfloat16x2_t, z0, >>> + svmul_bf16_x2 (z4, z28), >>> + svmul (z4, z28)) >>> + >>> +/* >>> +** mul_z18_z18_z4: >>> +** bfmul {z18\.h - z19\.h}, {z18\.h - z19\.h}, {z4\.h - z5\.h} >>> +** ret >>> +*/ >>> +TEST_XN (mul_z18_z18_z4, svbfloat16x2_t, z18, >>> + svmul_bf16_x2 (z18, z4), >>> + svmul (z18, z4)) >>> + >>> +/* >>> +** mul_z23_z23_z18: >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** bfmul [^\n]+, {z18\.h - z19\.h} >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** ret >>> +*/ >>> +TEST_XN (mul_z23_z23_z18, svbfloat16x2_t, z23, >>> + svmul_bf16_x2 (z23, z18), >>> + svmul (z23, z18)) >>> + >>> +/* >>> +** mul_z28_z28_z0: >>> +** bfmul {z28\.h - z29\.h}, {z28\.h - z29\.h}, {z0\.h - z1\.h} >>> +** ret >>> +*/ >>> +TEST_XN (mul_z28_z28_z0, svbfloat16x2_t, z28, >>> + svmul_bf16_x2 (z28, z0), >>> + svmul (z28, z0)) >>> + >>> +/* >>> +** mul_z0_z0_z18: >>> +** bfmul {z0\.h - z1\.h}, {z0\.h - z1\.h}, {z18\.h - z19\.h} >>> +** ret >>> +*/ >>> +TEST_XN (mul_z0_z0_z18, svbfloat16x2_t, z0, >>> + svmul_bf16_x2 (z0, z18), >>> + svmul (z0, z18)) >>> + >>> +/* >>> +** mul_z4_z4_z23: >>> +** ( >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** bfmul {z4\.h - z5\.h}, {z4\.h - z5\.h}, [^\n]+ >>> +** | >>> +** bfmul {z4\.h - z5\.h}, {z4\.h - z5\.h}, [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** ) >>> +** ret >>> +*/ >>> +TEST_XN (mul_z4_z4_z23, svbfloat16x2_t, z4, >>> + svmul_bf16_x2 (z4, z23), >>> + svmul (z4, z23)) >>> + >>> +/* >>> +** mul_single_z24_z24_z0: >>> +** bfmul {z24\.h - z25\.h}, {z24\.h - z25\.h}, z0\.h >>> +** ret >>> +*/ >>> +TEST_XN_SINGLE (mul_single_z24_z24_z0, svbfloat16x2_t, 
svbfloat16_t, z24, >>> + svmul_single_bf16_x2 (z24, z0), >>> + svmul (z24, z0)) >>> + >>> +/* >>> +** mul_single_z24_z28_z0: >>> +** bfmul {z24\.h - z25\.h}, {z28\.h - z29\.h}, z0\.h >>> +** ret >>> +*/ >>> +TEST_XN_SINGLE (mul_single_z24_z28_z0, svbfloat16x2_t, svbfloat16_t, z24, >>> + svmul_single_bf16_x2 (z28, z0), >>> + svmul (z28, z0)) >>> + >>> +/* >>> +** mul_single_z24_z1_z0: >>> +** ( >>> +** mov z30\.d, z1\.d >>> +** mov z31\.d, z2\.d >>> +** | >>> +** mov z31\.d, z2\.d >>> +** mov z30\.d, z1\.d >>> +** ) >>> +** bfmul {z24\.h - z25\.h}, {z30\.h - z31\.h}, z0\.h >>> +** ret >>> +*/ >>> +TEST_XN_SINGLE (mul_single_z24_z1_z0, svbfloat16x2_t, svbfloat16_t, z24, >>> + svmul_single_bf16_x2 (z1, z0), >>> + svmul (z1, z0)) >>> + >>> +/* >>> +** mul_single_z1_z24_z0: >>> +** bfmul {z30\.h - z31\.h}, {z24\.h - z25\.h}, z0\.h >>> +** ( >>> +** mov z2\.d, z31\.d >>> +** mov z1\.d, z30\.d >>> +** | >>> +** mov z1\.d, z30\.d >>> +** mov z2\.d, z31\.d >>> +** ) >>> +** ret >>> +*/ >>> +TEST_XN_SINGLE (mul_single_z1_z24_z0, svbfloat16x2_t, svbfloat16_t, z1, >>> + svmul_single_bf16_x2 (z24, z0), >>> + svmul (z24, z0)) >>> + >>> +/* >>> +** mul_single_z1_z1_z0: >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** bfmul ({z[0-9]+\.h - z[0-9]+\.h}), \1, z0\.h >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** ret >>> +*/ >>> +TEST_XN_SINGLE (mul_single_z1_z1_z0, svbfloat16x2_t, svbfloat16_t, z1, >>> + svmul_single_bf16_x2 (z1, z0), >>> + svmul (z1, z0)) >>> + >>> +/* >>> +** mul_single_z18_z18_z0: >>> +** bfmul {z18\.h - z19\.h}, {z18\.h - z19\.h}, z0\.h >>> +** ret >>> +*/ >>> +TEST_XN_SINGLE (mul_single_z18_z18_z0, svbfloat16x2_t, svbfloat16_t, z18, >>> + svmul_single_bf16_x2 (z18, z0), >>> + svmul (z18, z0)) >>> + >>> +/* >>> +** mul_single_awkward: >>> +** ... >>> +** bfmul {z0\.h - z1\.h}, {z30\.h - z31\.h}, z[0-9]+\.h >>> +** ret >>> +*/ >>> +TEST_XN_SINGLE_AWKWARD (mul_single_awkward, svbfloat16x2_t, svbfloat16_t, >>> + z0_res = svmul_single_bf16_x2 (z1, z0), >>> + z0_res = svmul (z1, z0)) >>> + >>> +/* >>> +** mul_single_z0_z0_z15: >>> +** ... >>> +** bfmul {z0\.h - z1\.h}, {z0\.h - z1\.h}, z15\.h >>> +** ... >>> +** ret >>> +*/ >>> +TEST_XN_SINGLE_Z15 (mul_single_z0_z0_z15, svbfloat16x2_t, svbfloat16_t, >>> + z0 = svmul_single_bf16_x2 (z0, z15), >>> + z0 = svmul (z0, z15)) >>> + >>> +/* >>> +** mul_single_z24_z24_z16: >>> +** mov (z[0-7])\.d, z16\.d >>> +** bfmul {z24\.h - z25\.h}, {z24\.h - z25\.h}, \1\.h >>> +** ret >>> +*/ >>> +TEST_XN_SINGLE (mul_single_z24_z24_z16, svbfloat16x2_t, svbfloat16_t, z24, >>> + svmul_single_bf16_x2 (z24, z16), >>> + svmul (z24, z16)) >>> diff --git a/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/mul_bf16_x4.c >>> b/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/mul_bf16_x4.c >>> new file mode 100644 >>> index 00000000000..8da388e5e12 >>> --- /dev/null >>> +++ b/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/mul_bf16_x4.c >>> @@ -0,0 +1,227 @@ >>> +/* { dg-do assemble { target aarch64_asm_sve-bfscale_ok } } */ >>> +/* { dg-do compile { target { ! 
aarch64_asm_sve-bfscale_ok } } } */ >>> +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ >>> + >>> +#include "test_sme2_acle.h" >>> +#pragma GCC target "+sve-bfscale" >>> + >>> +/* >>> +** mul_z0_z0_z4: >>> +** bfmul {z0\.h - z3\.h}, {z0\.h - z3\.h}, {z4\.h - z7\.h} >>> +** ret >>> +*/ >>> +TEST_XN (mul_z0_z0_z4, svbfloat16x4_t, z0, >>> + svmul_bf16_x4 (z0, z4), >>> + svmul (z0, z4)) >>> + >>> +/* >>> +** mul_z0_z4_z0: >>> +** bfmul {z0\.h - z3\.h}, {z4\.h - z7\.h}, {z0\.h - z3\.h} >>> +** ret >>> +*/ >>> +TEST_XN (mul_z0_z4_z0, svbfloat16x4_t, z0, >>> + svmul_bf16_x4 (z4, z0), >>> + svmul (z4, z0)) >>> + >>> +/* >>> +** mul_z0_z4_z28: >>> +** bfmul {z0\.h - z3\.h}, {z4\.h - z7\.h}, {z28\.h - z31\.h} >>> +** ret >>> +*/ >>> +TEST_XN (mul_z0_z4_z28, svbfloat16x4_t, z0, >>> + svmul_bf16_x4 (z4, z28), >>> + svmul (z4, z28)) >>> + >>> +/* >>> +** mul_z18_z18_z4: >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** bfmul [^\n]+, {z4\.h - z7\.h} >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** ret >>> +*/ >>> +TEST_XN (mul_z18_z18_z4, svbfloat16x4_t, z18, >>> + svmul_bf16_x4 (z18, z4), >>> + svmul (z18, z4)) >>> + >>> +/* >>> +** mul_z23_z23_z28: >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** bfmul [^\n]+, {z28\.h - z31\.h} >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** ret >>> +*/ >>> +TEST_XN (mul_z23_z23_z28, svbfloat16x4_t, z23, >>> + svmul_bf16_x4 (z23, z28), >>> + svmul (z23, z28)) >>> + >>> +/* >>> +** mul_z28_z28_z0: >>> +** bfmul {z28\.h - z31\.h}, {z28\.h - z31\.h}, {z0\.h - z3\.h} >>> +** ret >>> +*/ >>> +TEST_XN (mul_z28_z28_z0, svbfloat16x4_t, z28, >>> + svmul_bf16_x4 (z28, z0), >>> + svmul (z28, z0)) >>> + >>> +/* >>> +** mul_z0_z0_z18: >>> +** ( >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** bfmul {z0\.h - z3\.h}, {z0\.h - z3\.h}, [^\n]+ >>> +** | >>> +** bfmul {z0\.h - z3\.h}, {z0\.h - z3\.h}, [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** ) >>> +** ret >>> +*/ >>> +TEST_XN (mul_z0_z0_z18, svbfloat16x4_t, z0, >>> + svmul_bf16_x4 (z0, z18), >>> + svmul (z0, z18)) >>> + >>> +/* >>> +** mul_z4_z4_z23: >>> +** ( >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** bfmul {z4\.h - z7\.h}, {z4\.h - z7\.h}, [^\n]+ >>> +** | >>> +** bfmul {z4\.h - z7\.h}, {z4\.h - z7\.h}, [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** ) >>> +** ret >>> +*/ >>> +TEST_XN (mul_z4_z4_z23, svbfloat16x4_t, z4, >>> + svmul_bf16_x4 (z4, z23), >>> + svmul (z4, z23)) >>> + >>> +/* >>> +** mul_single_z24_z24_z0: >>> +** bfmul {z24\.h - z27\.h}, {z24\.h - z27\.h}, z0\.h >>> +** ret >>> +*/ >>> +TEST_XN_SINGLE (mul_single_z24_z24_z0, svbfloat16x4_t, svbfloat16_t, z24, >>> + svmul_single_bf16_x4 (z24, z0), >>> + svmul (z24, z0)) >>> + >>> +/* >>> +** mul_single_z24_z28_z0: >>> +** bfmul {z24\.h - z27\.h}, {z28\.h - z31\.h}, z0\.h >>> +** ret >>> +*/ >>> +TEST_XN_SINGLE (mul_single_z24_z28_z0, svbfloat16x4_t, svbfloat16_t, z24, >>> + svmul_single_bf16_x4 (z28, z0), >>> + svmul (z28, z0)) >>> + >>> +/* >>> +** mul_single_z24_z1_z0: >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** bfmul {z24\.h - z27\.h}, {z28\.h - z31\.h}, z0\.h >>> +** ret >>> +*/ >>> +TEST_XN_SINGLE (mul_single_z24_z1_z0, svbfloat16x4_t, svbfloat16_t, z24, >>> + svmul_single_bf16_x4 (z1, z0), >>> + svmul (z1, 
z0)) >>> + >>> +/* >>> +** mul_single_z1_z24_z0: >>> +** bfmul {z28\.h - z31\.h}, {z24\.h - z27\.h}, z0\.h >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** ret >>> +*/ >>> +TEST_XN_SINGLE (mul_single_z1_z24_z0, svbfloat16x4_t, svbfloat16_t, z1, >>> + svmul_single_bf16_x4 (z24, z0), >>> + svmul (z24, z0)) >>> + >>> +/* >>> +** mul_single_z1_z1_z0: >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** bfmul ({z[0-9]+\.h - z[0-9]+\.h}), \1, z0\.h >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** ret >>> +*/ >>> +TEST_XN_SINGLE (mul_single_z1_z1_z0, svbfloat16x4_t, svbfloat16_t, z1, >>> + svmul_single_bf16_x4 (z1, z0), >>> + svmul (z1, z0)) >>> + >>> +/* >>> +** mul_single_z18_z18_z0: >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** bfmul [^\n]+, z0\.h >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** ret >>> +*/ >>> +TEST_XN_SINGLE (mul_single_z18_z18_z0, svbfloat16x4_t, svbfloat16_t, z18, >>> + svmul_single_bf16_x4 (z18, z0), >>> + svmul (z18, z0)) >>> + >>> +/* >>> +** mul_single_awkward: >>> +** ... >>> +** bfmul {z0\.h - z3\.h}, {z[0-9]+\.h - z[0-9]+\.h}, z[0-9]+\.h >>> +** ret >>> +*/ >>> +TEST_XN_SINGLE_AWKWARD (mul_single_awkward, svbfloat16x4_t, svbfloat16_t, >>> + z0_res = svmul_single_bf16_x4 (z1, z0), >>> + z0_res = svmul (z1, z0)) >>> + >>> +/* >>> +** mul_single_z0_z0_z15: >>> +** ... >>> +** bfmul {z0\.h - z3\.h}, {z0\.h - z3\.h}, z15\.h >>> +** ... >>> +** ret >>> +*/ >>> +TEST_XN_SINGLE_Z15 (mul_single_z0_z0_z15, svbfloat16x4_t, svbfloat16_t, >>> + z0 = svmul_single_bf16_x4 (z0, z15), >>> + z0 = svmul (z0, z15)) >>> + >>> +/* >>> +** mul_single_z24_z24_z16: >>> +** mov (z[0-7])\.d, z16\.d >>> +** bfmul {z24\.h - z27\.h}, {z24\.h - z27\.h}, \1\.h >>> +** ret >>> +*/ >>> +TEST_XN_SINGLE (mul_single_z24_z24_z16, svbfloat16x4_t, svbfloat16_t, z24, >>> + svmul_single_bf16_x4 (z24, z16), >>> + svmul (z24, z16)) >>> diff --git a/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/scale_bf16_x2.c >>> b/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/scale_bf16_x2.c >>> new file mode 100644 >>> index 00000000000..33a14e34653 >>> --- /dev/null >>> +++ b/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/scale_bf16_x2.c >>> @@ -0,0 +1,194 @@ >>> +/* { dg-do assemble { target aarch64_asm_sve-bfscale_ok } } */ >>> +/* { dg-do compile { target { ! 
aarch64_asm_sve-bfscale_ok } } } */ >>> +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ >>> + >>> +#include "test_sme2_acle.h" >>> +#pragma GCC target "+sve-bfscale" >>> + >>> +/* >>> +** bfscale_z0_z0_z4: >>> +** bfscale {z0\.h - z1\.h}, {z0\.h - z1\.h}, {z4\.h - z5\.h} >>> +** ret >>> +*/ >>> +TEST_DUAL_XN (bfscale_z0_z0_z4, svbfloat16x2_t, svint16x2_t, z0, >>> + svscale_bf16_x2 (z0, z4), >>> + svscale (z0, z4)) >>> + >>> +/* >>> +** bfscale_z4_z4_z0: >>> +** bfscale {z4\.h - z5\.h}, {z4\.h - z5\.h}, {z0\.h - z1\.h} >>> +** ret >>> +*/ >>> +TEST_DUAL_XN (bfscale_z4_z4_z0, svint16x2_t, svbfloat16x2_t, z4, >>> + svscale_bf16_x2 (z4, z0), >>> + svscale (z4, z0)) >>> + >>> +/* >>> +** bfscale_z18_z18_z4: >>> +** bfscale {z18\.h - z19\.h}, {z18\.h - z19\.h}, {z4\.h - z5\.h} >>> +** ret >>> +*/ >>> +TEST_DUAL_XN (bfscale_z18_z18_z4, svbfloat16x2_t, svint16x2_t, z18, >>> + svscale_bf16_x2 (z18, z4), >>> + svscale (z18, z4)) >>> + >>> +/* >>> +** bfscale_z23_z23_z18: >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** bfscale [^\n]+, {z18\.h - z19\.h} >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** ret >>> +*/ >>> +TEST_DUAL_XN (bfscale_z23_z23_z18, svint16x2_t, svbfloat16x2_t, z23, >>> + svscale_bf16_x2 (z23, z18), >>> + svscale (z23, z18)) >>> + >>> + >>> +/* >>> +** bfscale_z28_z28_z4: >>> +** bfscale {z28\.h - z29\.h}, {z28\.h - z29\.h}, {z4\.h - z5\.h} >>> +** ret >>> +*/ >>> +TEST_DUAL_XN (bfscale_z28_z28_z4, svbfloat16x2_t, svint16x2_t, z28, >>> + svscale_bf16_x2 (z28, z4), >>> + svscale (z28, z4)) >>> + >>> +/* >>> +** bfscale_z4_z4_z18: >>> +** bfscale {z4\.h - z5\.h}, {z4\.h - z5\.h}, {z18\.h - z19\.h} >>> +** ret >>> +*/ >>> +TEST_DUAL_XN (bfscale_z4_z4_z18, svint16x2_t, svbfloat16x2_t, z4, >>> + svscale_bf16_x2 (z4, z18), >>> + svscale (z4, z18)) >>> + >>> +/* >>> +** bfscale_z28_28_z23: >>> +** ( >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** bfscale {z28\.h - z29\.h}, {z28\.h - z29\.h}, [^\n]+ >>> +** | >>> +** bfscale {z28\.h - z29\.h}, {z28\.h - z29\.h}, [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** ) >>> +** ret >>> +*/ >>> +TEST_DUAL_XN (bfscale_z28_28_z23, svbfloat16x2_t, svint16x2_t, z28, >>> + svscale_bf16_x2 (z28, z23), >>> + svscale (z28, z23)) >>> + >>> +/* >>> +** bfscale_single_z24_z24_z0: >>> +** bfscale {z24\.h - z25\.h}, {z24\.h - z25\.h}, z0\.h >>> +** ret >>> +*/ >>> +TEST_XN_SINGLE (bfscale_single_z24_z24_z0, svbfloat16x2_t, svint16_t, z24, >>> + svscale_single_bf16_x2 (z24, z0), >>> + svscale (z24, z0)) >>> + >>> +/* >>> +** bfscale_single_z24_z28_z0: >>> +** ( >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** bfscale {z24\.h - z25\.h}, {z24\.h - z25\.h}, z0\.h >>> +** | >>> +** bfscale {z28\.h - z29\.h}, {z28\.h - z29\.h}, z0\.h >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** ) >>> +** ret >>> +*/ >>> +TEST_XN_SINGLE (bfscale_single_z24_z28_z0, svbfloat16x2_t, svint16_t, z24, >>> + svscale_single_bf16_x2 (z28, z0), >>> + svscale (z28, z0)) >>> + >>> +/* >>> +** bfscale_single_z24_z1_z0: >>> +** ( >>> +** mov z24\.d, z1\.d >>> +** mov z25\.d, z2\.d >>> +** | >>> +** mov z25\.d, z2\.d >>> +** mov z24\.d, z1\.d >>> +** ) >>> +** bfscale {z24\.h - z25\.h}, {z24\.h - z25\.h}, z0\.h >>> +** ret >>> +*/ >>> +TEST_XN_SINGLE (bfscale_single_z24_z1_z0, svbfloat16x2_t, svint16_t, z24, >>> + svscale_single_bf16_x2 (z1, z0), >>> + svscale (z1, z0)) >>> + >>> +/* >>> +** bfscale_single_z1_z24_z0: >>> +** bfscale {z24\.h - z25\.h}, {z24\.h - z25\.h}, z0\.h >>> +** ( >>> +** mov z1\.d, z24\.d >>> +** mov z2\.d, z25\.d >>> +** | >>> +** mov z2\.d, z25\.d >>> +** mov 
z1\.d, z24\.d >>> +** ) >>> +** ret >>> +*/ >>> +TEST_XN_SINGLE (bfscale_single_z1_z24_z0, svbfloat16x2_t, svint16_t, z1, >>> + svscale_single_bf16_x2 (z24, z0), >>> + svscale (z24, z0)) >>> + >>> +/* >>> +** bfscale_single_z1_z1_z0: >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** bfscale ({z[0-9]+\.h - z[0-9]+\.h}), \1, z0\.h >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** ret >>> +*/ >>> +TEST_XN_SINGLE (bfscale_single_z1_z1_z0, svbfloat16x2_t, svint16_t, z1, >>> + svscale_single_bf16_x2 (z1, z0), >>> + svscale (z1, z0)) >>> + >>> +/* >>> +** bfscale_single_z18_z18_z0: >>> +** bfscale {z18\.h - z19\.h}, {z18\.h - z19\.h}, z0\.h >>> +** ret >>> +*/ >>> +TEST_XN_SINGLE (bfscale_single_z18_z18_z0, svbfloat16x2_t, svint16_t, z18, >>> + svscale_single_bf16_x2 (z18, z0), >>> + svscale (z18, z0)) >>> + >>> +/* >>> +** bfscale_single_awkward: >>> +** ... >>> +** bfscale ({z[0-9]+\.h - z[0-9]+\.h}), \1, z[0-9]+\.h >>> +** ... >>> +** ret >>> +*/ >>> +TEST_XN_SINGLE_AWKWARD (bfscale_single_awkward, svbfloat16x2_t, svint16_t, >>> + z0_res = svscale_single_bf16_x2 (z1, z0), >>> + z0_res = svscale (z1, z0)) >>> + >>> +/* >>> +** bfscale_single_z0_z0_z15: >>> +** ... >>> +** bfscale {z0\.h - z1\.h}, {z0\.h - z1\.h}, z15\.h >>> +** ... >>> +** ret >>> +*/ >>> +TEST_XN_SINGLE_Z15 (bfscale_single_z0_z0_z15, svbfloat16x2_t, svint16_t, >>> + z0 = svscale_single_bf16_x2 (z0, z15), >>> + z0 = svscale (z0, z15)) >>> + >>> +/* >>> +** bfscale_single_z24_z24_z16: >>> +** mov (z[0-7])\.d, z16\.d >>> +** bfscale {z24\.h - z25\.h}, {z24\.h - z25\.h}, \1\.h >>> +** ret >>> +*/ >>> +TEST_XN_SINGLE (bfscale_single_z24_z24_z16, svbfloat16x2_t, svint16_t, z24, >>> + svscale_single_bf16_x2 (z24, z16), >>> + svscale (z24, z16)) >>> diff --git a/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/scale_bf16_x4.c >>> b/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/scale_bf16_x4.c >>> new file mode 100644 >>> index 00000000000..d9e90746a98 >>> --- /dev/null >>> +++ b/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/scale_bf16_x4.c >>> @@ -0,0 +1,231 @@ >>> +/* { dg-do assemble { target aarch64_asm_sve-bfscale_ok } } */ >>> +/* { dg-do compile { target { ! 
aarch64_asm_sve-bfscale_ok } } } */ >>> +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */ >>> + >>> +#include "test_sme2_acle.h" >>> +#pragma GCC target "+sve-bfscale" >>> + >>> +/* >>> +** bfscale_z0_z0_z4: >>> +** bfscale {z0\.h - z3\.h}, {z0\.h - z3\.h}, {z4\.h - z7\.h} >>> +** ret >>> +*/ >>> +TEST_DUAL_XN (bfscale_z0_z0_z4, svbfloat16x4_t, svint16x4_t, z0, >>> + svscale_bf16_x4 (z0, z4), >>> + svscale (z0, z4)) >>> + >>> +/* >>> +** bfscale_z4_z4_z0: >>> +** bfscale {z4\.h - z7\.h}, {z4\.h - z7\.h}, {z0\.h - z3\.h} >>> +** ret >>> +*/ >>> +TEST_DUAL_XN (bfscale_z4_z4_z0, svint16x4_t, svbfloat16x4_t, z4, >>> + svscale_bf16_x4 (z4, z0), >>> + svscale (z4, z0)) >>> + >>> +/* >>> +** bfscale_z18_z18_z4: >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** bfscale [^\n]+, {z4\.h - z7\.h} >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** ret >>> +*/ >>> +TEST_DUAL_XN (bfscale_z18_z18_z4, svbfloat16x4_t, svint16x4_t, z18, >>> + svscale_bf16_x4 (z18, z4), >>> + svscale (z18, z4)) >>> + >>> +/* >>> +** bfscale_z23_z23_z28: >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** bfscale [^\n]+, {z28\.h - z31\.h} >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** ret >>> +*/ >>> +TEST_DUAL_XN (bfscale_z23_z23_z28, svint16x4_t, svbfloat16x4_t, z23, >>> + svscale_bf16_x4 (z23, z28), >>> + svscale (z23, z28)) >>> + >>> +/* >>> +** bfscale_z28_z28_z4: >>> +** bfscale {z28\.h - z31\.h}, {z28\.h - z31\.h}, {z4\.h - z7\.h} >>> +** ret >>> +*/ >>> +TEST_DUAL_XN (bfscale_z28_z28_z4, svbfloat16x4_t, svint16x4_t, z28, >>> + svscale_bf16_x4 (z28, z4), >>> + svscale (z28, z4)) >>> + >>> +/* >>> +** bfscale_z4_z4_z18: >>> +** ( >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** bfscale {z4\.h - z7\.h}, {z4\.h - z7\.h}, [^\n]+ >>> +** | >>> +** bfscale {z4\.h - z7\.h}, {z4\.h - z7\.h}, [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** ) >>> +** ret >>> +*/ >>> +TEST_DUAL_XN (bfscale_z4_z4_z18, svint16x4_t, svbfloat16x4_t, z4, >>> + svscale_bf16_x4 (z4, z18), >>> + svscale (z4, z18)) >>> + >>> +/* >>> +** bfscale_z0_z0_z23: >>> +** ( >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** bfscale {z0\.h - z3\.h}, {z0\.h - z3\.h}, [^\n]+ >>> +** | >>> +** bfscale {z0\.h - z3\.h}, {z0\.h - z3\.h}, [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** ) >>> +** ret >>> +*/ >>> +TEST_DUAL_XN (bfscale_z0_z0_z23, svbfloat16x4_t, svint16x4_t, z0, >>> + svscale_bf16_x4 (z0, z23), >>> + svscale (z0, z23)) >>> + >>> +/* >>> +** bfscale_single_z24_z24_z0: >>> +** bfscale {z24\.h - z27\.h}, {z24\.h - z27\.h}, z0\.h >>> +** ret >>> +*/ >>> +TEST_XN_SINGLE (bfscale_single_z24_z24_z0, svbfloat16x4_t, svint16_t, z24, >>> + svscale_single_bf16_x4 (z24, z0), >>> + svscale (z24, z0)) >>> + >>> +/* >>> +** bfscale_single_z24_z28_z0: >>> +** ( >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** bfscale {z24\.h - z27\.h}, {z24\.h - z27\.h}, z0\.h >>> +** | >>> +** bfscale {z28\.h - z31\.h}, {z28\.h - z31\.h}, z0\.h >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** mov [^\n]+ >>> +** ) >>> +** ret >>> +*/ >>> +TEST_XN_SINGLE (bfscale_single_z24_z28_z0, svbfloat16x4_t, svint16_t, z24, >>> + svscale_single_bf16_x4 (z28, z0), >>> + svscale (z28, z0)) >>> + >>> +/* >>> +** bfscale_single_z24_z1_z0: >>> +** mov 
[^\n]+
>>> +** mov [^\n]+
>>> +** mov [^\n]+
>>> +** mov [^\n]+
>>> +** bfscale {z24\.h - z27\.h}, {z24\.h - z27\.h}, z0\.h
>>> +** ret
>>> +*/
>>> +TEST_XN_SINGLE (bfscale_single_z24_z1_z0, svbfloat16x4_t, svint16_t, z24,
>>> + svscale_single_bf16_x4 (z1, z0),
>>> + svscale (z1, z0))
>>> +
>>> +/*
>>> +** bfscale_single_z1_z24_z0:
>>> +** bfscale {z24\.h - z27\.h}, {z24\.h - z27\.h}, z0\.h
>>> +** mov [^\n]+
>>> +** mov [^\n]+
>>> +** mov [^\n]+
>>> +** mov [^\n]+
>>> +** ret
>>> +*/
>>> +TEST_XN_SINGLE (bfscale_single_z1_z24_z0, svbfloat16x4_t, svint16_t, z1,
>>> + svscale_single_bf16_x4 (z24, z0),
>>> + svscale (z24, z0))
>>> +
>>> +/*
>>> +** bfscale_single_z1_z1_z0:
>>> +** mov [^\n]+
>>> +** mov [^\n]+
>>> +** mov [^\n]+
>>> +** mov [^\n]+
>>> +** bfscale ({z[0-9]+\.h - z[0-9]+\.h}), \1, z0\.h
>>> +** mov [^\n]+
>>> +** mov [^\n]+
>>> +** mov [^\n]+
>>> +** mov [^\n]+
>>> +** ret
>>> +*/
>>> +TEST_XN_SINGLE (bfscale_single_z1_z1_z0, svbfloat16x4_t, svint16_t, z1,
>>> + svscale_single_bf16_x4 (z1, z0),
>>> + svscale (z1, z0))
>>> +
>>> +/*
>>> +** bfscale_single_z18_z18_z0:
>>> +** mov [^\n]+
>>> +** mov [^\n]+
>>> +** mov [^\n]+
>>> +** mov [^\n]+
>>> +** bfscale [^\n]+, z0\.h
>>> +** mov [^\n]+
>>> +** mov [^\n]+
>>> +** mov [^\n]+
>>> +** mov [^\n]+
>>> +** ret
>>> +*/
>>> +TEST_XN_SINGLE (bfscale_single_z18_z18_z0, svbfloat16x4_t, svint16_t, z18,
>>> + svscale_single_bf16_x4 (z18, z0),
>>> + svscale (z18, z0))
>>> +
>>> +/*
>>> +** bfscale_single_awkward:
>>> +** ...
>>> +** bfscale ({z[0-9]+\.h - z[0-9]+\.h}), \1, z[0-9]+\.h
>>> +** ...
>>> +** ret
>>> +*/
>>> +TEST_XN_SINGLE_AWKWARD (bfscale_single_awkward, svbfloat16x4_t, svint16_t,
>>> + z0_res = svscale_single_bf16_x4 (z1, z0),
>>> + z0_res = svscale (z1, z0))
>>> +
>>> +/*
>>> +** bfscale_single_z0_z0_z15:
>>> +** ...
>>> +** bfscale {z0\.h - z3\.h}, {z0\.h - z3\.h}, z15\.h
>>> +** ...
>>> +** ret
>>> +*/
>>> +TEST_XN_SINGLE_Z15 (bfscale_single_z0_z0_z15, svbfloat16x4_t, svint16_t,
>>> + z0 = svscale_single_bf16_x4 (z0, z15),
>>> + z0 = svscale (z0, z15))
>>> +
>>> +/*
>>> +** bfscale_single_z24_z24_z16:
>>> +** mov (z[0-7])\.d, z16\.d
>>> +** bfscale {z24\.h - z27\.h}, {z24\.h - z27\.h}, \1\.h
>>> +** ret
>>> +*/
>>> +TEST_XN_SINGLE (bfscale_single_z24_z24_z16, svbfloat16x4_t, svint16_t, z24,
>>> + svscale_single_bf16_x4 (z24, z16),
>>> + svscale (z24, z16))
>>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/scale_bf16.c
>>> b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/scale_bf16.c
>>> new file mode 100644
>>> index 00000000000..4e8ff3392ae
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/scale_bf16.c
>>> @@ -0,0 +1,337 @@
>>> +/* { dg-do assemble { target aarch64_asm_sve-bfscale_ok } } */
>>> +/* { dg-do compile { target { ! aarch64_asm_sve-bfscale_ok } } } */
>>> +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
>>> +
>>> +#include "test_sve_acle.h"
>>> +#pragma GCC target "+sve2,+sve-bfscale"
>>> +#ifdef STREAMING_COMPATIBLE
>>> +#pragma GCC target "+sme2"
>>> +#endif
>>> +
>>> +/*
>>> +** scale_bf16_m_tied1:
>>> +** bfscale z0\.h, p0/m, z0\.h, z4\.h
>>> +** ret
>>> +*/
>>> +TEST_DUAL_Z (scale_bf16_m_tied1, svbfloat16_t, svint16_t,
>>> + z0 = svscale_bf16_m (p0, z0, z4),
>>> + z0 = svscale_m (p0, z0, z4))
>>> +
>>> +/*
>>> +** scale_bf16_m_tied2:
>>> +** mov (z[0-9]+)\.d, z0\.d
>>> +** movprfx z0, z4
>>> +** bfscale z0\.h, p0/m, z0\.h, \1\.h
>>> +** ret
>>> +*/
>>> +TEST_DUAL_Z_REV (scale_bf16_m_tied2, svbfloat16_t, svint16_t,
>>> + z0_res = svscale_bf16_m (p0, z4, z0),
>>> + z0_res = svscale_m (p0, z4, z0))
>>> +
>>> +/*
>>> +** scale_bf16_m_untied:
>>> +** movprfx z0, z1
>>> +** bfscale z0\.h, p0/m, z0\.h, z4\.h
>>> +** ret
>>> +*/
>>> +TEST_DUAL_Z (scale_bf16_m_untied, svbfloat16_t, svint16_t,
>>> + z0 = svscale_bf16_m (p0, z1, z4),
>>> + z0 = svscale_m (p0, z1, z4))
>>> +
>>> +/*
>>> +** scale_w0_bf16_m_tied1:
>>> +** mov (z[0-9]+\.h), w0
>>> +** bfscale z0\.h, p0/m, z0\.h, \1
>>> +** ret
>>> +*/
>>> +TEST_UNIFORM_ZX (scale_w0_bf16_m_tied1, svbfloat16_t, int16_t,
>>> + z0 = svscale_n_bf16_m (p0, z0, x0),
>>> + z0 = svscale_m (p0, z0, x0))
>>> +
>>> +/*
>>> +** scale_w0_bf16_m_untied:
>>> +** mov (z[0-9]+\.h), w0
>>> +** movprfx z0, z1
>>> +** bfscale z0\.h, p0/m, z0\.h, \1
>>> +** ret
>>> +*/
>>> +TEST_UNIFORM_ZX (scale_w0_bf16_m_untied, svbfloat16_t, int16_t,
>>> + z0 = svscale_n_bf16_m (p0, z1, x0),
>>> + z0 = svscale_m (p0, z1, x0))
>>> +
>>> +/*
>>> +** scale_3_bf16_m_tied1:
>>> +** mov (z[0-9]+\.h), #3
>>> +** bfscale z0\.h, p0/m, z0\.h, \1
>>> +** ret
>>> +*/
>>> +TEST_UNIFORM_Z (scale_3_bf16_m_tied1, svbfloat16_t,
>>> + z0 = svscale_n_bf16_m (p0, z0, 3),
>>> + z0 = svscale_m (p0, z0, 3))
>>> +
>>> +/*
>>> +** scale_3_bf16_m_untied:
>>> +** mov (z[0-9]+\.h), #3
>>> +** movprfx z0, z1
>>> +** bfscale z0\.h, p0/m, z0\.h, \1
>>> +** ret
>>> +*/
>>> +TEST_UNIFORM_Z (scale_3_bf16_m_untied, svbfloat16_t,
>>> + z0 = svscale_n_bf16_m (p0, z1, 3),
>>> + z0 = svscale_m (p0, z1, 3))
>>> +
>>> +/*
>>> +** scale_m3_bf16_m:
>>> +** mov (z[0-9]+\.h), #-3
>>> +** bfscale z0\.h, p0/m, z0\.h, \1
>>> +** ret
>>> +*/
>>> +TEST_UNIFORM_Z (scale_m3_bf16_m, svbfloat16_t,
>>> + z0 = svscale_n_bf16_m (p0, z0, -3),
>>> + z0 = svscale_m (p0, z0, -3))
>>> +
>>> +/*
>>> +** scale_bf16_z_tied1:
>>> +** movprfx z0\.h, p0/z, z0\.h
>>> +** bfscale z0\.h, p0/m, z0\.h, z4\.h
>>> +** ret
>>> +*/
>>> +TEST_DUAL_Z (scale_bf16_z_tied1, svbfloat16_t, svint16_t,
>>> + z0 = svscale_bf16_z (p0, z0, z4),
>>> + z0 = svscale_z (p0, z0, z4))
>>> +
>>> +/*
>>> +** scale_bf16_z_tied2:
>>> +** mov (z[0-9]+)\.d, z0\.d
>>> +** movprfx z0\.h, p0/z, z4\.h
>>> +** bfscale z0\.h, p0/m, z0\.h, \1\.h
>>> +** ret
>>> +*/
>>> +TEST_DUAL_Z_REV (scale_bf16_z_tied2, svbfloat16_t, svint16_t,
>>> + z0_res = svscale_bf16_z (p0, z4, z0),
>>> + z0_res = svscale_z (p0, z4, z0))
>>> +
>>> +/*
>>> +** scale_bf16_z_untied:
>>> +** movprfx z0\.h, p0/z, z1\.h
>>> +** bfscale z0\.h, p0/m, z0\.h, z4\.h
>>> +** ret
>>> +*/
>>> +TEST_DUAL_Z (scale_bf16_z_untied, svbfloat16_t, svint16_t,
>>> + z0 = svscale_bf16_z (p0, z1, z4),
>>> + z0 = svscale_z (p0, z1, z4))
>>> +
>>> +/*
>>> +** scale_w0_bf16_z_tied1:
>>> +** mov (z[0-9]+\.h), w0
>>> +** movprfx z0\.h, p0/z, z0\.h
>>> +** bfscale z0\.h, p0/m, z0\.h, \1
>>> +** ret
>>> +*/
>>> +TEST_UNIFORM_ZX (scale_w0_bf16_z_tied1, svbfloat16_t, int16_t,
>>> + z0 = svscale_n_bf16_z (p0, z0, x0),
>>> + z0 = svscale_z (p0, z0, x0))
>>> +
>>> +/*
>>> +** scale_w0_bf16_z_untied:
>>> +** mov (z[0-9]+\.h), w0
>>> +** movprfx z0\.h, p0/z, z1\.h
>>> +** bfscale z0\.h, p0/m, z0\.h, \1
>>> +** ret
>>> +*/
>>> +TEST_UNIFORM_ZX (scale_w0_bf16_z_untied, svbfloat16_t, int16_t,
>>> + z0 = svscale_n_bf16_z (p0, z1, x0),
>>> + z0 = svscale_z (p0, z1, x0))
>>> +
>>> +/*
>>> +** scale_3_bf16_z_tied1:
>>> +** mov (z[0-9]+\.h), #3
>>> +** movprfx z0\.h, p0/z, z0\.h
>>> +** bfscale z0\.h, p0/m, z0\.h, \1
>>> +** ret
>>> +*/
>>> +TEST_UNIFORM_Z (scale_3_bf16_z_tied1, svbfloat16_t,
>>> + z0 = svscale_n_bf16_z (p0, z0, 3),
>>> + z0 = svscale_z (p0, z0, 3))
>>> +
>>> +/*
>>> +** scale_3_bf16_z_untied:
>>> +** mov (z[0-9]+\.h), #3
>>> +** movprfx z0\.h, p0/z, z1\.h
>>> +** bfscale z0\.h, p0/m, z0\.h, \1
>>> +** ret
>>> +*/
>>> +TEST_UNIFORM_Z (scale_3_bf16_z_untied, svbfloat16_t,
>>> + z0 = svscale_n_bf16_z (p0, z1, 3),
>>> + z0 = svscale_z (p0, z1, 3))
>>> +
>>> +/*
>>> +** scale_m3_bf16_z:
>>> +** mov (z[0-9]+\.h), #-3
>>> +** movprfx z0\.h, p0/z, z0\.h
>>> +** bfscale z0\.h, p0/m, z0\.h, \1
>>> +** ret
>>> +*/
>>> +TEST_UNIFORM_Z (scale_m3_bf16_z, svbfloat16_t,
>>> + z0 = svscale_n_bf16_z (p0, z0, -3),
>>> + z0 = svscale_z (p0, z0, -3))
>>> +
>>> +/*
>>> +** scale_bf16_x_tied1:
>>> +** bfscale z0\.h, p0/m, z0\.h, z4\.h
>>> +** ret
>>> +*/
>>> +TEST_DUAL_Z (scale_bf16_x_tied1, svbfloat16_t, svint16_t,
>>> + z0 = svscale_bf16_x (p0, z0, z4),
>>> + z0 = svscale_x (p0, z0, z4))
>>> +
>>> +/*
>>> +** scale_bf16_x_tied2:
>>> +** mov (z[0-9]+)\.d, z0\.d
>>> +** movprfx z0, z4
>>> +** bfscale z0\.h, p0/m, z0\.h, \1\.h
>>> +** ret
>>> +*/
>>> +TEST_DUAL_Z_REV (scale_bf16_x_tied2, svbfloat16_t, svint16_t,
>>> + z0_res = svscale_bf16_x (p0, z4, z0),
>>> + z0_res = svscale_x (p0, z4, z0))
>>> +
>>> +/*
>>> +** scale_bf16_x_untied:
>>> +** movprfx z0, z1
>>> +** bfscale z0\.h, p0/m, z0\.h, z4\.h
>>> +** ret
>>> +*/
>>> +TEST_DUAL_Z (scale_bf16_x_untied, svbfloat16_t, svint16_t,
>>> + z0 = svscale_bf16_x (p0, z1, z4),
>>> + z0 = svscale_x (p0, z1, z4))
>>> +
>>> +/*
>>> +** scale_w0_bf16_x_tied1:
>>> +** mov (z[0-9]+\.h), w0
>>> +** bfscale z0\.h, p0/m, z0\.h, \1
>>> +** ret
>>> +*/
>>> +TEST_UNIFORM_ZX (scale_w0_bf16_x_tied1, svbfloat16_t, int16_t,
>>> + z0 = svscale_n_bf16_x (p0, z0, x0),
>>> + z0 = svscale_x (p0, z0, x0))
>>> +
>>> +/*
>>> +** scale_w0_bf16_x_untied:
>>> +** mov (z[0-9]+\.h), w0
>>> +** movprfx z0, z1
>>> +** bfscale z0\.h, p0/m, z0\.h, \1
>>> +** ret
>>> +*/
>>> +TEST_UNIFORM_ZX (scale_w0_bf16_x_untied, svbfloat16_t, int16_t,
>>> + z0 = svscale_n_bf16_x (p0, z1, x0),
>>> + z0 = svscale_x (p0, z1, x0))
>>> +
>>> +/*
>>> +** scale_3_bf16_x_tied1:
>>> +** mov (z[0-9]+\.h), #3
>>> +** bfscale z0\.h, p0/m, z0\.h, \1
>>> +** ret
>>> +*/
>>> +TEST_UNIFORM_Z (scale_3_bf16_x_tied1, svbfloat16_t,
>>> + z0 = svscale_n_bf16_x (p0, z0, 3),
>>> + z0 = svscale_x (p0, z0, 3))
>>> +
>>> +/*
>>> +** scale_3_bf16_x_untied:
>>> +** mov (z[0-9]+\.h), #3
>>> +** movprfx z0, z1
>>> +** bfscale z0\.h, p0/m, z0\.h, \1
>>> +** ret
>>> +*/
>>> +TEST_UNIFORM_Z (scale_3_bf16_x_untied, svbfloat16_t,
>>> + z0 = svscale_n_bf16_x (p0, z1, 3),
>>> + z0 = svscale_x (p0, z1, 3))
>>> +
>>> +/*
>>> +** scale_m3_bf16_x:
>>> +** mov (z[0-9]+\.h), #-3
>>> +** bfscale z0\.h, p0/m, z0\.h, \1
>>> +** ret
>>> +*/
>>> +TEST_UNIFORM_Z (scale_m3_bf16_x, svbfloat16_t,
>>> + z0 = svscale_n_bf16_x (p0, z0, -3),
>>> + z0 = svscale_x (p0, z0, -3))
>>> +
>>> +/*
>>> +** ptrue_scale_bf16_x_tied1:
>>> +** ...
>>> +** ptrue p[0-9]+\.b[^\n]*
>>> +** ...
>>> +** ret
>>> +*/
>>> +TEST_DUAL_Z (ptrue_scale_bf16_x_tied1, svbfloat16_t, svint16_t,
>>> + z0 = svscale_bf16_x (svptrue_b16 (), z0, z4),
>>> + z0 = svscale_x (svptrue_b16 (), z0, z4))
>>> +
>>> +/*
>>> +** ptrue_scale_bf16_x_tied2:
>>> +** ...
>>> +** ptrue p[0-9]+\.b[^\n]*
>>> +** ...
>>> +** ret
>>> +*/
>>> +TEST_DUAL_Z_REV (ptrue_scale_bf16_x_tied2, svbfloat16_t, svint16_t,
>>> + z0_res = svscale_bf16_x (svptrue_b16 (), z4, z0),
>>> + z0_res = svscale_x (svptrue_b16 (), z4, z0))
>>> +
>>> +/*
>>> +** ptrue_scale_bf16_x_untied:
>>> +** ...
>>> +** ptrue p[0-9]+\.b[^\n]*
>>> +** ...
>>> +** ret
>>> +*/
>>> +TEST_DUAL_Z (ptrue_scale_bf16_x_untied, svbfloat16_t, svint16_t,
>>> + z0 = svscale_bf16_x (svptrue_b16 (), z1, z4),
>>> + z0 = svscale_x (svptrue_b16 (), z1, z4))
>>> +
>>> +/*
>>> +** ptrue_scale_3_bf16_x_tied1:
>>> +** ...
>>> +** ptrue p[0-9]+\.b[^\n]*
>>> +** ...
>>> +** ret
>>> +*/
>>> +TEST_UNIFORM_Z (ptrue_scale_3_bf16_x_tied1, svbfloat16_t,
>>> + z0 = svscale_n_bf16_x (svptrue_b16 (), z0, 3),
>>> + z0 = svscale_x (svptrue_b16 (), z0, 3))
>>> +
>>> +/*
>>> +** ptrue_scale_3_bf16_x_untied:
>>> +** ...
>>> +** ptrue p[0-9]+\.b[^\n]*
>>> +** ...
>>> +** ret
>>> +*/
>>> +TEST_UNIFORM_Z (ptrue_scale_3_bf16_x_untied, svbfloat16_t,
>>> + z0 = svscale_n_bf16_x (svptrue_b16 (), z1, 3),
>>> + z0 = svscale_x (svptrue_b16 (), z1, 3))
>>> +
>>> +/*
>>> +** ptrue_scale_m3_bf16_x_tied1:
>>> +** ...
>>> +** ptrue p[0-9]+\.b[^\n]*
>>> +** ...
>>> +** ret
>>> +*/
>>> +TEST_UNIFORM_Z (ptrue_scale_m3_bf16_x_tied1, svbfloat16_t,
>>> + z0 = svscale_n_bf16_x (svptrue_b16 (), z0, -3),
>>> + z0 = svscale_x (svptrue_b16 (), z0, -3))
>>> +
>>> +/*
>>> +** ptrue_scale_m3_bf16_x_untied:
>>> +** ...
>>> +** ptrue p[0-9]+\.b[^\n]*
>>> +** ...
>>> +** ret
>>> +*/
>>> +TEST_UNIFORM_Z (ptrue_scale_m3_bf16_x_untied, svbfloat16_t,
>>> + z0 = svscale_n_bf16_x (svptrue_b16 (), z1, -3),
>>> + z0 = svscale_x (svptrue_b16 (), z1, -3))
>>> +
>>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/bfscale.c
>>> b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/bfscale.c
>>> new file mode 100644
>>> index 00000000000..051ff47b3bc
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/bfscale.c
>>> @@ -0,0 +1,114 @@
>>> +// { dg-options "-std=c23 -fsyntax-only" }
>>> +// { dg-do compile }
>>> +
>>> +#pragma GCC target "+sve,+sve2,+sme,+sme2,+sve-bfscale"
>>> +static_assert (__ARM_FEATURE_SVE2 == 1);
>>> +static_assert (__ARM_FEATURE_SME2 == 1);
>>> +static_assert (__ARM_FEATURE_SVE_BFSCALE == 1);
>>> +#include <arm_sve.h>
>>> +#include <arm_sme.h>
>>> +
>>> +/*
>>> +- BFSCALE (predicated)
>>> + // Only if __ARM_FEATURE_SVE_BFSCALE != 0 && __ARM_FEATURE_SVE2 != 0
>>> + svbfloat16_t svscale[_bf16]_m (svbool_t pg, svbfloat16_t zdn, svint16_t
>>> zm);
>>> + svbfloat16_t svscale[_bf16]_x (svbool_t pg, svbfloat16_t zdn, svint16_t
>>> zm);
>>> + svbfloat16_t svscale[_bf16]_z (svbool_t pg, svbfloat16_t zdn, svint16_t
>>> zm);
>>> + svbfloat16_t svscale[_n_bf16]_m (svbool_t pg, svbfloat16_t zdn, int16_t
>>> zm);
>>> + svbfloat16_t svscale[_n_bf16]_x (svbool_t pg, svbfloat16_t zdn, int16_t
>>> zm);
>>> + svbfloat16_t svscale[_n_bf16]_z (svbool_t pg, svbfloat16_t zdn, int16_t
>>> zm); */
>>> +
>>> +void
>>> +svscale_predicated_explicit_ok (svbool_t p, svbfloat16_t bf16x1,
>>> + svint16_t i16x1, int16_t i16)
>>> +{
>>> + bf16x1 = svscale_bf16_m (p, bf16x1, i16x1);
>>> + bf16x1 = svscale_bf16_x (p, bf16x1, i16x1);
>>> + bf16x1 = svscale_bf16_z (p, bf16x1, i16x1);
>>> +
>>> + bf16x1 = svscale_n_bf16_m (p, bf16x1, i16);
>>> + bf16x1 = svscale_n_bf16_x (p, bf16x1, i16);
>>> + bf16x1 = svscale_n_bf16_z (p, bf16x1, i16);
>>> +}
>>> +
>>> +void
>>> +svscale_predicated_inferred_ok (svbool_t p, svbfloat16_t bf16x1,
>>> + svbfloat16x4_t bf16x4, svint16_t i16x1,
>>> + int16_t i16)
>>> +{
>>> + bf16x1 = svscale_m (p, bf16x1, i16x1);
>>> + bf16x1 = svscale_x (p, bf16x1, i16x1);
>>> + bf16x1 = svscale_z (p, bf16x1, i16x1);
>>> +
>>> + bf16x1 = svscale_m (p, bf16x1, i16);
>>> + bf16x1 = svscale_x (p, bf16x1, i16);
>>> + bf16x1 = svscale_z (p, bf16x1, i16);
>>> +}
>>> +
>>> +/*
>>> +- BFSCALE (multiple vectors)
>>> + // Only if __ARM_FEATURE_SVE_BFSCALE != 0 && __ARM_FEATURE_SME2 != 0
>>> + svbfloat16x2_t svscale[_bf16_x2] (svbfloat16x2_t zdn, svint16x2_t zm)
>>> __arm_streaming;
>>> + svbfloat16x4_t svscale[_bf16_x4] (svbfloat16x4_t zdn, svint16x4_t zm)
>>> __arm_streaming;
>>> +
>>> +- BFSCALE (multiple and single vector)
>>> + // Only if __ARM_FEATURE_SVE_BFSCALE != 0 && __ARM_FEATURE_SME2 != 0
>>> + svbfloat16x2_t svscale[_single_bf16_x2] (svbfloat16x2_t zn, svint16_t
>>> zm) __arm_streaming;
>>> + svbfloat16x4_t svscale[_single_bf16_x4] (svbfloat16x4_t zn, svint16_t
>>> zm) __arm_streaming; */
>>> +
>>> +void
>>> +svscale_explicit_ok (svbfloat16_t bf16x1, svbfloat16x2_t bf16x2,
>>> + svbfloat16x4_t bf16x4, svint16_t i16x1, svint16x2_t i16x2,
>>> + svint16x4_t i16x4) __arm_streaming
>>> +{
>>> + bf16x2 = svscale_bf16_x2 (bf16x2, i16x2);
>>> + bf16x4 = svscale_bf16_x4 (bf16x4, i16x4);
>>> +
>>> + bf16x2 = svscale_single_bf16_x2 (bf16x2, i16x1);
>>> + bf16x4 = svscale_single_bf16_x4 (bf16x4, i16x1);
>>> +}
>>> +
>>> +void
>>> +svscale_inferred_ok (svbfloat16x2_t bf16x2, svbfloat16x4_t bf16x4,
>>> + svint16_t i16x1, svint16x2_t i16x2,
>>> + svint16x4_t i16x4) __arm_streaming
>>> +{
>>> + bf16x2 = svscale_bf16_x2 (bf16x2, i16x2);
>>> + bf16x4 = svscale_bf16_x4 (bf16x4, i16x4);
>>> +
>>> + bf16x2 = svscale_single_bf16_x2 (bf16x2, i16x1);
>>> + bf16x4 = svscale_single_bf16_x4 (bf16x4, i16x1);
>>> +}
>>> +
>>> +/*
>>> +- BFMUL (multiple vectors)
>>> + // Only if __ARM_FEATURE_SVE_BFSCALE != 0 && __ARM_FEATURE_SME2 != 0
>>> + svbfloat16x2_t svmul[_bf16_x2] (svbfloat16x2_t zdn, svbfloat16x2_t zm)
>>> __arm_streaming;
>>> + svbfloat16x4_t svmul[_bf16_x4] (svbfloat16x4_t zdn, svbfloat16x4_t zm)
>>> __arm_streaming;
>>> +
>>> +- BFMUL (multiple and single vector)
>>> + // Only if __ARM_FEATURE_SVE_BFSCALE != 0 && __ARM_FEATURE_SME2 != 0
>>> + svbfloat16x2_t svmul[_single_bf16_x2] (svbfloat16x2_t zn, svbfloat16x2_t
>>> zm) __arm_streaming;
>>> + svbfloat16x4_t svmul[_single_bf16_x4] (svbfloat16x4_t zn, svbfloat16x4_t
>>> zm) __arm_streaming; */
>>> +
>>> +void
>>> +svmul_explicit_ok (svbfloat16_t bf16x1, svbfloat16x2_t bf16x2,
>>> + svbfloat16x4_t bf16x4) __arm_streaming
>>> +{
>>> + svmul_bf16_x2 (bf16x2, bf16x2);
>>> + svmul_bf16_x4 (bf16x4, bf16x4);
>>> +
>>> + svmul_single_bf16_x2 (bf16x2, bf16x1);
>>> + svmul_single_bf16_x4 (bf16x4, bf16x1);
>>> +}
>>> +
>>> +void
>>> +svmul_inferred_ok (svbfloat16_t bf16x1, svbfloat16x2_t bf16x2,
>>> + svbfloat16x4_t bf16x4) __arm_streaming
>>> +{
>>> + svmul (bf16x2, bf16x2);
>>> + svmul (bf16x4, bf16x4);
>>> +
>>> + svmul (bf16x2, bf16x1);
>>> + svmul (bf16x4, bf16x1);
>>> +}
>>> diff --git a/gcc/testsuite/lib/target-supports.exp
>>> b/gcc/testsuite/lib/target-supports.exp
>>> index 2b450669c3d..87c21664422 100644
>>> --- a/gcc/testsuite/lib/target-supports.exp
>>> +++ b/gcc/testsuite/lib/target-supports.exp
>>> @@ -12682,7 +12682,7 @@ set exts {
>>> set exts_sve2 {
>>> "sme-f8f16" "sme-f8f32"
>>> "sme-b16b16" "sme-f16f16" "sme-i16i64" "sme" "sme2" "sme2p1"
>>> - "ssve-fp8dot2" "ssve-fp8dot4" "ssve-fp8fma"
>>> + "ssve-fp8dot2" "ssve-fp8dot4" "ssve-fp8fma" "sve-bfscale"
>>> }
>>>
>>> foreach { aarch64_ext } $exts {
>>> --
>>> 2.43.0
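
Not something for the patch itself, but for anyone reading along, here is a
minimal usage sketch of the new predicated form exercised by scale_bf16.c
above.  The helper name is made up, and I have added +bf16 explicitly for the
plain BF16 loads/stores; svscale_n_bf16_x is the intrinsic added by this
patch, everything else is standard ACLE:

#pragma GCC target "+sve2,+bf16,+sve-bfscale"
#include <arm_sve.h>

/* Hypothetical example: multiply each element of SRC by 2^EXP and store
   the result in DST.  svscale_n_bf16_x is the "_n" (scalar exponent)
   predicated form added by this patch; the loads, stores and while-based
   predication are standard ACLE.  */
void
scale_by_pow2 (bfloat16_t *dst, const bfloat16_t *src, int16_t exp, int64_t n)
{
  for (int64_t i = 0; i < n; i += svcnth ())
    {
      svbool_t pg = svwhilelt_b16 (i, n);
      svbfloat16_t v = svld1_bf16 (pg, src + i);
      svst1_bf16 (pg, dst + i, svscale_n_bf16_x (pg, v, exp));
    }
}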
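
Likewise, a hypothetical streaming-mode sketch of the SME2 multi-vector forms
(the function and the mul-then-scale pairing are purely illustrative, using
the same target pragma as the general-c test):

#pragma GCC target "+sve,+sve2,+sme,+sme2,+sve-bfscale"
#include <arm_sme.h>

/* Hypothetical example: multiply two pairs of BF16 vectors lane-wise
   (BFMUL, multiple vectors) and then scale each product by 2^EXP
   (BFSCALE, multiple vectors).  Both intrinsics require streaming mode.  */
svbfloat16x2_t
mul_then_scale (svbfloat16x2_t a, svbfloat16x2_t b, svint16x2_t exp)
  __arm_streaming
{
  svbfloat16x2_t prod = svmul_bf16_x2 (a, b);
  return svscale_bf16_x2 (prod, exp);
}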
