+ "<b>fscale\t%0, %1, %2"
)
+;; FSCALE / BFSCALE (multiple and single vector)
+;; sv{b}floatNx2_t svscale[_single_{b}fN_x2] (sv{b}floatNx2_t zdn, svintN_t zm) __arm_streaming;
+;; sv{b}floatNx4_t svscale[_single_{b}fN_x4] (sv{b}floatNx4_t zdn, svintN_t zm) __arm_streaming;
+;; {B}FSCALE { <Zdn1>.T-<Zdn2>.T }, { <Zdn1>.T-<Zdn2>.T }, <Zm>.T
+;; {B}FSCALE { <Zdn1>.T-<Zdn4>.T }, { <Zdn1>.T-<Zdn4>.T }, <Zm>.T
(define_insn "@aarch64_sve_single_fscale<mode>"
- [(set (match_operand:SVE_Fx24_NOBF 0 "register_operand" "=Uw<vector_count>")
- (unspec:SVE_Fx24_NOBF
- [(match_operand:SVE_Fx24_NOBF 1 "register_operand" "0")
+ [(set (match_operand:SVE_Fx24_BFSCALE 0 "register_operand" "=Uw<vector_count>")
+ (unspec:SVE_Fx24_BFSCALE
+ [(match_operand:SVE_Fx24_BFSCALE 1 "register_operand" "0")
(match_operand:<SVSCALE_SINGLE_INTARG> 2 "register_operand" "x")]
UNSPEC_FSCALE))]
- "TARGET_STREAMING_SME2 && TARGET_FP8"
- "fscale\t%0, %1, %2.<Vetype>"
+ "TARGET_STREAMING_SME2 && (<is_bf16> ? TARGET_SVE_BFSCALE : TARGET_FP8)"
+ "<b>fscale\t%0, %1, %2.<Vetype>"
+)
+
+;; -------------------------------------------------------------------------
+;; ---- [FP] Multiplication
+;; -------------------------------------------------------------------------
+;; Includes the multiple-vector and multiple-and-single-vector forms of
+;; - BFMUL
+;; -------------------------------------------------------------------------
+
+;; BFMUL (multiple vectors)
+;; svbfloat16x2_t svmul[_bf16_x2](svbfloat16x2_t zd, svbfloat16x2_t zm) __arm_streaming;
+;; svbfloat16x4_t svmul[_bf16_x4](svbfloat16x4_t zd, svbfloat16x4_t zm) __arm_streaming;
+;; BFMUL { <Zd1>.H-<Zd2>.H }, { <Zn1>.H-<Zn2>.H }, { <Zm1>.H-<Zm2>.H }
+;; BFMUL { <Zd1>.H-<Zd4>.H }, { <Zn1>.H-<Zn4>.H }, { <Zm1>.H-<Zm4>.H }
+(define_insn "@aarch64_sve_<optab><mode>"
+ [(set (match_operand:SVE_BFx24 0 "register_operand" "=Uw<vector_count>")
+ (unspec:SVE_BFx24
+ [(match_operand:SVE_BFx24 1 "register_operand" "Uw<vector_count>")
+ (match_operand:SVE_BFx24 2 "register_operand" "Uw<vector_count>")]
+ SVE_FP_MUL))]
+ "TARGET_STREAMING_SME2 && TARGET_SVE_BFSCALE"
+ "bfmul\t%0, %1, %2"
+)
+
+;; BFMUL (multiple and single vector)
+;; svbfloat16x2_t svmul[_single_bf16_x2](svbfloat16x2_t zd, svbfloat16_t zm) __arm_streaming;
+;; svbfloat16x4_t svmul[_single_bf16_x4](svbfloat16x4_t zd, svbfloat16_t zm) __arm_streaming;
+;; BFMUL { <Zd1>.H-<Zd2>.H }, { <Zn1>.H-<Zn2>.H }, <Zm>.H
+;; BFMUL { <Zd1>.H-<Zd4>.H }, { <Zn1>.H-<Zn4>.H }, <Zm>.H
+(define_insn "@aarch64_sve_<optab><mode>_single"
+ [(set (match_operand:SVE_BFx24 0 "register_operand" "=Uw<vector_count>")
+ (unspec:SVE_BFx24
+ [(match_operand:SVE_BFx24 1 "register_operand" "Uw<vector_count>")
+ (match_operand:<VSINGLE> 2 "register_operand" "x")]
+ SVE_FP_MUL))]
+ "TARGET_STREAMING_SME2 && TARGET_SVE_BFSCALE"
+ "bfmul\t%0, %1, %2.h"
)
;; =========================================================================
@@ -4704,4 +4753,3 @@ (define_insn "@aarch64_sve2_<sve_fp_op><VNx4SF_ONLY:mode><VNx8HF_ONLY:mode>"
}
[(set_attr "sve_type" "sve_fp_mul")]
)
-
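For illustration only (not part of the patch): a minimal sketch of how the new
bf16 multi-vector intrinsics wired up above might be used from SME2 streaming
code.  The function and variable names are hypothetical; the intrinsic names
and types follow the comments and tests in this series, and the sketch assumes
a compilation with SME2 and the new extension enabled (for instance
-march=armv9-a+sme2+sve-bfscale).

#include <arm_sve.h>
#include <arm_sme.h>

/* Scale each element of a pair of bf16 vectors by 2^exponent using the
   per-element integer exponents (BFSCALE, multiple vectors), then multiply
   both vectors by a single bf16 operand (BFMUL, multiple and single).  */
svbfloat16x2_t
scale_then_mul (svbfloat16x2_t acc, svint16x2_t exponents, svbfloat16_t m)
  __arm_streaming
{
  acc = svscale_bf16_x2 (acc, exponents);
  return svmul_single_bf16_x2 (acc, m);
}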
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 1dd942f377f..21f9682ef97 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -385,6 +385,7 @@ constexpr auto AARCH64_FL_DEFAULT_ISA_MODE ATTRIBUTE_UNUSED
#define TARGET_BF16_FP AARCH64_HAVE_ISA (BF16)
#define TARGET_BF16_SIMD (TARGET_BF16_FP && TARGET_SIMD)
#define TARGET_SVE_BF16 (TARGET_BF16_FP && TARGET_SVE)
+#define TARGET_SVE_BFSCALE AARCH64_HAVE_ISA (SVE_BFSCALE)
/* PAUTH instructions are enabled through +pauth. */
#define TARGET_PAUTH AARCH64_HAVE_ISA (PAUTH)
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index b425b0ed2ca..39b1e84edcc 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -501,10 +501,13 @@ (define_mode_iterator SVE_PARTIAL_F [VNx2HF VNx4HF VNx2SF])
(define_mode_iterator SVE_F [SVE_PARTIAL_F SVE_FULL_F])
;; Fully-packed SVE floating-point vector modes and their scalar equivalents.
-(define_mode_iterator SVE_FULL_F_SCALAR [SVE_FULL_F GPF_HF])
+(define_mode_iterator SVE_FULL_F_SCALAR [SVE_FULL_F GPF_HF (VNx8BF "TARGET_SVE_BFSCALE")])
(define_mode_iterator SVE_FULL_F_B16B16 [(VNx8BF "TARGET_SSVE_B16B16")
SVE_FULL_F])
+(define_mode_iterator SVE_FULL_F_BFSCALE [SVE_FULL_F
+ (VNx8BF "TARGET_SVE_BFSCALE")])
+
(define_mode_iterator SVE_PARTIAL_F_B16B16 [(VNx2BF "TARGET_SSVE_B16B16")
(VNx4BF "TARGET_SSVE_B16B16")
SVE_PARTIAL_F])
@@ -746,10 +749,18 @@ (define_mode_iterator SVE_Ix24 [VNx32QI VNx16HI VNx8SI VNx4DI
(define_mode_iterator SVE_Fx24_NOBF [VNx16HF VNx8SF VNx4DF
VNx32HF VNx16SF VNx8DF])
+(define_mode_iterator SVE_Fx24_BFSCALE [
+ SVE_Fx24_NOBF
+ (VNx16BF "TARGET_SVE_BFSCALE") ;; bf16x2
+ (VNx32BF "TARGET_SVE_BFSCALE") ;; bf16x4
+])
+
(define_mode_iterator SVE_Fx24 [(VNx16BF "TARGET_SSVE_B16B16")
(VNx32BF "TARGET_SSVE_B16B16")
SVE_Fx24_NOBF])
+(define_mode_iterator SVE_BFx24 [VNx16BF VNx32BF])
+
(define_mode_iterator SVE_SFx24 [VNx8SF VNx16SF])
;; The modes used to represent different ZA access sizes.
@@ -824,6 +835,7 @@ (define_c_enum "unspec"
UNSPEC_FMAX ; Used in aarch64-simd.md.
UNSPEC_FMAXNMV ; Used in aarch64-simd.md.
UNSPEC_FMAXV ; Used in aarch64-simd.md.
+ UNSPEC_FMUL ; Used in aarch64-sve2.md.
UNSPEC_FMIN ; Used in aarch64-simd.md.
UNSPEC_FMINNMV ; Used in aarch64-simd.md.
UNSPEC_FMINV ; Used in aarch64-simd.md.
@@ -2211,6 +2223,8 @@ (define_mode_attr V_INT_EQUIV [(V8QI "V8QI") (V16QI "V16QI")
(VNx16QI "VNx16QI")
(VNx8HI "VNx8HI") (VNx8HF "VNx8HI")
(VNx8BF "VNx8HI")
+ (VNx16BF "VNx16HI")
+ (VNx32BF "VNx32HI")
(VNx4SI "VNx4SI") (VNx4SF "VNx4SI")
(VNx2DI "VNx2DI") (VNx2DF "VNx2DI")
(VNx8SF "VNx8SI") (VNx16SF "VNx16SI")
@@ -2792,17 +2806,20 @@ (define_mode_attr vec_or_offset [(V8QI "vec") (V16QI "vec") (V4HI "vec")
(V8HI "vec") (V2SI "vec") (V4SI "vec")
(V2DI "vec") (DI "offset")])
-(define_mode_attr b [(V4BF "b") (V4HF "") (V8BF "b") (V8HF "")
+(define_mode_attr b [(BF "b") (HF "") (SF "") (DF "")
+ (V4BF "b") (V4HF "") (V8BF "b") (V8HF "")
(VNx2BF "b") (VNx2HF "") (VNx2SF "")
(VNx4BF "b") (VNx4HF "") (VNx4SF "")
(VNx8BF "b") (VNx8HF "") (VNx2DF "")
(VNx16BF "b") (VNx16HF "") (VNx8SF "") (VNx4DF "")
(VNx32BF "b") (VNx32HF "") (VNx16SF "") (VNx8DF "")])
-(define_mode_attr is_bf16 [(VNx2BF "true") (VNx4BF "true") (VNx8BF "true")
- (VNx2HF "false") (VNx4HF "false") (VNx8HF "false")
- (VNx2SF "false") (VNx4SF "false")
- (VNx2DF "false")])
+(define_mode_attr is_bf16 [
+ (VNx2BF "true") (VNx4BF "true") (VNx8BF "true") (VNx16BF "true") (VNx32BF "true")
+ (VNx2HF "false") (VNx4HF "false") (VNx8HF "false") (VNx16HF "false") (VNx32HF "false")
+ (VNx2SF "false") (VNx4SF "false") (VNx8SF "false") (VNx16SF "false")
+ (VNx2DF "false") (VNx4DF "false") (VNx8DF "false")
+])
(define_mode_attr aligned_operand [(VNx16QI "register_operand")
(VNx8HI "register_operand")
@@ -2829,22 +2846,29 @@ (define_mode_attr LD1_EXTENDQ_MEM [(VNx4SI "VNx1SI") (VNx4SF "VNx1SI")
;; Maps the output type of svscale to the corresponding int vector type in the
;; second argument.
-(define_mode_attr SVSCALE_SINGLE_INTARG [(VNx16HF "VNx8HI") ;; f16_x2 -> s16
- (VNx32HF "VNx8HI") ;; f16_x4 -> s16
- (VNx8SF "VNx4SI") ;; f32_x2 -> s32
- (VNx16SF "VNx4SI") ;; f32_x4 -> s32
- (VNx4DF "VNx2DI") ;; f64_x2 -> s64
- (VNx8DF "VNx2DI") ;; f64_x4 -> s64
+(define_mode_attr SVSCALE_SINGLE_INTARG [
+ (VNx16HF "VNx8HI") ;; f16_x2 -> s16
+ (VNx32HF "VNx8HI") ;; f16_x4 -> s16
+ (VNx16BF "VNx8HI") ;; bf16_x2 -> s16
+ (VNx32BF "VNx8HI") ;; bf16_x4 -> s16
+ (VNx8SF "VNx4SI") ;; f32_x2 -> s32
+ (VNx16SF "VNx4SI") ;; f32_x4 -> s32
+ (VNx4DF "VNx2DI") ;; f64_x2 -> s64
+ (VNx8DF "VNx2DI") ;; f64_x4 -> s64
])
-(define_mode_attr SVSCALE_INTARG [(VNx16HF "VNx16HI") ;; f16_x2 -> s16x2
- (VNx32HF "VNx32HI") ;; f16_x4 -> s16x4
- (VNx8SF "VNx8SI") ;; f32_x2 -> s32_x2
- (VNx16SF "VNx16SI") ;; f32_x4 -> s32_x4
- (VNx4DF "VNx4DI") ;; f64_x2 -> s64_x2
- (VNx8DF "VNx8DI") ;; f64_x4 -> s64_x4
+(define_mode_attr SVSCALE_INTARG [
+ (VNx16HF "VNx16HI") ;; f16_x2 -> s16x2
+ (VNx32HF "VNx32HI") ;; f16_x4 -> s16x4
+ (VNx16BF "VNx16HI") ;; bf16_x2 -> s16x2
+ (VNx32BF "VNx32HI") ;; bf16_x4 -> s16x4
+ (VNx8SF "VNx8SI") ;; f32_x2 -> s32_x2
+ (VNx16SF "VNx16SI") ;; f32_x4 -> s32_x4
+ (VNx4DF "VNx4DI") ;; f64_x2 -> s64_x2
+ (VNx8DF "VNx8DI") ;; f64_x4 -> s64_x4
])
+
;; -------------------------------------------------------------------
;; Code Iterators
;; -------------------------------------------------------------------
@@ -3644,6 +3668,8 @@ (define_int_iterator SVE_COND_FP_ADD [UNSPEC_COND_FADD])
(define_int_iterator SVE_COND_FP_SUB [UNSPEC_COND_FSUB])
(define_int_iterator SVE_COND_FP_MUL [UNSPEC_COND_FMUL])
+(define_int_iterator SVE_FP_MUL [UNSPEC_FMUL])
+
(define_int_iterator SVE_COND_FP_BINARY_I1 [UNSPEC_COND_FMAX
UNSPEC_COND_FMAXNM
UNSPEC_COND_FMIN
@@ -4205,6 +4231,7 @@ (define_int_attr optab [(UNSPEC_ANDF "and")
(UNSPEC_FMINNMQV "fminnmqv")
(UNSPEC_FMINNMV "smin")
(UNSPEC_FMINV "smin_nan")
+ (UNSPEC_FMUL "fmul")
(UNSPEC_SMUL_HIGHPART "smulh")
(UNSPEC_UMUL_HIGHPART "umulh")
(UNSPEC_FMLA "fma")
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 87c0470c3db..bb43bf27658 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -23621,7 +23621,8 @@ Enable the Checked Pointer Arithmetic instructions.
@item sve-b16b16
Enable the SVE non-widening brain floating-point (@code{bf16}) extension.
This only has an effect when @code{sve2} or @code{sme2} are also enabled.
-
+@item sve-bfscale
+Enable the SVE BFSCALE extension.  This extension provides scaling and
+multiplication instructions for the brain floating-point (@code{bf16}) format.
@end table
Feature @option{crypto} implies @option{aes}, @option{sha2}, and @option{simd},
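For reference (illustrative, not part of the patch): like the other optional
extensions in this table, the new modifier is expected to be usable either
globally, e.g. @option{-march=armv9-a+sme2+sve-bfscale}, or per translation
unit via the target pragma that the new tests below rely on:

#pragma GCC target "+sve-bfscale"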
diff --git a/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_4.c b/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_4.c
index 284c2a23252..70f59b47aee 100644
--- a/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/pragma_cpp_predefs_4.c
@@ -111,6 +111,11 @@
#error Foo
#endif
+#pragma GCC target "+nothing+sve-bfscale"
+#ifndef __ARM_FEATURE_SVE_BFSCALE
+#error "__ARM_FEATURE_SVE_BFSCALE should be defined but isn't"
+#endif
+
#pragma GCC target "+nothing+sve2+sme-f8f16"
#ifndef __ARM_FEATURE_SME_F8F16
#error Foo
diff --git a/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/mul_bf16_x2.c b/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/mul_bf16_x2.c
new file mode 100644
index 00000000000..d9e6ad1bb4f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/mul_bf16_x2.c
@@ -0,0 +1,193 @@
+/* { dg-do assemble { target aarch64_asm_sve-bfscale_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sve-bfscale_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sme2_acle.h"
+#pragma GCC target "+sve-bfscale"
+
+/*
+** mul_z0_z0_z4:
+** bfmul {z0\.h - z1\.h}, {z0\.h - z1\.h}, {z4\.h - z5\.h}
+** ret
+*/
+TEST_XN (mul_z0_z0_z4, svbfloat16x2_t, z0,
+ svmul_bf16_x2 (z0, z4),
+ svmul (z0, z4))
+
+/*
+** mul_z0_z4_z0:
+** bfmul {z0\.h - z1\.h}, {z4\.h - z5\.h}, {z0\.h - z1\.h}
+** ret
+*/
+TEST_XN (mul_z0_z4_z0, svbfloat16x2_t, z0,
+ svmul_bf16_x2 (z4, z0),
+ svmul (z4, z0))
+
+/*
+** mul_z0_z4_z28:
+** bfmul {z0\.h - z1\.h}, {z4\.h - z5\.h}, {z28\.h - z29\.h}
+** ret
+*/
+TEST_XN (mul_z0_z4_z28, svbfloat16x2_t, z0,
+ svmul_bf16_x2 (z4, z28),
+ svmul (z4, z28))
+
+/*
+** mul_z18_z18_z4:
+** bfmul {z18\.h - z19\.h}, {z18\.h - z19\.h}, {z4\.h - z5\.h}
+** ret
+*/
+TEST_XN (mul_z18_z18_z4, svbfloat16x2_t, z18,
+ svmul_bf16_x2 (z18, z4),
+ svmul (z18, z4))
+
+/*
+** mul_z23_z23_z18:
+** mov [^\n]+
+** mov [^\n]+
+** bfmul [^\n]+, {z18\.h - z19\.h}
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN (mul_z23_z23_z18, svbfloat16x2_t, z23,
+ svmul_bf16_x2 (z23, z18),
+ svmul (z23, z18))
+
+/*
+** mul_z28_z28_z0:
+** bfmul {z28\.h - z29\.h}, {z28\.h - z29\.h}, {z0\.h - z1\.h}
+** ret
+*/
+TEST_XN (mul_z28_z28_z0, svbfloat16x2_t, z28,
+ svmul_bf16_x2 (z28, z0),
+ svmul (z28, z0))
+
+/*
+** mul_z0_z0_z18:
+** bfmul {z0\.h - z1\.h}, {z0\.h - z1\.h}, {z18\.h - z19\.h}
+** ret
+*/
+TEST_XN (mul_z0_z0_z18, svbfloat16x2_t, z0,
+ svmul_bf16_x2 (z0, z18),
+ svmul (z0, z18))
+
+/*
+** mul_z4_z4_z23:
+** (
+** mov [^\n]+
+** mov [^\n]+
+** bfmul {z4\.h - z5\.h}, {z4\.h - z5\.h}, [^\n]+
+** |
+** bfmul {z4\.h - z5\.h}, {z4\.h - z5\.h}, [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** )
+** ret
+*/
+TEST_XN (mul_z4_z4_z23, svbfloat16x2_t, z4,
+ svmul_bf16_x2 (z4, z23),
+ svmul (z4, z23))
+
+/*
+** mul_single_z24_z24_z0:
+** bfmul {z24\.h - z25\.h}, {z24\.h - z25\.h}, z0\.h
+** ret
+*/
+TEST_XN_SINGLE (mul_single_z24_z24_z0, svbfloat16x2_t, svbfloat16_t, z24,
+ svmul_single_bf16_x2 (z24, z0),
+ svmul (z24, z0))
+
+/*
+** mul_single_z24_z28_z0:
+** bfmul {z24\.h - z25\.h}, {z28\.h - z29\.h}, z0\.h
+** ret
+*/
+TEST_XN_SINGLE (mul_single_z24_z28_z0, svbfloat16x2_t, svbfloat16_t, z24,
+ svmul_single_bf16_x2 (z28, z0),
+ svmul (z28, z0))
+
+/*
+** mul_single_z24_z1_z0:
+** (
+** mov z30\.d, z1\.d
+** mov z31\.d, z2\.d
+** |
+** mov z31\.d, z2\.d
+** mov z30\.d, z1\.d
+** )
+** bfmul {z24\.h - z25\.h}, {z30\.h - z31\.h}, z0\.h
+** ret
+*/
+TEST_XN_SINGLE (mul_single_z24_z1_z0, svbfloat16x2_t, svbfloat16_t, z24,
+ svmul_single_bf16_x2 (z1, z0),
+ svmul (z1, z0))
+
+/*
+** mul_single_z1_z24_z0:
+** bfmul {z30\.h - z31\.h}, {z24\.h - z25\.h}, z0\.h
+** (
+** mov z2\.d, z31\.d
+** mov z1\.d, z30\.d
+** |
+** mov z1\.d, z30\.d
+** mov z2\.d, z31\.d
+** )
+** ret
+*/
+TEST_XN_SINGLE (mul_single_z1_z24_z0, svbfloat16x2_t, svbfloat16_t, z1,
+ svmul_single_bf16_x2 (z24, z0),
+ svmul (z24, z0))
+
+/*
+** mul_single_z1_z1_z0:
+** mov [^\n]+
+** mov [^\n]+
+** bfmul ({z[0-9]+\.h - z[0-9]+\.h}), \1, z0\.h
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN_SINGLE (mul_single_z1_z1_z0, svbfloat16x2_t, svbfloat16_t, z1,
+ svmul_single_bf16_x2 (z1, z0),
+ svmul (z1, z0))
+
+/*
+** mul_single_z18_z18_z0:
+** bfmul {z18\.h - z19\.h}, {z18\.h - z19\.h}, z0\.h
+** ret
+*/
+TEST_XN_SINGLE (mul_single_z18_z18_z0, svbfloat16x2_t, svbfloat16_t, z18,
+ svmul_single_bf16_x2 (z18, z0),
+ svmul (z18, z0))
+
+/*
+** mul_single_awkward:
+** ...
+** bfmul {z0\.h - z1\.h}, {z30\.h - z31\.h}, z[0-9]+\.h
+** ret
+*/
+TEST_XN_SINGLE_AWKWARD (mul_single_awkward, svbfloat16x2_t, svbfloat16_t,
+ z0_res = svmul_single_bf16_x2 (z1, z0),
+ z0_res = svmul (z1, z0))
+
+/*
+** mul_single_z0_z0_z15:
+** ...
+** bfmul {z0\.h - z1\.h}, {z0\.h - z1\.h}, z15\.h
+** ...
+** ret
+*/
+TEST_XN_SINGLE_Z15 (mul_single_z0_z0_z15, svbfloat16x2_t, svbfloat16_t,
+ z0 = svmul_single_bf16_x2 (z0, z15),
+ z0 = svmul (z0, z15))
+
+/*
+** mul_single_z24_z24_z16:
+** mov (z[0-7])\.d, z16\.d
+** bfmul {z24\.h - z25\.h}, {z24\.h - z25\.h}, \1\.h
+** ret
+*/
+TEST_XN_SINGLE (mul_single_z24_z24_z16, svbfloat16x2_t, svbfloat16_t, z24,
+ svmul_single_bf16_x2 (z24, z16),
+ svmul (z24, z16))
diff --git a/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/mul_bf16_x4.c b/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/mul_bf16_x4.c
new file mode 100644
index 00000000000..8da388e5e12
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/mul_bf16_x4.c
@@ -0,0 +1,227 @@
+/* { dg-do assemble { target aarch64_asm_sve-bfscale_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sve-bfscale_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sme2_acle.h"
+#pragma GCC target "+sve-bfscale"
+
+/*
+** mul_z0_z0_z4:
+** bfmul {z0\.h - z3\.h}, {z0\.h - z3\.h}, {z4\.h - z7\.h}
+** ret
+*/
+TEST_XN (mul_z0_z0_z4, svbfloat16x4_t, z0,
+ svmul_bf16_x4 (z0, z4),
+ svmul (z0, z4))
+
+/*
+** mul_z0_z4_z0:
+** bfmul {z0\.h - z3\.h}, {z4\.h - z7\.h}, {z0\.h - z3\.h}
+** ret
+*/
+TEST_XN (mul_z0_z4_z0, svbfloat16x4_t, z0,
+ svmul_bf16_x4 (z4, z0),
+ svmul (z4, z0))
+
+/*
+** mul_z0_z4_z28:
+** bfmul {z0\.h - z3\.h}, {z4\.h - z7\.h}, {z28\.h - z31\.h}
+** ret
+*/
+TEST_XN (mul_z0_z4_z28, svbfloat16x4_t, z0,
+ svmul_bf16_x4 (z4, z28),
+ svmul (z4, z28))
+
+/*
+** mul_z18_z18_z4:
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfmul [^\n]+, {z4\.h - z7\.h}
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN (mul_z18_z18_z4, svbfloat16x4_t, z18,
+ svmul_bf16_x4 (z18, z4),
+ svmul (z18, z4))
+
+/*
+** mul_z23_z23_z28:
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfmul [^\n]+, {z28\.h - z31\.h}
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN (mul_z23_z23_z28, svbfloat16x4_t, z23,
+ svmul_bf16_x4 (z23, z28),
+ svmul (z23, z28))
+
+/*
+** mul_z28_z28_z0:
+** bfmul {z28\.h - z31\.h}, {z28\.h - z31\.h}, {z0\.h - z3\.h}
+** ret
+*/
+TEST_XN (mul_z28_z28_z0, svbfloat16x4_t, z28,
+ svmul_bf16_x4 (z28, z0),
+ svmul (z28, z0))
+
+/*
+** mul_z0_z0_z18:
+** (
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfmul {z0\.h - z3\.h}, {z0\.h - z3\.h}, [^\n]+
+** |
+** bfmul {z0\.h - z3\.h}, {z0\.h - z3\.h}, [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** )
+** ret
+*/
+TEST_XN (mul_z0_z0_z18, svbfloat16x4_t, z0,
+ svmul_bf16_x4 (z0, z18),
+ svmul (z0, z18))
+
+/*
+** mul_z4_z4_z23:
+** (
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfmul {z4\.h - z7\.h}, {z4\.h - z7\.h}, [^\n]+
+** |
+** bfmul {z4\.h - z7\.h}, {z4\.h - z7\.h}, [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** )
+** ret
+*/
+TEST_XN (mul_z4_z4_z23, svbfloat16x4_t, z4,
+ svmul_bf16_x4 (z4, z23),
+ svmul (z4, z23))
+
+/*
+** mul_single_z24_z24_z0:
+** bfmul {z24\.h - z27\.h}, {z24\.h - z27\.h}, z0\.h
+** ret
+*/
+TEST_XN_SINGLE (mul_single_z24_z24_z0, svbfloat16x4_t, svbfloat16_t, z24,
+ svmul_single_bf16_x4 (z24, z0),
+ svmul (z24, z0))
+
+/*
+** mul_single_z24_z28_z0:
+** bfmul {z24\.h - z27\.h}, {z28\.h - z31\.h}, z0\.h
+** ret
+*/
+TEST_XN_SINGLE (mul_single_z24_z28_z0, svbfloat16x4_t, svbfloat16_t, z24,
+ svmul_single_bf16_x4 (z28, z0),
+ svmul (z28, z0))
+
+/*
+** mul_single_z24_z1_z0:
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfmul {z24\.h - z27\.h}, {z28\.h - z31\.h}, z0\.h
+** ret
+*/
+TEST_XN_SINGLE (mul_single_z24_z1_z0, svbfloat16x4_t, svbfloat16_t, z24,
+ svmul_single_bf16_x4 (z1, z0),
+ svmul (z1, z0))
+
+/*
+** mul_single_z1_z24_z0:
+** bfmul {z28\.h - z31\.h}, {z24\.h - z27\.h}, z0\.h
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN_SINGLE (mul_single_z1_z24_z0, svbfloat16x4_t, svbfloat16_t, z1,
+ svmul_single_bf16_x4 (z24, z0),
+ svmul (z24, z0))
+
+/*
+** mul_single_z1_z1_z0:
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfmul ({z[0-9]+\.h - z[0-9]+\.h}), \1, z0\.h
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN_SINGLE (mul_single_z1_z1_z0, svbfloat16x4_t, svbfloat16_t, z1,
+ svmul_single_bf16_x4 (z1, z0),
+ svmul (z1, z0))
+
+/*
+** mul_single_z18_z18_z0:
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfmul [^\n]+, z0\.h
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN_SINGLE (mul_single_z18_z18_z0, svbfloat16x4_t, svbfloat16_t, z18,
+ svmul_single_bf16_x4 (z18, z0),
+ svmul (z18, z0))
+
+/*
+** mul_single_awkward:
+** ...
+** bfmul {z0\.h - z3\.h}, {z[0-9]+\.h - z[0-9]+\.h}, z[0-9]+\.h
+** ret
+*/
+TEST_XN_SINGLE_AWKWARD (mul_single_awkward, svbfloat16x4_t, svbfloat16_t,
+ z0_res = svmul_single_bf16_x4 (z1, z0),
+ z0_res = svmul (z1, z0))
+
+/*
+** mul_single_z0_z0_z15:
+** ...
+** bfmul {z0\.h - z3\.h}, {z0\.h - z3\.h}, z15\.h
+** ...
+** ret
+*/
+TEST_XN_SINGLE_Z15 (mul_single_z0_z0_z15, svbfloat16x4_t, svbfloat16_t,
+ z0 = svmul_single_bf16_x4 (z0, z15),
+ z0 = svmul (z0, z15))
+
+/*
+** mul_single_z24_z24_z16:
+** mov (z[0-7])\.d, z16\.d
+** bfmul {z24\.h - z27\.h}, {z24\.h - z27\.h}, \1\.h
+** ret
+*/
+TEST_XN_SINGLE (mul_single_z24_z24_z16, svbfloat16x4_t, svbfloat16_t, z24,
+ svmul_single_bf16_x4 (z24, z16),
+ svmul (z24, z16))
diff --git a/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/scale_bf16_x2.c b/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/scale_bf16_x2.c
new file mode 100644
index 00000000000..33a14e34653
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/scale_bf16_x2.c
@@ -0,0 +1,194 @@
+/* { dg-do assemble { target aarch64_asm_sve-bfscale_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sve-bfscale_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sme2_acle.h"
+#pragma GCC target "+sve-bfscale"
+
+/*
+** bfscale_z0_z0_z4:
+** bfscale {z0\.h - z1\.h}, {z0\.h - z1\.h}, {z4\.h - z5\.h}
+** ret
+*/
+TEST_DUAL_XN (bfscale_z0_z0_z4, svbfloat16x2_t, svint16x2_t, z0,
+ svscale_bf16_x2 (z0, z4),
+ svscale (z0, z4))
+
+/*
+** bfscale_z4_z4_z0:
+** bfscale {z4\.h - z5\.h}, {z4\.h - z5\.h}, {z0\.h - z1\.h}
+** ret
+*/
+TEST_DUAL_XN (bfscale_z4_z4_z0, svint16x2_t, svbfloat16x2_t, z4,
+ svscale_bf16_x2 (z4, z0),
+ svscale (z4, z0))
+
+/*
+** bfscale_z18_z18_z4:
+** bfscale {z18\.h - z19\.h}, {z18\.h - z19\.h}, {z4\.h - z5\.h}
+** ret
+*/
+TEST_DUAL_XN (bfscale_z18_z18_z4, svbfloat16x2_t, svint16x2_t, z18,
+ svscale_bf16_x2 (z18, z4),
+ svscale (z18, z4))
+
+/*
+** bfscale_z23_z23_z18:
+** mov [^\n]+
+** mov [^\n]+
+** bfscale [^\n]+, {z18\.h - z19\.h}
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_DUAL_XN (bfscale_z23_z23_z18, svint16x2_t, svbfloat16x2_t, z23,
+ svscale_bf16_x2 (z23, z18),
+ svscale (z23, z18))
+
+/*
+** bfscale_z28_z28_z4:
+** bfscale {z28\.h - z29\.h}, {z28\.h - z29\.h}, {z4\.h - z5\.h}
+** ret
+*/
+TEST_DUAL_XN (bfscale_z28_z28_z4, svbfloat16x2_t, svint16x2_t, z28,
+ svscale_bf16_x2 (z28, z4),
+ svscale (z28, z4))
+
+/*
+** bfscale_z4_z4_z18:
+** bfscale {z4\.h - z5\.h}, {z4\.h - z5\.h}, {z18\.h - z19\.h}
+** ret
+*/
+TEST_DUAL_XN (bfscale_z4_z4_z18, svint16x2_t, svbfloat16x2_t, z4,
+ svscale_bf16_x2 (z4, z18),
+ svscale (z4, z18))
+
+/*
+** bfscale_z28_28_z23:
+** (
+** mov [^\n]+
+** mov [^\n]+
+** bfscale {z28\.h - z29\.h}, {z28\.h - z29\.h}, [^\n]+
+** |
+** bfscale {z28\.h - z29\.h}, {z28\.h - z29\.h}, [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** )
+** ret
+*/
+TEST_DUAL_XN (bfscale_z28_28_z23, svbfloat16x2_t, svint16x2_t, z28,
+ svscale_bf16_x2 (z28, z23),
+ svscale (z28, z23))
+
+/*
+** bfscale_single_z24_z24_z0:
+** bfscale {z24\.h - z25\.h}, {z24\.h - z25\.h}, z0\.h
+** ret
+*/
+TEST_XN_SINGLE (bfscale_single_z24_z24_z0, svbfloat16x2_t, svint16_t, z24,
+ svscale_single_bf16_x2 (z24, z0),
+ svscale (z24, z0))
+
+/*
+** bfscale_single_z24_z28_z0:
+** (
+** mov [^\n]+
+** mov [^\n]+
+** bfscale {z24\.h - z25\.h}, {z24\.h - z25\.h}, z0\.h
+** |
+** bfscale {z28\.h - z29\.h}, {z28\.h - z29\.h}, z0\.h
+** mov [^\n]+
+** mov [^\n]+
+** )
+** ret
+*/
+TEST_XN_SINGLE (bfscale_single_z24_z28_z0, svbfloat16x2_t, svint16_t, z24,
+ svscale_single_bf16_x2 (z28, z0),
+ svscale (z28, z0))
+
+/*
+** bfscale_single_z24_z1_z0:
+** (
+** mov z24\.d, z1\.d
+** mov z25\.d, z2\.d
+** |
+** mov z25\.d, z2\.d
+** mov z24\.d, z1\.d
+** )
+** bfscale {z24\.h - z25\.h}, {z24\.h - z25\.h}, z0\.h
+** ret
+*/
+TEST_XN_SINGLE (bfscale_single_z24_z1_z0, svbfloat16x2_t, svint16_t, z24,
+ svscale_single_bf16_x2 (z1, z0),
+ svscale (z1, z0))
+
+/*
+** bfscale_single_z1_z24_z0:
+** bfscale {z24\.h - z25\.h}, {z24\.h - z25\.h}, z0\.h
+** (
+** mov z1\.d, z24\.d
+** mov z2\.d, z25\.d
+** |
+** mov z2\.d, z25\.d
+** mov z1\.d, z24\.d
+** )
+** ret
+*/
+TEST_XN_SINGLE (bfscale_single_z1_z24_z0, svbfloat16x2_t, svint16_t, z1,
+ svscale_single_bf16_x2 (z24, z0),
+ svscale (z24, z0))
+
+/*
+** bfscale_single_z1_z1_z0:
+** mov [^\n]+
+** mov [^\n]+
+** bfscale ({z[0-9]+\.h - z[0-9]+\.h}), \1, z0\.h
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN_SINGLE (bfscale_single_z1_z1_z0, svbfloat16x2_t, svint16_t, z1,
+ svscale_single_bf16_x2 (z1, z0),
+ svscale (z1, z0))
+
+/*
+** bfscale_single_z18_z18_z0:
+** bfscale {z18\.h - z19\.h}, {z18\.h - z19\.h}, z0\.h
+** ret
+*/
+TEST_XN_SINGLE (bfscale_single_z18_z18_z0, svbfloat16x2_t, svint16_t, z18,
+ svscale_single_bf16_x2 (z18, z0),
+ svscale (z18, z0))
+
+/*
+** bfscale_single_awkward:
+** ...
+** bfscale ({z[0-9]+\.h - z[0-9]+\.h}), \1, z[0-9]+\.h
+** ...
+** ret
+*/
+TEST_XN_SINGLE_AWKWARD (bfscale_single_awkward, svbfloat16x2_t, svint16_t,
+ z0_res = svscale_single_bf16_x2 (z1, z0),
+ z0_res = svscale (z1, z0))
+
+/*
+** bfscale_single_z0_z0_z15:
+** ...
+** bfscale {z0\.h - z1\.h}, {z0\.h - z1\.h}, z15\.h
+** ...
+** ret
+*/
+TEST_XN_SINGLE_Z15 (bfscale_single_z0_z0_z15, svbfloat16x2_t, svint16_t,
+ z0 = svscale_single_bf16_x2 (z0, z15),
+ z0 = svscale (z0, z15))
+
+/*
+** bfscale_single_z24_z24_z16:
+** mov (z[0-7])\.d, z16\.d
+** bfscale {z24\.h - z25\.h}, {z24\.h - z25\.h}, \1\.h
+** ret
+*/
+TEST_XN_SINGLE (bfscale_single_z24_z24_z16, svbfloat16x2_t, svint16_t, z24,
+ svscale_single_bf16_x2 (z24, z16),
+ svscale (z24, z16))
diff --git a/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/scale_bf16_x4.c b/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/scale_bf16_x4.c
new file mode 100644
index 00000000000..d9e90746a98
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sme2/acle-asm/scale_bf16_x4.c
@@ -0,0 +1,231 @@
+/* { dg-do assemble { target aarch64_asm_sve-bfscale_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sve-bfscale_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sme2_acle.h"
+#pragma GCC target "+sve-bfscale"
+
+/*
+** bfscale_z0_z0_z4:
+** bfscale {z0\.h - z3\.h}, {z0\.h - z3\.h}, {z4\.h - z7\.h}
+** ret
+*/
+TEST_DUAL_XN (bfscale_z0_z0_z4, svbfloat16x4_t, svint16x4_t, z0,
+ svscale_bf16_x4 (z0, z4),
+ svscale (z0, z4))
+
+/*
+** bfscale_z4_z4_z0:
+** bfscale {z4\.h - z7\.h}, {z4\.h - z7\.h}, {z0\.h - z3\.h}
+** ret
+*/
+TEST_DUAL_XN (bfscale_z4_z4_z0, svint16x4_t, svbfloat16x4_t, z4,
+ svscale_bf16_x4 (z4, z0),
+ svscale (z4, z0))
+
+/*
+** bfscale_z18_z18_z4:
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfscale [^\n]+, {z4\.h - z7\.h}
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_DUAL_XN (bfscale_z18_z18_z4, svbfloat16x4_t, svint16x4_t, z18,
+ svscale_bf16_x4 (z18, z4),
+ svscale (z18, z4))
+
+/*
+** bfscale_z23_z23_z28:
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfscale [^\n]+, {z28\.h - z31\.h}
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_DUAL_XN (bfscale_z23_z23_z28, svint16x4_t, svbfloat16x4_t, z23,
+ svscale_bf16_x4 (z23, z28),
+ svscale (z23, z28))
+
+/*
+** bfscale_z28_z28_z4:
+** bfscale {z28\.h - z31\.h}, {z28\.h - z31\.h}, {z4\.h - z7\.h}
+** ret
+*/
+TEST_DUAL_XN (bfscale_z28_z28_z4, svbfloat16x4_t, svint16x4_t, z28,
+ svscale_bf16_x4 (z28, z4),
+ svscale (z28, z4))
+
+/*
+** bfscale_z4_z4_z18:
+** (
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfscale {z4\.h - z7\.h}, {z4\.h - z7\.h}, [^\n]+
+** |
+** bfscale {z4\.h - z7\.h}, {z4\.h - z7\.h}, [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** )
+** ret
+*/
+TEST_DUAL_XN (bfscale_z4_z4_z18, svint16x4_t, svbfloat16x4_t, z4,
+ svscale_bf16_x4 (z4, z18),
+ svscale (z4, z18))
+
+/*
+** bfscale_z0_z0_z23:
+** (
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfscale {z0\.h - z3\.h}, {z0\.h - z3\.h}, [^\n]+
+** |
+** bfscale {z0\.h - z3\.h}, {z0\.h - z3\.h}, [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** )
+** ret
+*/
+TEST_DUAL_XN (bfscale_z0_z0_z23, svbfloat16x4_t, svint16x4_t, z0,
+ svscale_bf16_x4 (z0, z23),
+ svscale (z0, z23))
+
+/*
+** bfscale_single_z24_z24_z0:
+** bfscale {z24\.h - z27\.h}, {z24\.h - z27\.h}, z0\.h
+** ret
+*/
+TEST_XN_SINGLE (bfscale_single_z24_z24_z0, svbfloat16x4_t, svint16_t, z24,
+ svscale_single_bf16_x4 (z24, z0),
+ svscale (z24, z0))
+
+/*
+** bfscale_single_z24_z28_z0:
+** (
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfscale {z24\.h - z27\.h}, {z24\.h - z27\.h}, z0\.h
+** |
+** bfscale {z28\.h - z31\.h}, {z28\.h - z31\.h}, z0\.h
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** )
+** ret
+*/
+TEST_XN_SINGLE (bfscale_single_z24_z28_z0, svbfloat16x4_t, svint16_t, z24,
+ svscale_single_bf16_x4 (z28, z0),
+ svscale (z28, z0))
+
+/*
+** bfscale_single_z24_z1_z0:
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfscale {z24\.h - z27\.h}, {z24\.h - z27\.h}, z0\.h
+** ret
+*/
+TEST_XN_SINGLE (bfscale_single_z24_z1_z0, svbfloat16x4_t, svint16_t, z24,
+ svscale_single_bf16_x4 (z1, z0),
+ svscale (z1, z0))
+
+/*
+** bfscale_single_z1_z24_z0:
+** bfscale {z24\.h - z27\.h}, {z24\.h - z27\.h}, z0\.h
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN_SINGLE (bfscale_single_z1_z24_z0, svbfloat16x4_t, svint16_t, z1,
+ svscale_single_bf16_x4 (z24, z0),
+ svscale (z24, z0))
+
+/*
+** bfscale_single_z1_z1_z0:
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfscale ({z[0-9]+\.h - z[0-9]+\.h}), \1, z0\.h
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN_SINGLE (bfscale_single_z1_z1_z0, svbfloat16x4_t, svint16_t, z1,
+ svscale_single_bf16_x4 (z1, z0),
+ svscale (z1, z0))
+
+/*
+** bfscale_single_z18_z18_z0:
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** bfscale [^\n]+, z0\.h
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** mov [^\n]+
+** ret
+*/
+TEST_XN_SINGLE (bfscale_single_z18_z18_z0, svbfloat16x4_t, svint16_t, z18,
+ svscale_single_bf16_x4 (z18, z0),
+ svscale (z18, z0))
+
+/*
+** bfscale_single_awkward:
+** ...
+** bfscale ({z[0-9]+\.h - z[0-9]+\.h}), \1, z[0-9]+\.h
+** ...
+** ret
+*/
+TEST_XN_SINGLE_AWKWARD (bfscale_single_awkward, svbfloat16x4_t, svint16_t,
+ z0_res = svscale_single_bf16_x4 (z1, z0),
+ z0_res = svscale (z1, z0))
+
+/*
+** bfscale_single_z0_z0_z15:
+** ...
+** bfscale {z0\.h - z3\.h}, {z0\.h - z3\.h}, z15\.h
+** ...
+** ret
+*/
+TEST_XN_SINGLE_Z15 (bfscale_single_z0_z0_z15, svbfloat16x4_t, svint16_t,
+ z0 = svscale_single_bf16_x4 (z0, z15),
+ z0 = svscale (z0, z15))
+
+/*
+** bfscale_single_z24_z24_z16:
+** mov (z[0-7])\.d, z16\.d
+** bfscale {z24\.h - z27\.h}, {z24\.h - z27\.h}, \1\.h
+** ret
+*/
+TEST_XN_SINGLE (bfscale_single_z24_z24_z16, svbfloat16x4_t, svint16_t, z24,
+ svscale_single_bf16_x4 (z24, z16),
+ svscale (z24, z16))
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/scale_bf16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/scale_bf16.c
new file mode 100644
index 00000000000..4e8ff3392ae
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/scale_bf16.c
@@ -0,0 +1,337 @@
+/* { dg-do assemble { target aarch64_asm_sve-bfscale_ok } } */
+/* { dg-do compile { target { ! aarch64_asm_sve-bfscale_ok } } } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+#pragma GCC target "+sve2,+sve-bfscale"
+#ifdef STREAMING_COMPATIBLE
+#pragma GCC target "+sme2"
+#endif
+
+/*
+** scale_bf16_m_tied1:
+** bfscale z0\.h, p0/m, z0\.h, z4\.h
+** ret
+*/
+TEST_DUAL_Z (scale_bf16_m_tied1, svbfloat16_t, svint16_t,
+ z0 = svscale_bf16_m (p0, z0, z4),
+ z0 = svscale_m (p0, z0, z4))
+
+/*
+** scale_bf16_m_tied2:
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z4
+** bfscale z0\.h, p0/m, z0\.h, \1\.h
+** ret
+*/
+TEST_DUAL_Z_REV (scale_bf16_m_tied2, svbfloat16_t, svint16_t,
+ z0_res = svscale_bf16_m (p0, z4, z0),
+ z0_res = svscale_m (p0, z4, z0))
+
+/*
+** scale_bf16_m_untied:
+** movprfx z0, z1
+** bfscale z0\.h, p0/m, z0\.h, z4\.h
+** ret
+*/
+TEST_DUAL_Z (scale_bf16_m_untied, svbfloat16_t, svint16_t,
+ z0 = svscale_bf16_m (p0, z1, z4),
+ z0 = svscale_m (p0, z1, z4))
+
+/*
+** scale_w0_bf16_m_tied1:
+** mov (z[0-9]+\.h), w0
+** bfscale z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZX (scale_w0_bf16_m_tied1, svbfloat16_t, int16_t,
+ z0 = svscale_n_bf16_m (p0, z0, x0),
+ z0 = svscale_m (p0, z0, x0))
+
+/*
+** scale_w0_bf16_m_untied:
+** mov (z[0-9]+\.h), w0
+** movprfx z0, z1
+** bfscale z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZX (scale_w0_bf16_m_untied, svbfloat16_t, int16_t,
+ z0 = svscale_n_bf16_m (p0, z1, x0),
+ z0 = svscale_m (p0, z1, x0))
+
+/*
+** scale_3_bf16_m_tied1:
+** mov (z[0-9]+\.h), #3
+** bfscale z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (scale_3_bf16_m_tied1, svbfloat16_t,
+ z0 = svscale_n_bf16_m (p0, z0, 3),
+ z0 = svscale_m (p0, z0, 3))
+
+/*
+** scale_3_bf16_m_untied:
+** mov (z[0-9]+\.h), #3
+** movprfx z0, z1
+** bfscale z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (scale_3_bf16_m_untied, svbfloat16_t,
+ z0 = svscale_n_bf16_m (p0, z1, 3),
+ z0 = svscale_m (p0, z1, 3))
+
+/*
+** scale_m3_bf16_m:
+** mov (z[0-9]+\.h), #-3
+** bfscale z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (scale_m3_bf16_m, svbfloat16_t,
+ z0 = svscale_n_bf16_m (p0, z0, -3),
+ z0 = svscale_m (p0, z0, -3))
+
+/*
+** scale_bf16_z_tied1:
+** movprfx z0\.h, p0/z, z0\.h
+** bfscale z0\.h, p0/m, z0\.h, z4\.h
+** ret
+*/
+TEST_DUAL_Z (scale_bf16_z_tied1, svbfloat16_t, svint16_t,
+ z0 = svscale_bf16_z (p0, z0, z4),
+ z0 = svscale_z (p0, z0, z4))
+
+/*
+** scale_bf16_z_tied2:
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0\.h, p0/z, z4\.h
+** bfscale z0\.h, p0/m, z0\.h, \1\.h
+** ret
+*/
+TEST_DUAL_Z_REV (scale_bf16_z_tied2, svbfloat16_t, svint16_t,
+ z0_res = svscale_bf16_z (p0, z4, z0),
+ z0_res = svscale_z (p0, z4, z0))
+
+/*
+** scale_bf16_z_untied:
+** movprfx z0\.h, p0/z, z1\.h
+** bfscale z0\.h, p0/m, z0\.h, z4\.h
+** ret
+*/
+TEST_DUAL_Z (scale_bf16_z_untied, svbfloat16_t, svint16_t,
+ z0 = svscale_bf16_z (p0, z1, z4),
+ z0 = svscale_z (p0, z1, z4))
+
+/*
+** scale_w0_bf16_z_tied1:
+** mov (z[0-9]+\.h), w0
+** movprfx z0\.h, p0/z, z0\.h
+** bfscale z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZX (scale_w0_bf16_z_tied1, svbfloat16_t, int16_t,
+ z0 = svscale_n_bf16_z (p0, z0, x0),
+ z0 = svscale_z (p0, z0, x0))
+
+/*
+** scale_w0_bf16_z_untied:
+** mov (z[0-9]+\.h), w0
+** movprfx z0\.h, p0/z, z1\.h
+** bfscale z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZX (scale_w0_bf16_z_untied, svbfloat16_t, int16_t,
+ z0 = svscale_n_bf16_z (p0, z1, x0),
+ z0 = svscale_z (p0, z1, x0))
+
+/*
+** scale_3_bf16_z_tied1:
+** mov (z[0-9]+\.h), #3
+** movprfx z0\.h, p0/z, z0\.h
+** bfscale z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (scale_3_bf16_z_tied1, svbfloat16_t,
+ z0 = svscale_n_bf16_z (p0, z0, 3),
+ z0 = svscale_z (p0, z0, 3))
+
+/*
+** scale_3_bf16_z_untied:
+** mov (z[0-9]+\.h), #3
+** movprfx z0\.h, p0/z, z1\.h
+** bfscale z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (scale_3_bf16_z_untied, svbfloat16_t,
+ z0 = svscale_n_bf16_z (p0, z1, 3),
+ z0 = svscale_z (p0, z1, 3))
+
+/*
+** scale_m3_bf16_z:
+** mov (z[0-9]+\.h), #-3
+** movprfx z0\.h, p0/z, z0\.h
+** bfscale z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (scale_m3_bf16_z, svbfloat16_t,
+ z0 = svscale_n_bf16_z (p0, z0, -3),
+ z0 = svscale_z (p0, z0, -3))
+
+/*
+** scale_bf16_x_tied1:
+** bfscale z0\.h, p0/m, z0\.h, z4\.h
+** ret
+*/
+TEST_DUAL_Z (scale_bf16_x_tied1, svbfloat16_t, svint16_t,
+ z0 = svscale_bf16_x (p0, z0, z4),
+ z0 = svscale_x (p0, z0, z4))
+
+/*
+** scale_bf16_x_tied2:
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z4
+** bfscale z0\.h, p0/m, z0\.h, \1\.h
+** ret
+*/
+TEST_DUAL_Z_REV (scale_bf16_x_tied2, svbfloat16_t, svint16_t,
+ z0_res = svscale_bf16_x (p0, z4, z0),
+ z0_res = svscale_x (p0, z4, z0))
+
+/*
+** scale_bf16_x_untied:
+** movprfx z0, z1
+** bfscale z0\.h, p0/m, z0\.h, z4\.h
+** ret
+*/
+TEST_DUAL_Z (scale_bf16_x_untied, svbfloat16_t, svint16_t,
+ z0 = svscale_bf16_x (p0, z1, z4),
+ z0 = svscale_x (p0, z1, z4))
+
+/*
+** scale_w0_bf16_x_tied1:
+** mov (z[0-9]+\.h), w0
+** bfscale z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZX (scale_w0_bf16_x_tied1, svbfloat16_t, int16_t,
+ z0 = svscale_n_bf16_x (p0, z0, x0),
+ z0 = svscale_x (p0, z0, x0))
+
+/*
+** scale_w0_bf16_x_untied:
+** mov (z[0-9]+\.h), w0
+** movprfx z0, z1
+** bfscale z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZX (scale_w0_bf16_x_untied, svbfloat16_t, int16_t,
+ z0 = svscale_n_bf16_x (p0, z1, x0),
+ z0 = svscale_x (p0, z1, x0))
+
+/*
+** scale_3_bf16_x_tied1:
+** mov (z[0-9]+\.h), #3
+** bfscale z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (scale_3_bf16_x_tied1, svbfloat16_t,
+ z0 = svscale_n_bf16_x (p0, z0, 3),
+ z0 = svscale_x (p0, z0, 3))
+
+/*
+** scale_3_bf16_x_untied:
+** mov (z[0-9]+\.h), #3
+** movprfx z0, z1
+** bfscale z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (scale_3_bf16_x_untied, svbfloat16_t,
+ z0 = svscale_n_bf16_x (p0, z1, 3),
+ z0 = svscale_x (p0, z1, 3))
+
+/*
+** scale_m3_bf16_x:
+** mov (z[0-9]+\.h), #-3
+** bfscale z0\.h, p0/m, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (scale_m3_bf16_x, svbfloat16_t,
+ z0 = svscale_n_bf16_x (p0, z0, -3),
+ z0 = svscale_x (p0, z0, -3))
+
+/*
+** ptrue_scale_bf16_x_tied1:
+** ...
+** ptrue p[0-9]+\.b[^\n]*
+** ...
+** ret
+*/
+TEST_DUAL_Z (ptrue_scale_bf16_x_tied1, svbfloat16_t, svint16_t,
+ z0 = svscale_bf16_x (svptrue_b16 (), z0, z4),
+ z0 = svscale_x (svptrue_b16 (), z0, z4))
+
+/*
+** ptrue_scale_bf16_x_tied2:
+** ...
+** ptrue p[0-9]+\.b[^\n]*
+** ...
+** ret
+*/
+TEST_DUAL_Z_REV (ptrue_scale_bf16_x_tied2, svbfloat16_t, svint16_t,
+ z0_res = svscale_bf16_x (svptrue_b16 (), z4, z0),
+ z0_res = svscale_x (svptrue_b16 (), z4, z0))
+
+/*
+** ptrue_scale_bf16_x_untied:
+** ...
+** ptrue p[0-9]+\.b[^\n]*
+** ...
+** ret
+*/
+TEST_DUAL_Z (ptrue_scale_bf16_x_untied, svbfloat16_t, svint16_t,
+ z0 = svscale_bf16_x (svptrue_b16 (), z1, z4),
+ z0 = svscale_x (svptrue_b16 (), z1, z4))
+
+/*
+** ptrue_scale_3_bf16_x_tied1:
+** ...
+** ptrue p[0-9]+\.b[^\n]*
+** ...
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_scale_3_bf16_x_tied1, svbfloat16_t,
+ z0 = svscale_n_bf16_x (svptrue_b16 (), z0, 3),
+ z0 = svscale_x (svptrue_b16 (), z0, 3))
+
+/*
+** ptrue_scale_3_bf16_x_untied:
+** ...
+** ptrue p[0-9]+\.b[^\n]*
+** ...
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_scale_3_bf16_x_untied, svbfloat16_t,
+ z0 = svscale_n_bf16_x (svptrue_b16 (), z1, 3),
+ z0 = svscale_x (svptrue_b16 (), z1, 3))
+
+/*
+** ptrue_scale_m3_bf16_x_tied1:
+** ...
+** ptrue p[0-9]+\.b[^\n]*
+** ...
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_scale_m3_bf16_x_tied1, svbfloat16_t,
+ z0 = svscale_n_bf16_x (svptrue_b16 (), z0, -3),
+ z0 = svscale_x (svptrue_b16 (), z0, -3))
+
+/*
+** ptrue_scale_m3_bf16_x_untied:
+** ...
+** ptrue p[0-9]+\.b[^\n]*
+** ...
+** ret
+*/
+TEST_UNIFORM_Z (ptrue_scale_m3_bf16_x_untied, svbfloat16_t,
+ z0 = svscale_n_bf16_x (svptrue_b16 (), z1, -3),
+ z0 = svscale_x (svptrue_b16 (), z1, -3))
+
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/bfscale.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/bfscale.c
new file mode 100644
index 00000000000..051ff47b3bc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general-c/bfscale.c
@@ -0,0 +1,114 @@
+// { dg-options "-std=c23 -fsyntax-only" }
+// { dg-do compile }
+
+#pragma GCC target "+sve,+sve2,+sme,+sme2,+sve-bfscale"
+static_assert (__ARM_FEATURE_SVE2 == 1);
+static_assert (__ARM_FEATURE_SME2 == 1);
+static_assert (__ARM_FEATURE_SVE_BFSCALE == 1);
+#include <arm_sve.h>
+#include <arm_sme.h>
+
+/*
+- BFSCALE (predicated)
+ // Only if __ARM_FEATURE_SVE_BFSCALE != 0 && __ARM_FEATURE_SVE2 != 0
+ svbfloat16_t svscale[_bf16]_m (svbool_t pg, svbfloat16_t zdn, svint16_t zm);
+ svbfloat16_t svscale[_bf16]_x (svbool_t pg, svbfloat16_t zdn, svint16_t zm);
+ svbfloat16_t svscale[_bf16]_z (svbool_t pg, svbfloat16_t zdn, svint16_t zm);
+ svbfloat16_t svscale[_n_bf16]_m (svbool_t pg, svbfloat16_t zdn, int16_t zm);
+ svbfloat16_t svscale[_n_bf16]_x (svbool_t pg, svbfloat16_t zdn, int16_t zm);
+ svbfloat16_t svscale[_n_bf16]_z (svbool_t pg, svbfloat16_t zdn, int16_t zm); */
+
+void
+svscale_predicated_explicit_ok (svbool_t p, svbfloat16_t bf16x1,
+ svint16_t i16x1, int16_t i16)
+{
+ bf16x1 = svscale_bf16_m (p, bf16x1, i16x1);
+ bf16x1 = svscale_bf16_x (p, bf16x1, i16x1);
+ bf16x1 = svscale_bf16_z (p, bf16x1, i16x1);
+
+ bf16x1 = svscale_n_bf16_m (p, bf16x1, i16);
+ bf16x1 = svscale_n_bf16_x (p, bf16x1, i16);
+ bf16x1 = svscale_n_bf16_z (p, bf16x1, i16);
+}
+
+void
+svscale_predicated_inferred_ok (svbool_t p, svbfloat16_t bf16x1,
+ svbfloat16x4_t bf16x4, svint16_t i16x1,
+ int16_t i16)
+{
+ bf16x1 = svscale_m (p, bf16x1, i16x1);
+ bf16x1 = svscale_x (p, bf16x1, i16x1);
+ bf16x1 = svscale_z (p, bf16x1, i16x1);
+
+ bf16x1 = svscale_m (p, bf16x1, i16);
+ bf16x1 = svscale_x (p, bf16x1, i16);
+ bf16x1 = svscale_z (p, bf16x1, i16);
+}
+
+/*
+- BFSCALE (multiple vectors)
+ // Only if __ARM_FEATURE_SVE_BFSCALE != 0 && __ARM_FEATURE_SME2 != 0
+  svbfloat16x2_t svscale[_bf16_x2] (svbfloat16x2_t zdn, svint16x2_t zm) __arm_streaming;
+  svbfloat16x4_t svscale[_bf16_x4] (svbfloat16x4_t zdn, svint16x4_t zm) __arm_streaming;
+
+- BFSCALE (multiple and single vector)
+ // Only if __ARM_FEATURE_SVE_BFSCALE != 0 && __ARM_FEATURE_SME2 != 0
+  svbfloat16x2_t svscale[_single_bf16_x2] (svbfloat16x2_t zn, svint16_t zm) __arm_streaming;
+  svbfloat16x4_t svscale[_single_bf16_x4] (svbfloat16x4_t zn, svint16_t zm) __arm_streaming; */
+
+void
+svscale_explicit_ok (svbfloat16_t bf16x1, svbfloat16x2_t bf16x2,
+ svbfloat16x4_t bf16x4, svint16_t i16x1, svint16x2_t i16x2,
+ svint16x4_t i16x4) __arm_streaming
+{
+ bf16x2 = svscale_bf16_x2 (bf16x2, i16x2);
+ bf16x4 = svscale_bf16_x4 (bf16x4, i16x4);
+
+ bf16x2 = svscale_single_bf16_x2 (bf16x2, i16x1);
+ bf16x4 = svscale_single_bf16_x4 (bf16x4, i16x1);
+}
+
+void
+svscale_inferred_ok (svbfloat16x2_t bf16x2, svbfloat16x4_t bf16x4,
+ svint16_t i16x1, svint16x2_t i16x2,
+ svint16x4_t i16x4) __arm_streaming
+{
+ bf16x2 = svscale_bf16_x2 (bf16x2, i16x2);
+ bf16x4 = svscale_bf16_x4 (bf16x4, i16x4);
+
+ bf16x2 = svscale_single_bf16_x2 (bf16x2, i16x1);
+ bf16x4 = svscale_single_bf16_x4 (bf16x4, i16x1);
+}
+
+/*
+- BFMUL (multiple vectors)
+ // Only if __ARM_FEATURE_SVE_BFSCALE != 0 && __ARM_FEATURE_SME2 != 0
+  svbfloat16x2_t svmul[_bf16_x2] (svbfloat16x2_t zdn, svbfloat16x2_t zm) __arm_streaming;
+  svbfloat16x4_t svmul[_bf16_x4] (svbfloat16x4_t zdn, svbfloat16x4_t zm) __arm_streaming;
+
+- BFMUL (multiple and single vector)
+ // Only if __ARM_FEATURE_SVE_BFSCALE != 0 && __ARM_FEATURE_SME2 != 0
+  svbfloat16x2_t svmul[_single_bf16_x2] (svbfloat16x2_t zn, svbfloat16_t zm) __arm_streaming;
+  svbfloat16x4_t svmul[_single_bf16_x4] (svbfloat16x4_t zn, svbfloat16_t zm) __arm_streaming; */
+
+void
+svmul_explicit_ok (svbfloat16_t bf16x1, svbfloat16x2_t bf16x2,
+ svbfloat16x4_t bf16x4) __arm_streaming
+{
+ svmul_bf16_x2 (bf16x2, bf16x2);
+ svmul_bf16_x4 (bf16x4, bf16x4);
+
+ svmul_single_bf16_x2 (bf16x2, bf16x1);
+ svmul_single_bf16_x4 (bf16x4, bf16x1);
+}
+
+void
+svmul_inferred_ok (svbfloat16_t bf16x1, svbfloat16x2_t bf16x2,
+ svbfloat16x4_t bf16x4) __arm_streaming
+{
+ svmul (bf16x2, bf16x2);
+ svmul (bf16x4, bf16x4);
+
+ svmul (bf16x2, bf16x1);
+ svmul (bf16x4, bf16x1);
+}
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 2b450669c3d..87c21664422 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -12682,7 +12682,7 @@ set exts {
set exts_sve2 {
"sme-f8f16" "sme-f8f32"
"sme-b16b16" "sme-f16f16" "sme-i16i64" "sme" "sme2" "sme2p1"
- "ssve-fp8dot2" "ssve-fp8dot4" "ssve-fp8fma"
+ "ssve-fp8dot2" "ssve-fp8dot4" "ssve-fp8fma" "sve-bfscale"
}
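Adding "sve-bfscale" here makes target-supports.exp generate the
aarch64_asm_sve-bfscale_ok effective-target check that the new tests in this
series key their dg-do selection on, for example:

/* { dg-do assemble { target aarch64_asm_sve-bfscale_ok } } */
/* { dg-do compile { target { ! aarch64_asm_sve-bfscale_ok } } } */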
foreach { aarch64_ext } $exts {
--
2.43.0