Sure, here it is. I'll do that for the other patch too.
Thanks,
Delia
On 1/31/20 3:37 PM, Richard Sandiford wrote:
> Delia Burduv <[email protected]> writes:
>> Thank you, Richard!
>>
>> Here is the updated patch. The test that checks for errors when bf16 is
>> disabled is in the bfcvt patch.
>
> Looks good. Just a couple of very minor things...
>
>>
>> Cheers,
>> Delia
>>
>> gcc/ChangeLog:
>>
>> 2019-11-06 Delia Burduv <[email protected]>
>>
>> * config/aarch64/aarch64-simd-builtins.def
>> (bfcvtn): New built-in function.
>> (bfcvtn_q): New built-in function.
>> (bfcvtn2): New built-in function.
>> (bfcvt): New built-in function.
>> * config/aarch64/aarch64-simd.md
>> (aarch64_bfcvtn<q><mode>): New pattern.
>> (aarch64_bfcvtn2v8bf): New pattern.
>> (aarch64_bfcvtbf): New pattern.
>> * config/aarch64/arm_bf16.h (float32_t): New typedef.
>> (vcvth_bf16_f32): New intrinsic.
>> * config/aarch64/arm_bf16.h (vcvt_bf16_f32): New intrinsic.
>> (vcvtq_low_bf16_f32): New intrinsic.
>> (vcvtq_high_bf16_f32): New intrinsic.
>> * config/aarch64/iterators.md (V4SF_TO_BF): New mode iterator.
>> (UNSPEC_BFCVTN): New UNSPEC.
>> (UNSPEC_BFCVTN2): New UNSPEC.
>> (UNSPEC_BFCVT): New UNSPEC.
>> * config/arm/types.md (bf_cvt): New type.
>
> The patch no longer changes types.md. :-)
>
>> diff --git
>> a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfmlalbt-compile.c
>> b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfmlalbt-compile.c
>> new file mode 100644
>> index
>> 0000000000000000000000000000000000000000..9feb7ee7905cb14037427a36797fc67a6fa3fbc8
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfmlalbt-compile.c
>> @@ -0,0 +1,67 @@
>> +/* { dg-do assemble { target { aarch64*-*-* } } } */
>> +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
>> +/* { dg-add-options arm_v8_2a_bf16_neon } */
>> +/* { dg-additional-options "-save-temps" } */
>> +/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
>> +
>> +#include <arm_neon.h>
>> +
>> +/*
>> +**test_bfmlalb:
>> +** bfmlalb\tv0.4s, v1.8h, v2.8h
>
> This version uses \t while the previous one used literal tabs.
> TBH I think the literal tab is nicer (and what we use for SVE FWIW).
>
> OK with those changes, thanks. Seems silly to ask when the changes
> are so trivial, but: please could you post an updated patch so that
> I can apply verbatim?
>
> Richard
>
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index a118f4f121de067c0a80f691b852247b0ab27f7a..02b2154cf64dad02cf57b110af51b19dd7f91c51 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -692,3 +692,14 @@
VAR2 (TERNOP, bfdot, 0, v2sf, v4sf)
VAR2 (QUADOP_LANE_PAIR, bfdot_lane, 0, v2sf, v4sf)
VAR2 (QUADOP_LANE_PAIR, bfdot_laneq, 0, v2sf, v4sf)
+
+ /* Implemented by aarch64_bfmmlaqv4sf */
+ VAR1 (TERNOP, bfmmlaq, 0, v4sf)
+
+ /* Implemented by aarch64_bfmlal<bt>{_lane{q}}v4sf */
+ VAR1 (TERNOP, bfmlalb, 0, v4sf)
+ VAR1 (TERNOP, bfmlalt, 0, v4sf)
+ VAR1 (QUADOP_LANE, bfmlalb_lane, 0, v4sf)
+ VAR1 (QUADOP_LANE, bfmlalt_lane, 0, v4sf)
+ VAR1 (QUADOP_LANE, bfmlalb_lane_q, 0, v4sf)
+ VAR1 (QUADOP_LANE, bfmlalt_lane_q, 0, v4sf)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 97f46f96968a6bc2f93bbc812931537b819b3b19..6ba72d7dc82ed02b5b5001a13ca896ab245a9d41 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -7091,3 +7091,42 @@
}
[(set_attr "type" "neon_dot<VDQSF:q>")]
)
+
+;; bfmmla
+(define_insn "aarch64_bfmmlaqv4sf"
+ [(set (match_operand:V4SF 0 "register_operand" "=w")
+ (plus:V4SF (match_operand:V4SF 1 "register_operand" "0")
+ (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+ (match_operand:V8BF 3 "register_operand" "w")]
+ UNSPEC_BFMMLA)))]
+ "TARGET_BF16_SIMD"
+ "bfmmla\\t%0.4s, %2.8h, %3.8h"
+ [(set_attr "type" "neon_fp_mla_s_q")]
+)
+
+;; bfmlal<bt>
+(define_insn "aarch64_bfmlal<bt>v4sf"
+ [(set (match_operand:V4SF 0 "register_operand" "=w")
+ (plus: V4SF (match_operand:V4SF 1 "register_operand" "0")
+ (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+ (match_operand:V8BF 3 "register_operand" "w")]
+ BF_MLA)))]
+ "TARGET_BF16_SIMD"
+ "bfmlal<bt>\\t%0.4s, %2.8h, %3.8h"
+ [(set_attr "type" "neon_fp_mla_s_q")]
+)
+
+(define_insn "aarch64_bfmlal<bt>_lane<q>v4sf"
+ [(set (match_operand:V4SF 0 "register_operand" "=w")
+ (plus: V4SF (match_operand:V4SF 1 "register_operand" "0")
+ (unspec:V4SF [(match_operand:V8BF 2 "register_operand" "w")
+ (match_operand:VBF 3 "register_operand" "w")
+ (match_operand:SI 4 "const_int_operand" "n")]
+ BF_MLA)))]
+ "TARGET_BF16_SIMD"
+{
+ operands[4] = aarch64_endian_lane_rtx (<MODE>mode, INTVAL (operands[4]));
+ return "bfmlal<bt>\\t%0.4s, %2.8h, %3.h[%4]";
+}
+ [(set_attr "type" "neon_fp_mla_s_scalar_q")]
+)
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 7f05c3f9eca844b0e7b824a191223a4906c825b1..db845a3d2d204d28f0e62fa61927e01dcb15f4a4 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -34660,6 +34660,60 @@ vbfdotq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
return __builtin_aarch64_bfdot_laneqv4sf (__r, __a, __b, __index);
}
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmmlaq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+
+{
+ return __builtin_aarch64_bfmmlaqv4sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlalbq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+ return __builtin_aarch64_bfmlalbv4sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlaltq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+ return __builtin_aarch64_bfmlaltv4sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlalbq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
+ const int __index)
+{
+ return __builtin_aarch64_bfmlalb_lanev4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlaltq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
+ const int __index)
+{
+ return __builtin_aarch64_bfmlalt_lanev4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlalbq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
+ const int __index)
+{
+ return __builtin_aarch64_bfmlalb_lane_qv4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlaltq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
+ const int __index)
+{
+ return __builtin_aarch64_bfmlalt_lane_qv4sf (__r, __a, __b, __index);
+}
+
#pragma GCC pop_options
/* AdvSIMD 8-bit Integer Matrix Multiply (I8MM) intrinsics. */
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index fc973086cb91ae0dc54eeeb0b832d522539d7982..a32b21c639c2fe7ce6e432901fb293f196cbfff0 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -808,6 +808,9 @@
UNSPEC_USDOT ; Used in aarch64-simd.md.
UNSPEC_SUDOT ; Used in aarch64-simd.md.
UNSPEC_BFDOT ; Used in aarch64-simd.md.
+ UNSPEC_BFMMLA ; Used in aarch64-simd.md.
+ UNSPEC_BFMLALB ; Used in aarch64-simd.md.
+ UNSPEC_BFMLALT ; Used in aarch64-simd.md.
])
;; ------------------------------------------------------------------
@@ -2553,6 +2556,9 @@
(define_int_iterator SVE_PITER [UNSPEC_PFIRST UNSPEC_PNEXT])
+(define_int_iterator BF_MLA [UNSPEC_BFMLALB
+ UNSPEC_BFMLALT])
+
;; Iterators for atomic operations.
(define_int_iterator ATOMIC_LDOP
@@ -2793,6 +2799,8 @@
(define_int_attr ab [(UNSPEC_CLASTA "a") (UNSPEC_CLASTB "b")
(UNSPEC_LASTA "a") (UNSPEC_LASTB "b")])
+(define_int_attr bt [(UNSPEC_BFMLALB "b") (UNSPEC_BFMLALT "t")])
+
(define_int_attr addsub [(UNSPEC_SHADD "add")
(UNSPEC_UHADD "add")
(UNSPEC_SRHADD "add")
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfmlalbt-compile.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfmlalbt-compile.c
new file mode 100644
index 0000000000000000000000000000000000000000..9810e4ba37444fe08425c1cceae086860d962453
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfmlalbt-compile.c
@@ -0,0 +1,67 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+/* { dg-additional-options "-save-temps" } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include <arm_neon.h>
+
+/*
+**test_bfmlalb:
+** bfmlalb v0.4s, v1.8h, v2.8h
+** ret
+*/
+float32x4_t test_bfmlalb (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+ return vbfmlalbq_f32 (r, a, b);
+}
+
+/*
+**test_bfmlalt:
+** bfmlalt v0.4s, v1.8h, v2.8h
+** ret
+*/
+float32x4_t test_bfmlalt (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+ return vbfmlaltq_f32 (r, a, b);
+}
+
+/*
+**test_bfmlalb_lane:
+** bfmlalb v0.4s, v1.8h, v2.h[0]
+** ret
+*/
+float32x4_t test_bfmlalb_lane (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+ return vbfmlalbq_lane_f32 (r, a, b, 0);
+}
+
+/*
+**test_bfmlalt_lane:
+** bfmlalt v0.4s, v1.8h, v2.h[2]
+** ret
+*/
+float32x4_t test_bfmlalt_lane (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+ return vbfmlaltq_lane_f32 (r, a, b, 2);
+}
+
+/*
+**test_bfmlalb_laneq:
+** bfmlalb v0.4s, v1.8h, v2.h[4]
+** ret
+*/
+float32x4_t test_bfmlalb_laneq (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+ return vbfmlalbq_laneq_f32 (r, a, b, 4);
+}
+
+/*
+**test_bfmlalt_laneq:
+** bfmlalt v0.4s, v1.8h, v2.h[7]
+** ret
+*/
+float32x4_t test_bfmlalt_laneq (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+ return vbfmlaltq_laneq_f32 (r, a, b, 7);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfmmla-compile.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfmmla-compile.c
new file mode 100644
index 0000000000000000000000000000000000000000..0aaa69f0037fb5ed5c085e76ee0c7eb61e5e8090
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bfmmla-compile.c
@@ -0,0 +1,18 @@
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+/* { dg-additional-options "-save-temps" } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include <arm_neon.h>
+
+
+/*
+**test_bfmmla:
+** bfmmla v0.4s, v1.8h, v2.8h
+** ret
+*/
+float32x4_t test_bfmmla (float32x4_t r, bfloat16x8_t x, bfloat16x8_t y)
+{
+ return vbfmmlaq_f32 (r, x, y);
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vbfmlalbt_lane_f32_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vbfmlalbt_lane_f32_indices_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..4d50ba3a3814cb6fe8a768bdf6e13a4207cf585a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vbfmlalbt_lane_f32_indices_1.c
@@ -0,0 +1,46 @@
+/* { dg-do compile { target { aarch64*-*-* } } } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+
+#include <arm_neon.h>
+
+void
+f_vbfmlaltq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+ /* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */
+ vbfmlaltq_lane_f32 (r, a, b, -1);
+ /* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 0 } */
+ vbfmlaltq_lane_f32 (r, a, b, 4);
+ return;
+}
+
+void
+f_vbfmlaltq_laneq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+ /* { dg-error "lane -1 out of range 0 - 7" "" { target *-*-* } 0 } */
+ vbfmlaltq_laneq_f32 (r, a, b, -1);
+ /* { dg-error "lane 8 out of range 0 - 7" "" { target *-*-* } 0 } */
+ vbfmlaltq_laneq_f32 (r, a, b, 8);
+ return;
+}
+
+void
+f_vbfmlalbq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+ /* { dg-error "lane -2 out of range 0 - 3" "" { target *-*-* } 0 } */
+ vbfmlalbq_lane_f32 (r, a, b, -2);
+ /* { dg-error "lane 5 out of range 0 - 3" "" { target *-*-* } 0 } */
+ vbfmlalbq_lane_f32 (r, a, b, 5);
+ return;
+}
+
+void
+f_vbfmlalbq_laneq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+ /* { dg-error "lane -2 out of range 0 - 7" "" { target *-*-* } 0 } */
+ vbfmlalbq_laneq_f32 (r, a, b, -2);
+ /* { dg-error "lane 9 out of range 0 - 7" "" { target *-*-* } 0 } */
+ vbfmlalbq_laneq_f32 (r, a, b, 9);
+ return;
+}