[PATCH] D80928: [BFloat] Add convert/copy intrinsic support
This revision was automatically updated to reflect the committed changes. Closed by commit rG3f353a2e5a98: [BFloat] Add convert/copy instrinsic support (authored by miyuki). Changed prior to commit: https://reviews.llvm.org/D80928?vs=272406=272731#toc Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D80928/new/ https://reviews.llvm.org/D80928 Files: clang/include/clang/Basic/arm_neon.td clang/lib/CodeGen/CGBuiltin.cpp clang/test/CodeGen/aarch64-bf16-lane-intrinsics.c clang/test/CodeGen/arm-bf16-convert-intrinsics.c clang/test/Sema/aarch64-neon-bf16-ranges.c clang/utils/TableGen/NeonEmitter.cpp llvm/include/llvm/IR/IntrinsicsAArch64.td llvm/include/llvm/IR/IntrinsicsARM.td llvm/lib/Target/AArch64/AArch64InstrFormats.td llvm/lib/Target/AArch64/AArch64InstrInfo.td llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp llvm/test/CodeGen/AArch64/bf16-convert-intrinsics.ll llvm/test/CodeGen/AArch64/bf16-vector-shuffle.ll llvm/test/CodeGen/ARM/bf16-convert-intrinsics.ll Index: llvm/test/CodeGen/ARM/bf16-convert-intrinsics.ll === --- /dev/null +++ llvm/test/CodeGen/ARM/bf16-convert-intrinsics.ll @@ -0,0 +1,56 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -verify-machineinstrs -mtriple=armv8.6a-arm-none-eabi -mattr=+neon,+bf16,+fullfp16 | FileCheck %s + +declare bfloat @llvm.arm.neon.vcvtbfp2bf(float) + +; Hard float ABI +declare <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float>) + +define arm_aapcs_vfpcc <4 x bfloat> @test_vcvt_bf16_f32_hardfp(<4 x float> %a) { +; CHECK-LABEL: test_vcvt_bf16_f32_hardfp: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT:vcvt.bf16.f32 d0, q0 +; CHECK-NEXT:bx lr +entry: + %vcvtfp2bf1.i.i = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> %a) + ret <4 x bfloat> %vcvtfp2bf1.i.i +} + +define arm_aapcs_vfpcc bfloat @test_vcvth_bf16_f32_hardfp(float %a) { +; CHECK-LABEL: test_vcvth_bf16_f32_hardfp: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT:vcvtb.bf16.f32 s0, s0 
+; CHECK-NEXT:bx lr +entry: + %vcvtbfp2bf.i = call bfloat @llvm.arm.neon.vcvtbfp2bf(float %a) + ret bfloat %vcvtbfp2bf.i +} + +; Soft float ABI +declare <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float>) + +define <2 x i32> @test_vcvt_bf16_f32_softfp(<4 x float> %a) { +; CHECK-LABEL: test_vcvt_bf16_f32_softfp: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT:vmov d17, r2, r3 +; CHECK-NEXT:vmov d16, r0, r1 +; CHECK-NEXT:vcvt.bf16.f32 d16, q8 +; CHECK-NEXT:vmov r0, r1, d16 +; CHECK-NEXT:bx lr +entry: + %vcvtfp2bf1.i.i = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> %a) + %.cast = bitcast <4 x i16> %vcvtfp2bf1.i.i to <2 x i32> + ret <2 x i32> %.cast +} + +define bfloat @test_vcvth_bf16_f32_softfp(float %a) #1 { +; CHECK-LABEL: test_vcvth_bf16_f32_softfp: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT:vmov s0, r0 +; CHECK-NEXT:vcvtb.bf16.f32 s0, s0 +; CHECK-NEXT:vmov r0, s0 +; CHECK-NEXT:bx lr +entry: + %vcvtbfp2bf.i = call bfloat @llvm.arm.neon.vcvtbfp2bf(float %a) #3 + ret bfloat %vcvtbfp2bf.i +} Index: llvm/test/CodeGen/AArch64/bf16-vector-shuffle.ll === --- llvm/test/CodeGen/AArch64/bf16-vector-shuffle.ll +++ llvm/test/CodeGen/AArch64/bf16-vector-shuffle.ll @@ -163,3 +163,87 @@ %vgetq_lane = extractelement <8 x bfloat> %v, i32 7 ret bfloat %vgetq_lane } + +; vcopy_lane_bf16(a, 1, b, 3); +define <4 x bfloat> @test_vcopy_lane_bf16_v1(<4 x bfloat> %a, <4 x bfloat> %b) nounwind { +; CHECK-LABEL: test_vcopy_lane_bf16_v1: +; CHECK-NEXT:mov v0.h[1], v1.h[3] +; CHECK-NEXT:ret +entry: + %vset_lane = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> + ret <4 x bfloat> %vset_lane +} + +; vcopy_lane_bf16(a, 2, b, 0); +define <4 x bfloat> @test_vcopy_lane_bf16_v2(<4 x bfloat> %a, <4 x bfloat> %b) nounwind { +; CHECK-LABEL: test_vcopy_lane_bf16_v2: +; CHECK-NEXT:mov v0.h[2], v1.h[0] +; CHECK-NEXT:ret +entry: + %vset_lane = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> + ret <4 x bfloat> %vset_lane +} + +; vcopyq_lane_bf16(a, 0, b, 2); +define <8 x bfloat> 
@test_vcopyq_lane_bf16_v1(<8 x bfloat> %a, <4 x bfloat> %b) nounwind { +; CHECK-LABEL: test_vcopyq_lane_bf16_v1: +; CHECK-NEXT:mov v0.h[0], v1.h[2] +; CHECK-NEXT:ret +entry: + %0 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> + %vset_lane = shufflevector <8 x bfloat> %a, <8 x bfloat> %0, <8 x i32> + ret <8 x bfloat> %vset_lane +} + +; vcopyq_lane_bf16(a, 6, b, 0); +define <8 x bfloat> @test_vcopyq_lane_bf16_v2(<8 x bfloat> %a, <4 x bfloat> %b) nounwind { +; CHECK-LABEL: test_vcopyq_lane_bf16_v2: +; CHECK-NEXT:mov v0.h[6], v1.h[0] +; CHECK-NEXT:ret +entry: + %0 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> + %vset_lane = shufflevector <8 x bfloat> %a,
[PATCH] D80928: [BFloat] Add convert/copy intrinsic support
stuij accepted this revision. stuij added a comment. This revision is now accepted and ready to land. LGTM CHANGES SINCE LAST ACTION https://reviews.llvm.org/D80928/new/ https://reviews.llvm.org/D80928 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D80928: [BFloat] Add convert/copy intrinsic support
miyuki marked 5 inline comments as done. miyuki added inline comments. Comment at: clang/utils/TableGen/NeonEmitter.cpp:1066 if (Name == "vcvt_f16_f32" || Name == "vcvt_f32_f16" || + Name == "vcvt_f32_f64" || Name == "vcvt_f64_f32" || LukeGeeson wrote: > I always wondered why we need special treatment for cvt intrinsics here, is > there a better place to put this? If we were to refactor this, it should be done in a separate patch. CHANGES SINCE LAST ACTION https://reviews.llvm.org/D80928/new/ https://reviews.llvm.org/D80928 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D80928: [BFloat] Add convert/copy intrinsic support
miyuki updated this revision to Diff 272406. miyuki added a comment. 1. Rebased and fixed failures 2. Added a test for AArch64 codegen of lane copying intrinsics 3. Addressed reviewers' comments CHANGES SINCE LAST ACTION https://reviews.llvm.org/D80928/new/ https://reviews.llvm.org/D80928 Files: clang/include/clang/Basic/arm_neon.td clang/lib/CodeGen/CGBuiltin.cpp clang/test/CodeGen/aarch64-bf16-lane-intrinsics.c clang/test/CodeGen/arm-bf16-convert-intrinsics.c clang/test/Sema/aarch64-neon-bf16-ranges.c clang/utils/TableGen/NeonEmitter.cpp llvm/include/llvm/IR/IntrinsicsAArch64.td llvm/include/llvm/IR/IntrinsicsARM.td llvm/lib/Target/AArch64/AArch64InstrFormats.td llvm/lib/Target/AArch64/AArch64InstrInfo.td llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp llvm/test/CodeGen/AArch64/bf16-convert-intrinsics.ll llvm/test/CodeGen/AArch64/bf16-vector-shuffle.ll llvm/test/CodeGen/ARM/bf16-convert-intrinsics.ll Index: llvm/test/CodeGen/ARM/bf16-convert-intrinsics.ll === --- /dev/null +++ llvm/test/CodeGen/ARM/bf16-convert-intrinsics.ll @@ -0,0 +1,56 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -verify-machineinstrs -mtriple=armv8.6a-arm-none-eabi -mattr=+neon,+bf16,+fullfp16 | FileCheck %s + +declare bfloat @llvm.arm.neon.vcvtbfp2bf(float) + +; Hard float ABI +declare <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float>) + +define arm_aapcs_vfpcc <4 x bfloat> @test_vcvt_bf16_f32_hardfp(<4 x float> %a) { +; CHECK-LABEL: test_vcvt_bf16_f32_hardfp: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT:vcvt.bf16.f32 d0, q0 +; CHECK-NEXT:bx lr +entry: + %vcvtfp2bf1.i.i = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> %a) + ret <4 x bfloat> %vcvtfp2bf1.i.i +} + +define arm_aapcs_vfpcc bfloat @test_vcvth_bf16_f32_hardfp(float %a) { +; CHECK-LABEL: test_vcvth_bf16_f32_hardfp: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT:vcvtb.bf16.f32 s0, s0 +; CHECK-NEXT:bx lr +entry: + %vcvtbfp2bf.i = call bfloat @llvm.arm.neon.vcvtbfp2bf(float 
%a) + ret bfloat %vcvtbfp2bf.i +} + +; Soft float ABI +declare <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float>) + +define <2 x i32> @test_vcvt_bf16_f32_softfp(<4 x float> %a) { +; CHECK-LABEL: test_vcvt_bf16_f32_softfp: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT:vmov d17, r2, r3 +; CHECK-NEXT:vmov d16, r0, r1 +; CHECK-NEXT:vcvt.bf16.f32 d16, q8 +; CHECK-NEXT:vmov r0, r1, d16 +; CHECK-NEXT:bx lr +entry: + %vcvtfp2bf1.i.i = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> %a) + %.cast = bitcast <4 x i16> %vcvtfp2bf1.i.i to <2 x i32> + ret <2 x i32> %.cast +} + +define bfloat @test_vcvth_bf16_f32_softfp(float %a) #1 { +; CHECK-LABEL: test_vcvth_bf16_f32_softfp: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT:vmov s0, r0 +; CHECK-NEXT:vcvtb.bf16.f32 s0, s0 +; CHECK-NEXT:vmov r0, s0 +; CHECK-NEXT:bx lr +entry: + %vcvtbfp2bf.i = call bfloat @llvm.arm.neon.vcvtbfp2bf(float %a) #3 + ret bfloat %vcvtbfp2bf.i +} Index: llvm/test/CodeGen/AArch64/bf16-vector-shuffle.ll === --- llvm/test/CodeGen/AArch64/bf16-vector-shuffle.ll +++ llvm/test/CodeGen/AArch64/bf16-vector-shuffle.ll @@ -163,3 +163,87 @@ %vgetq_lane = extractelement <8 x bfloat> %v, i32 7 ret bfloat %vgetq_lane } + +; vcopy_lane_bf16(a, 1, b, 3); +define <4 x bfloat> @test_vcopy_lane_bf16_v1(<4 x bfloat> %a, <4 x bfloat> %b) nounwind { +; CHECK-LABEL: test_vcopy_lane_bf16_v1: +; CHECK-NEXT:mov v0.h[1], v1.h[3] +; CHECK-NEXT:ret +entry: + %vset_lane = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> + ret <4 x bfloat> %vset_lane +} + +; vcopy_lane_bf16(a, 2, b, 0); +define <4 x bfloat> @test_vcopy_lane_bf16_v2(<4 x bfloat> %a, <4 x bfloat> %b) nounwind { +; CHECK-LABEL: test_vcopy_lane_bf16_v2: +; CHECK-NEXT:mov v0.h[2], v1.h[0] +; CHECK-NEXT:ret +entry: + %vset_lane = shufflevector <4 x bfloat> %a, <4 x bfloat> %b, <4 x i32> + ret <4 x bfloat> %vset_lane +} + +; vcopyq_lane_bf16(a, 0, b, 2); +define <8 x bfloat> @test_vcopyq_lane_bf16_v1(<8 x bfloat> %a, <4 x bfloat> %b) nounwind { +; CHECK-LABEL: 
test_vcopyq_lane_bf16_v1: +; CHECK-NEXT:mov v0.h[0], v1.h[2] +; CHECK-NEXT:ret +entry: + %0 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> + %vset_lane = shufflevector <8 x bfloat> %a, <8 x bfloat> %0, <8 x i32> + ret <8 x bfloat> %vset_lane +} + +; vcopyq_lane_bf16(a, 6, b, 0); +define <8 x bfloat> @test_vcopyq_lane_bf16_v2(<8 x bfloat> %a, <4 x bfloat> %b) nounwind { +; CHECK-LABEL: test_vcopyq_lane_bf16_v2: +; CHECK-NEXT:mov v0.h[6], v1.h[0] +; CHECK-NEXT:ret +entry: + %0 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> + %vset_lane = shufflevector <8 x bfloat> %a, <8 x bfloat> %0, <8 x i32> + ret <8 x bfloat> %vset_lane +} + +; vcopy_laneq_bf16(a, 0, b, 7);
[PATCH] D80928: [BFloat] Add convert/copy intrinsic support
ostannard added inline comments. Comment at: llvm/include/llvm/IR/IntrinsicsARM.td:789 +def int_arm_neon_vcvtfp2bf +: Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>; I only see this being used for f32 -> bf16 conversion, so could this have concrete types, instead of llvm_anyvector_ty? Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D80928/new/ https://reviews.llvm.org/D80928 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D80928: [BFloat] Add convert/copy intrinsic support
miyuki added inline comments. Comment at: llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp:4675 + SDValue Ops[] = { Src, Pred, Reg0 }; + CurDAG->SelectNodeTo(N, ARM::BF16_VCVT, MVT::v4i16, Ops); + return; Why does it return `MVT::v4i16`, not `MVT::v4bf16`? Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D80928/new/ https://reviews.llvm.org/D80928 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D80928: [BFloat] Add convert/copy intrinsic support
LukeGeeson added a comment. A few nits, it looks largely uncontroversial but I'll let someone else give the ok since an external eye is always good :) Comment at: clang/utils/TableGen/NeonEmitter.cpp:1066 if (Name == "vcvt_f16_f32" || Name == "vcvt_f32_f16" || + Name == "vcvt_f32_f64" || Name == "vcvt_f64_f32" || I always wondered why we need special treatment for cvt intrinsics here, is there a better place to put this? Comment at: llvm/test/CodeGen/ARM/bf16-intrinsics-nofp16.ll:1 +; RUN: llc < %s -verify-machineinstrs -mtriple=armv8.6a-arm-none-eabi -mattr=+neon -mattr=+bf16 | FileCheck %s + These files are very small, it's almost not worth having them on their own, can you merge them into 1 file unless we need 2 for a specific reason Comment at: llvm/test/CodeGen/ARM/bf16-intrinsics.ll:1 +; RUN: llc < %s -verify-machineinstrs -mtriple=armv8.6a-arm-none-eabi -mattr=+fullfp16 -mattr=+neon -mattr=+bf16 | FileCheck %s + same here Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D80928/new/ https://reviews.llvm.org/D80928 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D80928: [BFloat] Add convert/copy intrinsic support
labrinea created this revision. labrinea added reviewers: fpetrogalli, LukeGeeson, stuij, momchil.velikov, SjoerdMeijer, miyuki. Herald added subscribers: hiraditya, kristof.beyls. Herald added projects: clang, LLVM. This patch is part of a series implementing the Bfloat16 extension of the Armv8.6-a architecture, as detailed here: https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/arm-architecture-developments-armv8-6-a Specifically it adds intrinsic support in clang and llvm for Arm and AArch64. The bfloat type, and its properties are specified in the Arm Architecture Reference Manual: https://developer.arm.com/docs/ddi0487/latest/arm-architecture-reference-manual-armv8-for-armv8-a-architecture-profile The following people contributed to this patch: - Alexandros Lamprineas - Luke Cheeseman - Mikhail Maltsev - Momchil Velikov Repository: rG LLVM Github Monorepo https://reviews.llvm.org/D80928 Files: clang/include/clang/Basic/arm_neon.td clang/lib/CodeGen/CGBuiltin.cpp clang/test/CodeGen/aarch64-bf16-lane-intrinsics.c clang/test/CodeGen/arm-bf16-conv-copy-intrinsics.c clang/test/Sema/aarch64-neon-bf16-ranges.c clang/utils/TableGen/NeonEmitter.cpp llvm/include/llvm/IR/IntrinsicsAArch64.td llvm/include/llvm/IR/IntrinsicsARM.td llvm/lib/Target/AArch64/AArch64InstrFormats.td llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp llvm/test/CodeGen/AArch64/bf16-intrinsics.ll llvm/test/CodeGen/ARM/bf16-intrinsics-nofp16.ll llvm/test/CodeGen/ARM/bf16-intrinsics.ll Index: llvm/test/CodeGen/ARM/bf16-intrinsics.ll === --- /dev/null +++ llvm/test/CodeGen/ARM/bf16-intrinsics.ll @@ -0,0 +1,24 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=armv8.6a-arm-none-eabi -mattr=+fullfp16 -mattr=+neon -mattr=+bf16 | FileCheck %s + +declare bfloat @llvm.arm.neon.vcvtbfp2bf.bf16.f32(float) +declare <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16.v4f32(<4 x float>) + +; CHECK-LABEL: test_vcvth_bf16_f32 +; CHECK: vcvtb.bf16.f32 s0, s0 +define arm_aapcs_vfpcc float 
@test_vcvth_bf16_f32(float %a) { +entry: + %vcvtbfp2bf.i = tail call bfloat @llvm.arm.neon.vcvtbfp2bf.bf16.f32(float %a) + %0 = bitcast bfloat %vcvtbfp2bf.i to i16 + %tmp.0.insert.ext.i = zext i16 %0 to i32 + %1 = bitcast i32 %tmp.0.insert.ext.i to float + ret float %1 +} + +; CHECK-LABEL: test_vcvt_bf16_f32 +; CHECK: vcvt.bf16.f32 d0, q0 +define arm_aapcs_vfpcc <4 x bfloat> @test_vcvt_bf16_f32(<4 x float> %a) { +entry: + %vcvtfp2bf1.i.i = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16.v4f32(<4 x float> %a) + ret <4 x bfloat> %vcvtfp2bf1.i.i +} + Index: llvm/test/CodeGen/ARM/bf16-intrinsics-nofp16.ll === --- /dev/null +++ llvm/test/CodeGen/ARM/bf16-intrinsics-nofp16.ll @@ -0,0 +1,23 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=armv8.6a-arm-none-eabi -mattr=+neon -mattr=+bf16 | FileCheck %s + +declare i32 @llvm.arm.neon.vcvtbfp2bf.i32.f32(float) +declare <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16.v4f32(<4 x float>) + +; CHECK-LABEL: test_vcvth_bf16_f32 +; CHECK: vcvtb.bf16.f32 s0, s0 +define arm_aapcs_vfpcc float @test_vcvth_bf16_f32(float %a) { +entry: + %vcvtbfp2bf = tail call i32 @llvm.arm.neon.vcvtbfp2bf.i32.f32(float %a) + %tmp.0.insert.ext = and i32 %vcvtbfp2bf, 65535 + %0 = bitcast i32 %tmp.0.insert.ext to float + ret float %0 +} + +; CHECK-LABEL: test_vcvt_bf16_f32 +; CHECK: vcvt.bf16.f32 d0, q0 +define arm_aapcs_vfpcc <2 x i32> @test_vcvt_bf16_f32(<4 x float> %a) { +entry: + %vcvtfp2bf1.i.i = tail call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16.v4f32(<4 x float> %a) + %0 = bitcast <4 x i16> %vcvtfp2bf1.i.i to <2 x i32> + ret <2 x i32> %0 +} Index: llvm/test/CodeGen/AArch64/bf16-intrinsics.ll === --- /dev/null +++ llvm/test/CodeGen/AArch64/bf16-intrinsics.ll @@ -0,0 +1,34 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-arm-none-eabi -mattr=+neon -mattr=+bf16 | FileCheck %s + +declare bfloat @llvm.aarch64.neon.bfcvt.f16.f32(float) +declare <8 x bfloat> @llvm.aarch64.neon.bfcvtn.v8f16.v4f32(<4 x float>) +declare <8 x bfloat> 
@llvm.aarch64.neon.bfcvtn2.v8f16.v8f16.v4f32(<8 x bfloat>, <4 x float>) + +; CHECK-LABEL: test_vcvth_bf16_f32 +; CHECK: bfcvt h0, s0 +; CHECK-NEXT: ret +define bfloat @test_vcvth_bf16_f32(float %a) { +entry: + %vcvth_bf16_f32 = call bfloat @llvm.aarch64.neon.bfcvt.f16.f32(float %a) + ret bfloat %vcvth_bf16_f32 +} + +; CHECK-LABEL: test_vcvtq_low_bf16_f32 +; CHECK: bfcvtn v0.4h, v0.4s +; CHECK-NEXT: ret +define <8 x bfloat> @test_vcvtq_low_bf16_f32(<4 x float> %a) { +entry: + %cvt = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn.v8f16.v4f32(<4 x float> %a) + ret <8 x bfloat> %cvt +} + +; CHECK-LABEL: test_vcvtq_high_bf16_f32 +; CHECK: bfcvtn2 v1.8h, v0.4s +; CHECK-NEXT: mov v0.16b, v1.16b +; CHECK-NEXT: ret +define <8 x bfloat>