Author: Ami-zhang
Date: 2025-06-09T11:15:41+08:00
New Revision: 0ed5d9aff6e72bdaf3f12bc71dbf83a5c116e8fd

URL: https://github.com/llvm/llvm-project/commit/0ed5d9aff6e72bdaf3f12bc71dbf83a5c116e8fd
DIFF: https://github.com/llvm/llvm-project/commit/0ed5d9aff6e72bdaf3f12bc71dbf83a5c116e8fd.diff

LOG: [LoongArch][BF16] Add support for the __bf16 type (#142548)

The LoongArch psABI recently added support for the __bf16 type, so we can
now enable it in clang. bf16 operations are automatically supported by
promotion to float; this patch ensures that the corresponding
load-extension and truncate-store operations are properly expanded, and
implements bf16 truncate/extend on hard-FP targets. The extend is
implemented with a shift, just as in the standard legalization, while the
truncate requires custom lowering of its libcall on hard-float ABIs (the
normal libcall code path is used on soft ABIs).
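For reference, the bit-level trick this relies on can be sketched in a few
lines of C (this sketch and its function names are illustrative, not part
of the commit): bfloat16 is simply the upper 16 bits of an IEEE-754
binary32, so the extend is a plain 16-bit left shift of the raw bits,
while the truncate has to round and is therefore left to the
__truncsfbf2/__truncdfbf2 libcalls rather than open-coded.

#include <stdint.h>
#include <string.h>

/* Extend: place the bf16 bit pattern in the upper half of a 32-bit word
   and reinterpret it as a float -- the SHL-by-16 that lowerBF16_TO_FP
   emits (moved into an FPR with movgr2fr.w on hard-float targets). */
static float bf16_bits_to_float(uint16_t bits) {
  uint32_t word = (uint32_t)bits << 16;
  float f;
  memcpy(&f, &word, sizeof f); /* i32 -> f32 bitcast */
  return f;
}

/* Truncate: round-to-nearest-even on the raw bits, roughly what a
   __truncsfbf2 implementation does; the backend itself only emits the
   libcall plus the FPR<->GPR moves around it. */
static uint16_t float_to_bf16_bits(float f) {
  uint32_t word;
  memcpy(&word, &f, sizeof word);
  if ((word & 0x7FFFFFFFu) > 0x7F800000u)        /* NaN: quiet it, keep sign */
    return (uint16_t)((word >> 16) | 0x0040u);
  uint32_t bias = 0x7FFFu + ((word >> 16) & 1u); /* round to nearest even */
  return (uint16_t)((word + bias) >> 16);
}

This corresponds to the codegen exercised by the tests below: an
slli.{w,d} by 16 plus movgr2fr.w for the extend, and a call to
__truncsfbf2 for the truncate.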
Added:
    clang/test/CodeGen/LoongArch/bfloat-abi.c
    clang/test/CodeGen/LoongArch/bfloat-mangle.cpp
    llvm/test/CodeGen/LoongArch/bf16-promote.ll
    llvm/test/CodeGen/LoongArch/bf16.ll

Modified:
    clang/docs/LanguageExtensions.rst
    clang/lib/Basic/Targets/LoongArch.h
    llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
    llvm/lib/Target/LoongArch/LoongArchISelLowering.h

Removed:

################################################################################

diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index 73544826809c3..083c909088361 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -1009,6 +1009,7 @@ to ``float``; see below for more information on this emulation. * 64-bit ARM (AArch64) * RISC-V * X86 (when SSE2 is available) + * LoongArch (For X86, SSE2 is available on 64-bit and all recent 32-bit processors.) diff --git a/clang/lib/Basic/Targets/LoongArch.h b/clang/lib/Basic/Targets/LoongArch.h index 11636fa55cabd..3ad5abca927b7 100644 --- a/clang/lib/Basic/Targets/LoongArch.h +++ b/clang/lib/Basic/Targets/LoongArch.h @@ -49,6 +49,9 @@ class LLVM_LIBRARY_VISIBILITY LoongArchTargetInfo : public TargetInfo { HasFeatureLD_SEQ_SA = false; HasFeatureDiv32 = false; HasFeatureSCQ = false; + BFloat16Width = 16; + BFloat16Align = 16; + BFloat16Format = &llvm::APFloat::BFloat(); LongDoubleWidth = 128; LongDoubleAlign = 128; LongDoubleFormat = &llvm::APFloat::IEEEquad(); @@ -99,6 +102,8 @@ class LLVM_LIBRARY_VISIBILITY LoongArchTargetInfo : public TargetInfo { bool hasBitIntType() const override { return true; } + bool hasBFloat16Type() const override { return true; } + bool useFP16ConversionIntrinsics() const override { return false; } bool handleTargetFeatures(std::vector<std::string> &Features, diff --git a/clang/test/CodeGen/LoongArch/bfloat-abi.c b/clang/test/CodeGen/LoongArch/bfloat-abi.c new file mode 100644 index 0000000000000..a8a252919ef31 --- /dev/null +++ b/clang/test/CodeGen/LoongArch/bfloat-abi.c @@ -0,0 +1,532 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 +// RUN: %clang_cc1 -triple loongarch64 -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LA64 +// RUN: %clang_cc1 -triple loongarch32 -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-LA32 + +struct bfloat1 { + __bf16 a; +}; + +// CHECK-LABEL: define dso_local bfloat @h1 +// CHECK-SAME: (bfloat noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_BFLOAT1:%.*]], align 2 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-NEXT: store bfloat [[A]], ptr [[A_ADDR]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = 
load bfloat, ptr [[A_ADDR]], align 2 +// CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT1]], ptr [[RETVAL]], i32 0, i32 0 +// CHECK-NEXT: store bfloat [[TMP0]], ptr [[A1]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { bfloat }, ptr [[RETVAL]], i32 0, i32 0 +// CHECK-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[TMP1]], align 2 +// CHECK-NEXT: ret bfloat [[TMP2]] +// +struct bfloat1 h1(__bf16 a) { + struct bfloat1 x; + x.a = a; + return x; +} + +struct bfloat2 { + __bf16 a; + __bf16 b; +}; + +// CHECK-LABEL: define dso_local { bfloat, bfloat } @h2 +// CHECK-SAME: (bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_BFLOAT2:%.*]], align 2 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-NEXT: store bfloat [[A]], ptr [[A_ADDR]], align 2 +// CHECK-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2 +// CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT2]], ptr [[RETVAL]], i32 0, i32 0 +// CHECK-NEXT: store bfloat [[TMP0]], ptr [[A1]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2 +// CHECK-NEXT: [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT2]], ptr [[RETVAL]], i32 0, i32 1 +// CHECK-NEXT: store bfloat [[TMP1]], ptr [[B2]], align 2 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw { bfloat, bfloat }, ptr [[RETVAL]], i32 0, i32 0 +// CHECK-NEXT: [[TMP3:%.*]] = load bfloat, ptr [[TMP2]], align 2 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw { bfloat, bfloat }, ptr [[RETVAL]], i32 0, i32 1 +// CHECK-NEXT: [[TMP5:%.*]] = load bfloat, ptr [[TMP4]], align 2 +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { bfloat, bfloat } poison, bfloat [[TMP3]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { bfloat, bfloat } [[TMP6]], bfloat [[TMP5]], 1 +// CHECK-NEXT: ret { bfloat, bfloat } [[TMP7]] +// +struct bfloat2 h2(__bf16 a, __bf16 b) { + struct bfloat2 x; + x.a = a; + x.b = b; + return x; +} + +struct bfloat3 { + __bf16 a; + __bf16 b; + __bf16 c; +}; + +// CHECK-LA64-LABEL: define dso_local i64 @h3 +// CHECK-LA64-SAME: (bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-LA64-NEXT: entry: +// CHECK-LA64-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_BFLOAT3:%.*]], align 2 +// CHECK-LA64-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA64-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA64-NEXT: [[C_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA64-NEXT: [[RETVAL_COERCE:%.*]] = alloca i64, align 8 +// CHECK-LA64-NEXT: store bfloat [[A]], ptr [[A_ADDR]], align 2 +// CHECK-LA64-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2 +// CHECK-LA64-NEXT: store bfloat [[C]], ptr [[C_ADDR]], align 2 +// CHECK-LA64-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2 +// CHECK-LA64-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT3]], ptr [[RETVAL]], i32 0, i32 0 +// CHECK-LA64-NEXT: store bfloat [[TMP0]], ptr [[A1]], align 2 +// CHECK-LA64-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2 +// CHECK-LA64-NEXT: [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT3]], ptr [[RETVAL]], i32 0, i32 1 +// CHECK-LA64-NEXT: store bfloat [[TMP1]], ptr [[B2]], align 2 +// CHECK-LA64-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[C_ADDR]], align 2 +// CHECK-LA64-NEXT: [[C3:%.*]] = getelementptr inbounds nuw 
[[STRUCT_BFLOAT3]], ptr [[RETVAL]], i32 0, i32 2 +// CHECK-LA64-NEXT: store bfloat [[TMP2]], ptr [[C3]], align 2 +// CHECK-LA64-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL_COERCE]], ptr align 2 [[RETVAL]], i64 6, i1 false) +// CHECK-LA64-NEXT: [[TMP3:%.*]] = load i64, ptr [[RETVAL_COERCE]], align 8 +// CHECK-LA64-NEXT: ret i64 [[TMP3]] +// +// CHECK-LA32-LABEL: define dso_local [2 x i32] @h3 +// CHECK-LA32-SAME: (bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-LA32-NEXT: entry: +// CHECK-LA32-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_BFLOAT3:%.*]], align 2 +// CHECK-LA32-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA32-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA32-NEXT: [[C_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA32-NEXT: [[RETVAL_COERCE:%.*]] = alloca [2 x i32], align 4 +// CHECK-LA32-NEXT: store bfloat [[A]], ptr [[A_ADDR]], align 2 +// CHECK-LA32-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2 +// CHECK-LA32-NEXT: store bfloat [[C]], ptr [[C_ADDR]], align 2 +// CHECK-LA32-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2 +// CHECK-LA32-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT3]], ptr [[RETVAL]], i32 0, i32 0 +// CHECK-LA32-NEXT: store bfloat [[TMP0]], ptr [[A1]], align 2 +// CHECK-LA32-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2 +// CHECK-LA32-NEXT: [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT3]], ptr [[RETVAL]], i32 0, i32 1 +// CHECK-LA32-NEXT: store bfloat [[TMP1]], ptr [[B2]], align 2 +// CHECK-LA32-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[C_ADDR]], align 2 +// CHECK-LA32-NEXT: [[C3:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT3]], ptr [[RETVAL]], i32 0, i32 2 +// CHECK-LA32-NEXT: store bfloat [[TMP2]], ptr [[C3]], align 2 +// CHECK-LA32-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[RETVAL_COERCE]], ptr align 2 [[RETVAL]], i32 6, i1 false) +// CHECK-LA32-NEXT: [[TMP3:%.*]] = load [2 x i32], ptr [[RETVAL_COERCE]], align 4 +// CHECK-LA32-NEXT: ret [2 x i32] [[TMP3]] +// +struct bfloat3 h3(__bf16 a, __bf16 b, __bf16 c) { + struct bfloat3 x; + x.a = a; + x.b = b; + x.c = c; + return x; +} + +struct bfloat4 { + __bf16 a; + __bf16 b; + __bf16 c; + __bf16 d; +}; + +// CHECK-LA64-LABEL: define dso_local i64 @h4 +// CHECK-LA64-SAME: (bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]], bfloat noundef [[D:%.*]]) #[[ATTR0]] { +// CHECK-LA64-NEXT: entry: +// CHECK-LA64-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_BFLOAT4:%.*]], align 2 +// CHECK-LA64-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA64-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA64-NEXT: [[C_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA64-NEXT: [[D_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA64-NEXT: store bfloat [[A]], ptr [[A_ADDR]], align 2 +// CHECK-LA64-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2 +// CHECK-LA64-NEXT: store bfloat [[C]], ptr [[C_ADDR]], align 2 +// CHECK-LA64-NEXT: store bfloat [[D]], ptr [[D_ADDR]], align 2 +// CHECK-LA64-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2 +// CHECK-LA64-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT4]], ptr [[RETVAL]], i32 0, i32 0 +// CHECK-LA64-NEXT: store bfloat [[TMP0]], ptr [[A1]], align 2 +// CHECK-LA64-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2 +// CHECK-LA64-NEXT: [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT4]], ptr [[RETVAL]], i32 0, i32 1 +// CHECK-LA64-NEXT: store bfloat [[TMP1]], ptr [[B2]], 
align 2 +// CHECK-LA64-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[C_ADDR]], align 2 +// CHECK-LA64-NEXT: [[C3:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT4]], ptr [[RETVAL]], i32 0, i32 2 +// CHECK-LA64-NEXT: store bfloat [[TMP2]], ptr [[C3]], align 2 +// CHECK-LA64-NEXT: [[TMP3:%.*]] = load bfloat, ptr [[D_ADDR]], align 2 +// CHECK-LA64-NEXT: [[D4:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT4]], ptr [[RETVAL]], i32 0, i32 3 +// CHECK-LA64-NEXT: store bfloat [[TMP3]], ptr [[D4]], align 2 +// CHECK-LA64-NEXT: [[TMP4:%.*]] = load i64, ptr [[RETVAL]], align 2 +// CHECK-LA64-NEXT: ret i64 [[TMP4]] +// +// CHECK-LA32-LABEL: define dso_local [2 x i32] @h4 +// CHECK-LA32-SAME: (bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]], bfloat noundef [[D:%.*]]) #[[ATTR0]] { +// CHECK-LA32-NEXT: entry: +// CHECK-LA32-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_BFLOAT4:%.*]], align 2 +// CHECK-LA32-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA32-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA32-NEXT: [[C_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA32-NEXT: [[D_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA32-NEXT: store bfloat [[A]], ptr [[A_ADDR]], align 2 +// CHECK-LA32-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2 +// CHECK-LA32-NEXT: store bfloat [[C]], ptr [[C_ADDR]], align 2 +// CHECK-LA32-NEXT: store bfloat [[D]], ptr [[D_ADDR]], align 2 +// CHECK-LA32-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2 +// CHECK-LA32-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT4]], ptr [[RETVAL]], i32 0, i32 0 +// CHECK-LA32-NEXT: store bfloat [[TMP0]], ptr [[A1]], align 2 +// CHECK-LA32-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2 +// CHECK-LA32-NEXT: [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT4]], ptr [[RETVAL]], i32 0, i32 1 +// CHECK-LA32-NEXT: store bfloat [[TMP1]], ptr [[B2]], align 2 +// CHECK-LA32-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[C_ADDR]], align 2 +// CHECK-LA32-NEXT: [[C3:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT4]], ptr [[RETVAL]], i32 0, i32 2 +// CHECK-LA32-NEXT: store bfloat [[TMP2]], ptr [[C3]], align 2 +// CHECK-LA32-NEXT: [[TMP3:%.*]] = load bfloat, ptr [[D_ADDR]], align 2 +// CHECK-LA32-NEXT: [[D4:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT4]], ptr [[RETVAL]], i32 0, i32 3 +// CHECK-LA32-NEXT: store bfloat [[TMP3]], ptr [[D4]], align 2 +// CHECK-LA32-NEXT: [[TMP4:%.*]] = load [2 x i32], ptr [[RETVAL]], align 2 +// CHECK-LA32-NEXT: ret [2 x i32] [[TMP4]] +// +struct bfloat4 h4(__bf16 a, __bf16 b, __bf16 c, __bf16 d) { + struct bfloat4 x; + x.a = a; + x.b = b; + x.c = c; + x.d = d; + return x; +} + +struct floatbfloat { + float a; + __bf16 b; +}; + +// CHECK-LABEL: define dso_local { float, bfloat } @fh +// CHECK-SAME: (float noundef [[A:%.*]], bfloat noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOATBFLOAT:%.*]], align 4 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca float, align 4 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-NEXT: store float [[A]], ptr [[A_ADDR]], align 4 +// CHECK-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR]], align 4 +// CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT]], ptr [[RETVAL]], i32 0, i32 0 +// CHECK-NEXT: store float [[TMP0]], ptr [[A1]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2 +// CHECK-NEXT: [[B2:%.*]] = getelementptr inbounds nuw 
[[STRUCT_FLOATBFLOAT]], ptr [[RETVAL]], i32 0, i32 1 +// CHECK-NEXT: store bfloat [[TMP1]], ptr [[B2]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw { float, bfloat }, ptr [[RETVAL]], i32 0, i32 0 +// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP2]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw { float, bfloat }, ptr [[RETVAL]], i32 0, i32 1 +// CHECK-NEXT: [[TMP5:%.*]] = load bfloat, ptr [[TMP4]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { float, bfloat } poison, float [[TMP3]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { float, bfloat } [[TMP6]], bfloat [[TMP5]], 1 +// CHECK-NEXT: ret { float, bfloat } [[TMP7]] +// +struct floatbfloat fh(float a, __bf16 b) { + struct floatbfloat x; + x.a = a; + x.b = b; + return x; +} + +struct floatbfloat2 { + float a; + __bf16 b; + __bf16 c; +}; + +// CHECK-LA64-LABEL: define dso_local i64 @fh2 +// CHECK-LA64-SAME: (float noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-LA64-NEXT: entry: +// CHECK-LA64-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOATBFLOAT2:%.*]], align 4 +// CHECK-LA64-NEXT: [[A_ADDR:%.*]] = alloca float, align 4 +// CHECK-LA64-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA64-NEXT: [[C_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA64-NEXT: store float [[A]], ptr [[A_ADDR]], align 4 +// CHECK-LA64-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2 +// CHECK-LA64-NEXT: store bfloat [[C]], ptr [[C_ADDR]], align 2 +// CHECK-LA64-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR]], align 4 +// CHECK-LA64-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT2]], ptr [[RETVAL]], i32 0, i32 0 +// CHECK-LA64-NEXT: store float [[TMP0]], ptr [[A1]], align 4 +// CHECK-LA64-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2 +// CHECK-LA64-NEXT: [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT2]], ptr [[RETVAL]], i32 0, i32 1 +// CHECK-LA64-NEXT: store bfloat [[TMP1]], ptr [[B2]], align 4 +// CHECK-LA64-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[C_ADDR]], align 2 +// CHECK-LA64-NEXT: [[C3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT2]], ptr [[RETVAL]], i32 0, i32 2 +// CHECK-LA64-NEXT: store bfloat [[TMP2]], ptr [[C3]], align 2 +// CHECK-LA64-NEXT: [[TMP3:%.*]] = load i64, ptr [[RETVAL]], align 4 +// CHECK-LA64-NEXT: ret i64 [[TMP3]] +// +// CHECK-LA32-LABEL: define dso_local [2 x i32] @fh2 +// CHECK-LA32-SAME: (float noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-LA32-NEXT: entry: +// CHECK-LA32-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOATBFLOAT2:%.*]], align 4 +// CHECK-LA32-NEXT: [[A_ADDR:%.*]] = alloca float, align 4 +// CHECK-LA32-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA32-NEXT: [[C_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA32-NEXT: store float [[A]], ptr [[A_ADDR]], align 4 +// CHECK-LA32-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2 +// CHECK-LA32-NEXT: store bfloat [[C]], ptr [[C_ADDR]], align 2 +// CHECK-LA32-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR]], align 4 +// CHECK-LA32-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT2]], ptr [[RETVAL]], i32 0, i32 0 +// CHECK-LA32-NEXT: store float [[TMP0]], ptr [[A1]], align 4 +// CHECK-LA32-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2 +// CHECK-LA32-NEXT: [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT2]], ptr [[RETVAL]], i32 0, i32 1 +// CHECK-LA32-NEXT: store bfloat [[TMP1]], ptr [[B2]], align 4 +// CHECK-LA32-NEXT: [[TMP2:%.*]] 
= load bfloat, ptr [[C_ADDR]], align 2 +// CHECK-LA32-NEXT: [[C3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT2]], ptr [[RETVAL]], i32 0, i32 2 +// CHECK-LA32-NEXT: store bfloat [[TMP2]], ptr [[C3]], align 2 +// CHECK-LA32-NEXT: [[TMP3:%.*]] = load [2 x i32], ptr [[RETVAL]], align 4 +// CHECK-LA32-NEXT: ret [2 x i32] [[TMP3]] +// +struct floatbfloat2 fh2(float a, __bf16 b, __bf16 c) { + struct floatbfloat2 x; + x.a = a; + x.b = b; + x.c = c; + return x; +} + +struct bfloatfloat { + __bf16 a; + float b; +}; + +// CHECK-LABEL: define dso_local { bfloat, float } @hf +// CHECK-SAME: (bfloat noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_BFLOATFLOAT:%.*]], align 4 +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca float, align 4 +// CHECK-NEXT: store bfloat [[A]], ptr [[A_ADDR]], align 2 +// CHECK-NEXT: store float [[B]], ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2 +// CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOATFLOAT]], ptr [[RETVAL]], i32 0, i32 0 +// CHECK-NEXT: store bfloat [[TMP0]], ptr [[A1]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[B_ADDR]], align 4 +// CHECK-NEXT: [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOATFLOAT]], ptr [[RETVAL]], i32 0, i32 1 +// CHECK-NEXT: store float [[TMP1]], ptr [[B2]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw { bfloat, float }, ptr [[RETVAL]], i32 0, i32 0 +// CHECK-NEXT: [[TMP3:%.*]] = load bfloat, ptr [[TMP2]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw { bfloat, float }, ptr [[RETVAL]], i32 0, i32 1 +// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { bfloat, float } poison, bfloat [[TMP3]], 0 +// CHECK-NEXT: [[TMP7:%.*]] = insertvalue { bfloat, float } [[TMP6]], float [[TMP5]], 1 +// CHECK-NEXT: ret { bfloat, float } [[TMP7]] +// +struct bfloatfloat hf(__bf16 a, float b) { + struct bfloatfloat x; + x.a = a; + x.b = b; + return x; +} + +struct bfloat2float { + __bf16 a; + __bf16 b; + float c; +}; + +// CHECK-LA64-LABEL: define dso_local i64 @h2f +// CHECK-LA64-SAME: (bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]], float noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-LA64-NEXT: entry: +// CHECK-LA64-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_BFLOAT2FLOAT:%.*]], align 4 +// CHECK-LA64-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA64-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA64-NEXT: [[C_ADDR:%.*]] = alloca float, align 4 +// CHECK-LA64-NEXT: store bfloat [[A]], ptr [[A_ADDR]], align 2 +// CHECK-LA64-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2 +// CHECK-LA64-NEXT: store float [[C]], ptr [[C_ADDR]], align 4 +// CHECK-LA64-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2 +// CHECK-LA64-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT2FLOAT]], ptr [[RETVAL]], i32 0, i32 0 +// CHECK-LA64-NEXT: store bfloat [[TMP0]], ptr [[A1]], align 4 +// CHECK-LA64-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2 +// CHECK-LA64-NEXT: [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT2FLOAT]], ptr [[RETVAL]], i32 0, i32 1 +// CHECK-LA64-NEXT: store bfloat [[TMP1]], ptr [[B2]], align 2 +// CHECK-LA64-NEXT: [[TMP2:%.*]] = load float, ptr [[C_ADDR]], align 4 +// CHECK-LA64-NEXT: [[C3:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT2FLOAT]], ptr [[RETVAL]], i32 0, i32 2 +// 
CHECK-LA64-NEXT: store float [[TMP2]], ptr [[C3]], align 4 +// CHECK-LA64-NEXT: [[TMP3:%.*]] = load i64, ptr [[RETVAL]], align 4 +// CHECK-LA64-NEXT: ret i64 [[TMP3]] +// +// CHECK-LA32-LABEL: define dso_local [2 x i32] @h2f +// CHECK-LA32-SAME: (bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]], float noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-LA32-NEXT: entry: +// CHECK-LA32-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_BFLOAT2FLOAT:%.*]], align 4 +// CHECK-LA32-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA32-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA32-NEXT: [[C_ADDR:%.*]] = alloca float, align 4 +// CHECK-LA32-NEXT: store bfloat [[A]], ptr [[A_ADDR]], align 2 +// CHECK-LA32-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2 +// CHECK-LA32-NEXT: store float [[C]], ptr [[C_ADDR]], align 4 +// CHECK-LA32-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2 +// CHECK-LA32-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT2FLOAT]], ptr [[RETVAL]], i32 0, i32 0 +// CHECK-LA32-NEXT: store bfloat [[TMP0]], ptr [[A1]], align 4 +// CHECK-LA32-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2 +// CHECK-LA32-NEXT: [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT2FLOAT]], ptr [[RETVAL]], i32 0, i32 1 +// CHECK-LA32-NEXT: store bfloat [[TMP1]], ptr [[B2]], align 2 +// CHECK-LA32-NEXT: [[TMP2:%.*]] = load float, ptr [[C_ADDR]], align 4 +// CHECK-LA32-NEXT: [[C3:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT2FLOAT]], ptr [[RETVAL]], i32 0, i32 2 +// CHECK-LA32-NEXT: store float [[TMP2]], ptr [[C3]], align 4 +// CHECK-LA32-NEXT: [[TMP3:%.*]] = load [2 x i32], ptr [[RETVAL]], align 4 +// CHECK-LA32-NEXT: ret [2 x i32] [[TMP3]] +// +struct bfloat2float h2f(__bf16 a, __bf16 b, float c) { + struct bfloat2float x; + x.a = a; + x.b = b; + x.c = c; + return x; +} + +struct floatbfloat3 { + float a; + __bf16 b; + __bf16 c; + __bf16 d; +}; + +// CHECK-LA64-LABEL: define dso_local [2 x i64] @fh3 +// CHECK-LA64-SAME: (float noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]], bfloat noundef [[D:%.*]]) #[[ATTR0]] { +// CHECK-LA64-NEXT: entry: +// CHECK-LA64-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_FLOATBFLOAT3:%.*]], align 4 +// CHECK-LA64-NEXT: [[A_ADDR:%.*]] = alloca float, align 4 +// CHECK-LA64-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA64-NEXT: [[C_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA64-NEXT: [[D_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA64-NEXT: [[RETVAL_COERCE:%.*]] = alloca [2 x i64], align 8 +// CHECK-LA64-NEXT: store float [[A]], ptr [[A_ADDR]], align 4 +// CHECK-LA64-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2 +// CHECK-LA64-NEXT: store bfloat [[C]], ptr [[C_ADDR]], align 2 +// CHECK-LA64-NEXT: store bfloat [[D]], ptr [[D_ADDR]], align 2 +// CHECK-LA64-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR]], align 4 +// CHECK-LA64-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT3]], ptr [[RETVAL]], i32 0, i32 0 +// CHECK-LA64-NEXT: store float [[TMP0]], ptr [[A1]], align 4 +// CHECK-LA64-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2 +// CHECK-LA64-NEXT: [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT3]], ptr [[RETVAL]], i32 0, i32 1 +// CHECK-LA64-NEXT: store bfloat [[TMP1]], ptr [[B2]], align 4 +// CHECK-LA64-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[C_ADDR]], align 2 +// CHECK-LA64-NEXT: [[C3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT3]], ptr [[RETVAL]], i32 0, i32 2 +// CHECK-LA64-NEXT: store bfloat [[TMP2]], ptr [[C3]], align 2 +// 
CHECK-LA64-NEXT: [[TMP3:%.*]] = load bfloat, ptr [[D_ADDR]], align 2 +// CHECK-LA64-NEXT: [[D4:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT3]], ptr [[RETVAL]], i32 0, i32 3 +// CHECK-LA64-NEXT: store bfloat [[TMP3]], ptr [[D4]], align 4 +// CHECK-LA64-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL_COERCE]], ptr align 4 [[RETVAL]], i64 12, i1 false) +// CHECK-LA64-NEXT: [[TMP4:%.*]] = load [2 x i64], ptr [[RETVAL_COERCE]], align 8 +// CHECK-LA64-NEXT: ret [2 x i64] [[TMP4]] +// +// CHECK-LA32-LABEL: define dso_local void @fh3 +// CHECK-LA32-SAME: (ptr dead_on_unwind noalias writable sret([[STRUCT_FLOATBFLOAT3:%.*]]) align 4 [[AGG_RESULT:%.*]], float noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]], bfloat noundef [[D:%.*]]) #[[ATTR0]] { +// CHECK-LA32-NEXT: entry: +// CHECK-LA32-NEXT: [[RESULT_PTR:%.*]] = alloca ptr, align 4 +// CHECK-LA32-NEXT: [[A_ADDR:%.*]] = alloca float, align 4 +// CHECK-LA32-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA32-NEXT: [[C_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA32-NEXT: [[D_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA32-NEXT: store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4 +// CHECK-LA32-NEXT: store float [[A]], ptr [[A_ADDR]], align 4 +// CHECK-LA32-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2 +// CHECK-LA32-NEXT: store bfloat [[C]], ptr [[C_ADDR]], align 2 +// CHECK-LA32-NEXT: store bfloat [[D]], ptr [[D_ADDR]], align 2 +// CHECK-LA32-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR]], align 4 +// CHECK-LA32-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT3]], ptr [[AGG_RESULT]], i32 0, i32 0 +// CHECK-LA32-NEXT: store float [[TMP0]], ptr [[A1]], align 4 +// CHECK-LA32-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2 +// CHECK-LA32-NEXT: [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT3]], ptr [[AGG_RESULT]], i32 0, i32 1 +// CHECK-LA32-NEXT: store bfloat [[TMP1]], ptr [[B2]], align 4 +// CHECK-LA32-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[C_ADDR]], align 2 +// CHECK-LA32-NEXT: [[C3:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT3]], ptr [[AGG_RESULT]], i32 0, i32 2 +// CHECK-LA32-NEXT: store bfloat [[TMP2]], ptr [[C3]], align 2 +// CHECK-LA32-NEXT: [[TMP3:%.*]] = load bfloat, ptr [[D_ADDR]], align 2 +// CHECK-LA32-NEXT: [[D4:%.*]] = getelementptr inbounds nuw [[STRUCT_FLOATBFLOAT3]], ptr [[AGG_RESULT]], i32 0, i32 3 +// CHECK-LA32-NEXT: store bfloat [[TMP3]], ptr [[D4]], align 4 +// CHECK-LA32-NEXT: ret void +// +struct floatbfloat3 fh3(float a, __bf16 b, __bf16 c, __bf16 d) { + struct floatbfloat3 x; + x.a = a; + x.b = b; + x.c = c; + x.d = d; + return x; +} + +struct bfloat5 { + __bf16 a; + __bf16 b; + __bf16 c; + __bf16 d; + __bf16 e; +}; + +// CHECK-LA64-LABEL: define dso_local [2 x i64] @h5 +// CHECK-LA64-SAME: (bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]], bfloat noundef [[D:%.*]], bfloat noundef [[E:%.*]]) #[[ATTR0]] { +// CHECK-LA64-NEXT: entry: +// CHECK-LA64-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_BFLOAT5:%.*]], align 2 +// CHECK-LA64-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA64-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA64-NEXT: [[C_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA64-NEXT: [[D_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA64-NEXT: [[E_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA64-NEXT: [[RETVAL_COERCE:%.*]] = alloca [2 x i64], align 8 +// CHECK-LA64-NEXT: store bfloat [[A]], ptr [[A_ADDR]], align 2 +// CHECK-LA64-NEXT: store bfloat [[B]], 
ptr [[B_ADDR]], align 2 +// CHECK-LA64-NEXT: store bfloat [[C]], ptr [[C_ADDR]], align 2 +// CHECK-LA64-NEXT: store bfloat [[D]], ptr [[D_ADDR]], align 2 +// CHECK-LA64-NEXT: store bfloat [[E]], ptr [[E_ADDR]], align 2 +// CHECK-LA64-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2 +// CHECK-LA64-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT5]], ptr [[RETVAL]], i32 0, i32 0 +// CHECK-LA64-NEXT: store bfloat [[TMP0]], ptr [[A1]], align 2 +// CHECK-LA64-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2 +// CHECK-LA64-NEXT: [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT5]], ptr [[RETVAL]], i32 0, i32 1 +// CHECK-LA64-NEXT: store bfloat [[TMP1]], ptr [[B2]], align 2 +// CHECK-LA64-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[C_ADDR]], align 2 +// CHECK-LA64-NEXT: [[C3:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT5]], ptr [[RETVAL]], i32 0, i32 2 +// CHECK-LA64-NEXT: store bfloat [[TMP2]], ptr [[C3]], align 2 +// CHECK-LA64-NEXT: [[TMP3:%.*]] = load bfloat, ptr [[D_ADDR]], align 2 +// CHECK-LA64-NEXT: [[D4:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT5]], ptr [[RETVAL]], i32 0, i32 3 +// CHECK-LA64-NEXT: store bfloat [[TMP3]], ptr [[D4]], align 2 +// CHECK-LA64-NEXT: [[TMP4:%.*]] = load bfloat, ptr [[E_ADDR]], align 2 +// CHECK-LA64-NEXT: [[E5:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT5]], ptr [[RETVAL]], i32 0, i32 4 +// CHECK-LA64-NEXT: store bfloat [[TMP4]], ptr [[E5]], align 2 +// CHECK-LA64-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[RETVAL_COERCE]], ptr align 2 [[RETVAL]], i64 10, i1 false) +// CHECK-LA64-NEXT: [[TMP5:%.*]] = load [2 x i64], ptr [[RETVAL_COERCE]], align 8 +// CHECK-LA64-NEXT: ret [2 x i64] [[TMP5]] +// +// CHECK-LA32-LABEL: define dso_local void @h5 +// CHECK-LA32-SAME: (ptr dead_on_unwind noalias writable sret([[STRUCT_BFLOAT5:%.*]]) align 2 [[AGG_RESULT:%.*]], bfloat noundef [[A:%.*]], bfloat noundef [[B:%.*]], bfloat noundef [[C:%.*]], bfloat noundef [[D:%.*]], bfloat noundef [[E:%.*]]) #[[ATTR0]] { +// CHECK-LA32-NEXT: entry: +// CHECK-LA32-NEXT: [[RESULT_PTR:%.*]] = alloca ptr, align 4 +// CHECK-LA32-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA32-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA32-NEXT: [[C_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA32-NEXT: [[D_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA32-NEXT: [[E_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-LA32-NEXT: store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4 +// CHECK-LA32-NEXT: store bfloat [[A]], ptr [[A_ADDR]], align 2 +// CHECK-LA32-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2 +// CHECK-LA32-NEXT: store bfloat [[C]], ptr [[C_ADDR]], align 2 +// CHECK-LA32-NEXT: store bfloat [[D]], ptr [[D_ADDR]], align 2 +// CHECK-LA32-NEXT: store bfloat [[E]], ptr [[E_ADDR]], align 2 +// CHECK-LA32-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR]], align 2 +// CHECK-LA32-NEXT: [[A1:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT5]], ptr [[AGG_RESULT]], i32 0, i32 0 +// CHECK-LA32-NEXT: store bfloat [[TMP0]], ptr [[A1]], align 2 +// CHECK-LA32-NEXT: [[TMP1:%.*]] = load bfloat, ptr [[B_ADDR]], align 2 +// CHECK-LA32-NEXT: [[B2:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT5]], ptr [[AGG_RESULT]], i32 0, i32 1 +// CHECK-LA32-NEXT: store bfloat [[TMP1]], ptr [[B2]], align 2 +// CHECK-LA32-NEXT: [[TMP2:%.*]] = load bfloat, ptr [[C_ADDR]], align 2 +// CHECK-LA32-NEXT: [[C3:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT5]], ptr [[AGG_RESULT]], i32 0, i32 2 +// CHECK-LA32-NEXT: store bfloat [[TMP2]], ptr 
[[C3]], align 2 +// CHECK-LA32-NEXT: [[TMP3:%.*]] = load bfloat, ptr [[D_ADDR]], align 2 +// CHECK-LA32-NEXT: [[D4:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT5]], ptr [[AGG_RESULT]], i32 0, i32 3 +// CHECK-LA32-NEXT: store bfloat [[TMP3]], ptr [[D4]], align 2 +// CHECK-LA32-NEXT: [[TMP4:%.*]] = load bfloat, ptr [[E_ADDR]], align 2 +// CHECK-LA32-NEXT: [[E5:%.*]] = getelementptr inbounds nuw [[STRUCT_BFLOAT5]], ptr [[AGG_RESULT]], i32 0, i32 4 +// CHECK-LA32-NEXT: store bfloat [[TMP4]], ptr [[E5]], align 2 +// CHECK-LA32-NEXT: ret void +// +struct bfloat5 h5(__bf16 a, __bf16 b, __bf16 c, __bf16 d, __bf16 e) { + struct bfloat5 x; + x.a = a; + x.b = b; + x.c = c; + x.d = d; + x.e = e; + return x; +} diff --git a/clang/test/CodeGen/LoongArch/bfloat-mangle.cpp b/clang/test/CodeGen/LoongArch/bfloat-mangle.cpp new file mode 100644 index 0000000000000..de4a10dbe44bd --- /dev/null +++ b/clang/test/CodeGen/LoongArch/bfloat-mangle.cpp @@ -0,0 +1,12 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 +// RUN: %clang_cc1 -triple loongarch64 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple loongarch32 -emit-llvm -o - %s | FileCheck %s + +// CHECK-LABEL: define dso_local void @_Z3fooDF16b +// CHECK-SAME: (bfloat noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca bfloat, align 2 +// CHECK-NEXT: store bfloat [[B]], ptr [[B_ADDR]], align 2 +// CHECK-NEXT: ret void +// +void foo(__bf16 b) {} diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 8c0d92ab8cd62..b968e051acb00 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -182,6 +182,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, if (Subtarget.hasBasicF()) { setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); setTruncStoreAction(MVT::f32, MVT::f16, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand); + setTruncStoreAction(MVT::f32, MVT::bf16, Expand); setCondCodeAction(FPCCToExpand, MVT::f32, Expand); setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); @@ -203,6 +205,9 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, Subtarget.isSoftFPABI() ? LibCall : Custom); setOperationAction(ISD::FP_TO_FP16, MVT::f32, Subtarget.isSoftFPABI() ? LibCall : Custom); + setOperationAction(ISD::BF16_TO_FP, MVT::f32, Custom); + setOperationAction(ISD::FP_TO_BF16, MVT::f32, + Subtarget.isSoftFPABI() ? LibCall : Custom); if (Subtarget.is64Bit()) setOperationAction(ISD::FRINT, MVT::f32, Legal); @@ -221,6 +226,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, if (Subtarget.hasBasicD()) { setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand); + setTruncStoreAction(MVT::f64, MVT::bf16, Expand); setTruncStoreAction(MVT::f64, MVT::f16, Expand); setTruncStoreAction(MVT::f64, MVT::f32, Expand); setCondCodeAction(FPCCToExpand, MVT::f64, Expand); @@ -243,6 +250,9 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f64, Subtarget.isSoftFPABI() ? 
LibCall : Custom); + setOperationAction(ISD::BF16_TO_FP, MVT::f64, Custom); + setOperationAction(ISD::FP_TO_BF16, MVT::f64, + Subtarget.isSoftFPABI() ? LibCall : Custom); if (Subtarget.is64Bit()) setOperationAction(ISD::FRINT, MVT::f64, Legal); @@ -499,6 +509,10 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op, return lowerFP_TO_FP16(Op, DAG); case ISD::FP16_TO_FP: return lowerFP16_TO_FP(Op, DAG); + case ISD::FP_TO_BF16: + return lowerFP_TO_BF16(Op, DAG); + case ISD::BF16_TO_FP: + return lowerBF16_TO_FP(Op, DAG); } return SDValue(); } @@ -2333,6 +2347,36 @@ SDValue LoongArchTargetLowering::lowerFP16_TO_FP(SDValue Op, return Res; } +SDValue LoongArchTargetLowering::lowerFP_TO_BF16(SDValue Op, + SelectionDAG &DAG) const { + assert(Subtarget.hasBasicF() && "Unexpected custom legalization"); + SDLoc DL(Op); + MakeLibCallOptions CallOptions; + RTLIB::Libcall LC = + RTLIB::getFPROUND(Op.getOperand(0).getValueType(), MVT::bf16); + SDValue Res = + makeLibCall(DAG, LC, MVT::f32, Op.getOperand(0), CallOptions, DL).first; + if (Subtarget.is64Bit()) + return DAG.getNode(LoongArchISD::MOVFR2GR_S_LA64, DL, MVT::i64, Res); + return DAG.getBitcast(MVT::i32, Res); +} + +SDValue LoongArchTargetLowering::lowerBF16_TO_FP(SDValue Op, + SelectionDAG &DAG) const { + assert(Subtarget.hasBasicF() && "Unexpected custom legalization"); + MVT VT = Op.getSimpleValueType(); + SDLoc DL(Op); + Op = DAG.getNode( + ISD::SHL, DL, Op.getOperand(0).getValueType(), Op.getOperand(0), + DAG.getShiftAmountConstant(16, Op.getOperand(0).getValueType(), DL)); + SDValue Res = Subtarget.is64Bit() ? DAG.getNode(LoongArchISD::MOVGR2FR_W_LA64, + DL, MVT::f32, Op) + : DAG.getBitcast(MVT::f32, Op); + if (VT != MVT::f32) + return DAG.getNode(ISD::FP_EXTEND, DL, VT, Res); + return Res; +} + static bool isConstantOrUndef(const SDValue Op) { if (Op->isUndef()) return true; @@ -7993,8 +8037,9 @@ bool LoongArchTargetLowering::splitValueIntoRegisterParts( bool IsABIRegCopy = CC.has_value(); EVT ValueVT = Val.getValueType(); - if (IsABIRegCopy && ValueVT == MVT::f16 && PartVT == MVT::f32) { - // Cast the f16 to i16, extend to i32, pad with ones to make a float + if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) && + PartVT == MVT::f32) { + // Cast the [b]f16 to i16, extend to i32, pad with ones to make a float // nan, and cast to f32. Val = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Val); Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Val); @@ -8013,10 +8058,11 @@ SDValue LoongArchTargetLowering::joinRegisterPartsIntoValue( MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const { bool IsABIRegCopy = CC.has_value(); - if (IsABIRegCopy && ValueVT == MVT::f16 && PartVT == MVT::f32) { + if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) && + PartVT == MVT::f32) { SDValue Val = Parts[0]; - // Cast the f32 to i32, truncate to i16, and cast back to f16. + // Cast the f32 to i32, truncate to i16, and cast back to [b]f16. 
Val = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Val); Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Val); Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val); diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h index 4b6d3272db2c9..53e3f1adb8d27 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h @@ -373,6 +373,8 @@ class LoongArchTargetLowering : public TargetLowering { SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerFP_TO_BF16(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerBF16_TO_FP(SDValue Op, SelectionDAG &DAG) const; bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override; diff --git a/llvm/test/CodeGen/LoongArch/bf16-promote.ll b/llvm/test/CodeGen/LoongArch/bf16-promote.ll new file mode 100644 index 0000000000000..42651eb53acea --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/bf16-promote.ll @@ -0,0 +1,172 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=loongarch64 -mattr=+d -target-abi=lp64d < %s | FileCheck --check-prefixes=CHECK,LA64 %s +; RUN: llc -mtriple=loongarch32 -mattr=+d -target-abi=ilp32d < %s | FileCheck --check-prefixes=CHECK,LA32 %s + +define void @test_load_store(ptr %p, ptr %q) nounwind { +; CHECK-LABEL: test_load_store: +; CHECK: # %bb.0: +; CHECK-NEXT: ld.h $a0, $a0, 0 +; CHECK-NEXT: st.h $a0, $a1, 0 +; CHECK-NEXT: ret + %a = load bfloat, ptr %p + store bfloat %a, ptr %q + ret void +} + +define float @test_fpextend_float(ptr %p) nounwind { +; LA64-LABEL: test_fpextend_float: +; LA64: # %bb.0: +; LA64-NEXT: ld.hu $a0, $a0, 0 +; LA64-NEXT: slli.d $a0, $a0, 16 +; LA64-NEXT: movgr2fr.w $fa0, $a0 +; LA64-NEXT: ret +; +; LA32-LABEL: test_fpextend_float: +; LA32: # %bb.0: +; LA32-NEXT: ld.hu $a0, $a0, 0 +; LA32-NEXT: slli.w $a0, $a0, 16 +; LA32-NEXT: movgr2fr.w $fa0, $a0 +; LA32-NEXT: ret + %a = load bfloat, ptr %p + %r = fpext bfloat %a to float + ret float %r +} + +define double @test_fpextend_double(ptr %p) nounwind { +; LA64-LABEL: test_fpextend_double: +; LA64: # %bb.0: +; LA64-NEXT: ld.hu $a0, $a0, 0 +; LA64-NEXT: slli.d $a0, $a0, 16 +; LA64-NEXT: movgr2fr.w $fa0, $a0 +; LA64-NEXT: fcvt.d.s $fa0, $fa0 +; LA64-NEXT: ret +; +; LA32-LABEL: test_fpextend_double: +; LA32: # %bb.0: +; LA32-NEXT: ld.hu $a0, $a0, 0 +; LA32-NEXT: slli.w $a0, $a0, 16 +; LA32-NEXT: movgr2fr.w $fa0, $a0 +; LA32-NEXT: fcvt.d.s $fa0, $fa0 +; LA32-NEXT: ret + %a = load bfloat, ptr %p + %r = fpext bfloat %a to double + ret double %r +} + +define void @test_fptrunc_float(float %f, ptr %p) nounwind { +; LA64-LABEL: test_fptrunc_float: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill +; LA64-NEXT: move $fp, $a0 +; LA64-NEXT: pcaddu18i $ra, %call36(__truncsfbf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: movfr2gr.s $a0, $fa0 +; LA64-NEXT: st.h $a0, $fp, 0 +; LA64-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +; +; LA32-LABEL: test_fptrunc_float: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 8 # 4-byte Folded Spill +; LA32-NEXT: move $fp, $a0 +; 
LA32-NEXT: bl __truncsfbf2 +; LA32-NEXT: movfr2gr.s $a0, $fa0 +; LA32-NEXT: st.h $a0, $fp, 0 +; LA32-NEXT: ld.w $fp, $sp, 8 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret + %a = fptrunc float %f to bfloat + store bfloat %a, ptr %p + ret void +} + +define void @test_fptrunc_double(double %d, ptr %p) nounwind { +; LA64-LABEL: test_fptrunc_double: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill +; LA64-NEXT: move $fp, $a0 +; LA64-NEXT: pcaddu18i $ra, %call36(__truncdfbf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: movfr2gr.s $a0, $fa0 +; LA64-NEXT: st.h $a0, $fp, 0 +; LA64-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +; +; LA32-LABEL: test_fptrunc_double: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 8 # 4-byte Folded Spill +; LA32-NEXT: move $fp, $a0 +; LA32-NEXT: bl __truncdfbf2 +; LA32-NEXT: movfr2gr.s $a0, $fa0 +; LA32-NEXT: st.h $a0, $fp, 0 +; LA32-NEXT: ld.w $fp, $sp, 8 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret + %a = fptrunc double %d to bfloat + store bfloat %a, ptr %p + ret void +} + +define void @test_fadd(ptr %p, ptr %q) nounwind { +; LA64-LABEL: test_fadd: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill +; LA64-NEXT: ld.hu $a1, $a1, 0 +; LA64-NEXT: move $fp, $a0 +; LA64-NEXT: ld.hu $a0, $a0, 0 +; LA64-NEXT: slli.d $a1, $a1, 16 +; LA64-NEXT: movgr2fr.w $fa0, $a1 +; LA64-NEXT: slli.d $a0, $a0, 16 +; LA64-NEXT: movgr2fr.w $fa1, $a0 +; LA64-NEXT: fadd.s $fa0, $fa1, $fa0 +; LA64-NEXT: pcaddu18i $ra, %call36(__truncsfbf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: movfr2gr.s $a0, $fa0 +; LA64-NEXT: st.h $a0, $fp, 0 +; LA64-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +; +; LA32-LABEL: test_fadd: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 8 # 4-byte Folded Spill +; LA32-NEXT: ld.hu $a1, $a1, 0 +; LA32-NEXT: move $fp, $a0 +; LA32-NEXT: ld.hu $a0, $a0, 0 +; LA32-NEXT: slli.w $a1, $a1, 16 +; LA32-NEXT: movgr2fr.w $fa0, $a1 +; LA32-NEXT: slli.w $a0, $a0, 16 +; LA32-NEXT: movgr2fr.w $fa1, $a0 +; LA32-NEXT: fadd.s $fa0, $fa1, $fa0 +; LA32-NEXT: bl __truncsfbf2 +; LA32-NEXT: movfr2gr.s $a0, $fa0 +; LA32-NEXT: st.h $a0, $fp, 0 +; LA32-NEXT: ld.w $fp, $sp, 8 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret + %a = load bfloat, ptr %p + %b = load bfloat, ptr %q + %r = fadd bfloat %a, %b + store bfloat %r, ptr %p + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/bf16.ll b/llvm/test/CodeGen/LoongArch/bf16.ll new file mode 100644 index 0000000000000..e580bcc69f52b --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/bf16.ll @@ -0,0 +1,1048 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +;; For `double` parameters and return values, compiling on loongarch32 with `-mattr=+d` and +;; `-target-abi=ilp32s` is 
incompatible, resulting in the error 'Passing f64 with GPR on LA32 is undefined'. +;; Therefore, such case is currently skipped in testing. +; RUN: llc -mtriple=loongarch32 -verify-machineinstrs < %s | FileCheck %s -check-prefix=LA32 +; RUN: llc -mtriple=loongarch64 -verify-machineinstrs < %s | FileCheck %s -check-prefix=LA64 +; RUN: llc -mtriple=loongarch32 -mattr=+f -target-abi=ilp32s -verify-machineinstrs < %s | FileCheck %s -check-prefix=LA32F-ILP32S +; RUN: llc -mtriple=loongarch32 -mattr=+f -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s -check-prefix=LA32F-ILP32D +; RUN: llc -mtriple=loongarch32 -mattr=+d -target-abi=ilp32d -verify-machineinstrs < %s | FileCheck %s -check-prefix=LA32D-ILP32D +; RUN: llc -mtriple=loongarch64 -mattr=+f -target-abi=lp64s -verify-machineinstrs < %s | FileCheck %s -check-prefix=LA64F-LP64S +; RUN: llc -mtriple=loongarch64 -mattr=+f -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s -check-prefix=LA64F-LP64D +; RUN: llc -mtriple=loongarch64 -mattr=+d -target-abi=lp64s -verify-machineinstrs < %s | FileCheck %s -check-prefix=LA64D-LP64S +; RUN: llc -mtriple=loongarch64 -mattr=+d -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s -check-prefix=LA64D-LP64D + +define bfloat @float_to_bfloat(float %a) nounwind { +; LA32-LABEL: float_to_bfloat: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: bl __truncsfbf2 +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: float_to_bfloat: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: pcaddu18i $ra, %call36(__truncsfbf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: movfr2gr.s $a0, $fa0 +; LA64-NEXT: lu12i.w $a1, -16 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: movgr2fr.w $fa0, $a0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +; +; LA32F-ILP32S-LABEL: float_to_bfloat: +; LA32F-ILP32S: # %bb.0: +; LA32F-ILP32S-NEXT: addi.w $sp, $sp, -16 +; LA32F-ILP32S-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32F-ILP32S-NEXT: bl __truncsfbf2 +; LA32F-ILP32S-NEXT: lu12i.w $a1, -16 +; LA32F-ILP32S-NEXT: or $a0, $a0, $a1 +; LA32F-ILP32S-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32F-ILP32S-NEXT: addi.w $sp, $sp, 16 +; LA32F-ILP32S-NEXT: ret +; +; LA32F-ILP32D-LABEL: float_to_bfloat: +; LA32F-ILP32D: # %bb.0: +; LA32F-ILP32D-NEXT: addi.w $sp, $sp, -16 +; LA32F-ILP32D-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32F-ILP32D-NEXT: bl __truncsfbf2 +; LA32F-ILP32D-NEXT: movfr2gr.s $a0, $fa0 +; LA32F-ILP32D-NEXT: lu12i.w $a1, -16 +; LA32F-ILP32D-NEXT: or $a0, $a0, $a1 +; LA32F-ILP32D-NEXT: movgr2fr.w $fa0, $a0 +; LA32F-ILP32D-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32F-ILP32D-NEXT: addi.w $sp, $sp, 16 +; LA32F-ILP32D-NEXT: ret +; +; LA32D-ILP32D-LABEL: float_to_bfloat: +; LA32D-ILP32D: # %bb.0: +; LA32D-ILP32D-NEXT: addi.w $sp, $sp, -16 +; LA32D-ILP32D-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32D-ILP32D-NEXT: bl __truncsfbf2 +; LA32D-ILP32D-NEXT: movfr2gr.s $a0, $fa0 +; LA32D-ILP32D-NEXT: lu12i.w $a1, -16 +; LA32D-ILP32D-NEXT: or $a0, $a0, $a1 +; LA32D-ILP32D-NEXT: movgr2fr.w $fa0, $a0 +; LA32D-ILP32D-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32D-ILP32D-NEXT: addi.w $sp, $sp, 16 +; LA32D-ILP32D-NEXT: ret +; +; LA64F-LP64S-LABEL: float_to_bfloat: +; LA64F-LP64S: # %bb.0: +; LA64F-LP64S-NEXT: 
addi.d $sp, $sp, -16 +; LA64F-LP64S-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64F-LP64S-NEXT: pcaddu18i $ra, %call36(__truncsfbf2) +; LA64F-LP64S-NEXT: jirl $ra, $ra, 0 +; LA64F-LP64S-NEXT: lu12i.w $a1, -16 +; LA64F-LP64S-NEXT: or $a0, $a0, $a1 +; LA64F-LP64S-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64F-LP64S-NEXT: addi.d $sp, $sp, 16 +; LA64F-LP64S-NEXT: ret +; +; LA64F-LP64D-LABEL: float_to_bfloat: +; LA64F-LP64D: # %bb.0: +; LA64F-LP64D-NEXT: addi.d $sp, $sp, -16 +; LA64F-LP64D-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64F-LP64D-NEXT: pcaddu18i $ra, %call36(__truncsfbf2) +; LA64F-LP64D-NEXT: jirl $ra, $ra, 0 +; LA64F-LP64D-NEXT: movfr2gr.s $a0, $fa0 +; LA64F-LP64D-NEXT: lu12i.w $a1, -16 +; LA64F-LP64D-NEXT: or $a0, $a0, $a1 +; LA64F-LP64D-NEXT: movgr2fr.w $fa0, $a0 +; LA64F-LP64D-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64F-LP64D-NEXT: addi.d $sp, $sp, 16 +; LA64F-LP64D-NEXT: ret +; +; LA64D-LP64S-LABEL: float_to_bfloat: +; LA64D-LP64S: # %bb.0: +; LA64D-LP64S-NEXT: addi.d $sp, $sp, -16 +; LA64D-LP64S-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64D-LP64S-NEXT: pcaddu18i $ra, %call36(__truncsfbf2) +; LA64D-LP64S-NEXT: jirl $ra, $ra, 0 +; LA64D-LP64S-NEXT: lu12i.w $a1, -16 +; LA64D-LP64S-NEXT: or $a0, $a0, $a1 +; LA64D-LP64S-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64D-LP64S-NEXT: addi.d $sp, $sp, 16 +; LA64D-LP64S-NEXT: ret +; +; LA64D-LP64D-LABEL: float_to_bfloat: +; LA64D-LP64D: # %bb.0: +; LA64D-LP64D-NEXT: addi.d $sp, $sp, -16 +; LA64D-LP64D-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64D-LP64D-NEXT: pcaddu18i $ra, %call36(__truncsfbf2) +; LA64D-LP64D-NEXT: jirl $ra, $ra, 0 +; LA64D-LP64D-NEXT: movfr2gr.s $a0, $fa0 +; LA64D-LP64D-NEXT: lu12i.w $a1, -16 +; LA64D-LP64D-NEXT: or $a0, $a0, $a1 +; LA64D-LP64D-NEXT: movgr2fr.w $fa0, $a0 +; LA64D-LP64D-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64D-LP64D-NEXT: addi.d $sp, $sp, 16 +; LA64D-LP64D-NEXT: ret + %1 = fptrunc float %a to bfloat + ret bfloat %1 +} + +define bfloat @double_to_bfloat(double %a) nounwind { +; LA32-LABEL: double_to_bfloat: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: bl __truncdfbf2 +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: double_to_bfloat: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: pcaddu18i $ra, %call36(__truncdfbf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: movfr2gr.s $a0, $fa0 +; LA64-NEXT: lu12i.w $a1, -16 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: movgr2fr.w $fa0, $a0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +; +; LA32F-ILP32S-LABEL: double_to_bfloat: +; LA32F-ILP32S: # %bb.0: +; LA32F-ILP32S-NEXT: addi.w $sp, $sp, -16 +; LA32F-ILP32S-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32F-ILP32S-NEXT: bl __truncdfbf2 +; LA32F-ILP32S-NEXT: lu12i.w $a1, -16 +; LA32F-ILP32S-NEXT: or $a0, $a0, $a1 +; LA32F-ILP32S-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32F-ILP32S-NEXT: addi.w $sp, $sp, 16 +; LA32F-ILP32S-NEXT: ret +; +; LA32F-ILP32D-LABEL: double_to_bfloat: +; LA32F-ILP32D: # %bb.0: +; LA32F-ILP32D-NEXT: addi.w $sp, $sp, -16 +; LA32F-ILP32D-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32F-ILP32D-NEXT: bl __truncdfbf2 +; LA32F-ILP32D-NEXT: movfr2gr.s $a0, $fa0 +; LA32F-ILP32D-NEXT: lu12i.w $a1, -16 +; LA32F-ILP32D-NEXT: or $a0, $a0, 
$a1 +; LA32F-ILP32D-NEXT: movgr2fr.w $fa0, $a0 +; LA32F-ILP32D-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32F-ILP32D-NEXT: addi.w $sp, $sp, 16 +; LA32F-ILP32D-NEXT: ret +; +; LA32D-ILP32D-LABEL: double_to_bfloat: +; LA32D-ILP32D: # %bb.0: +; LA32D-ILP32D-NEXT: addi.w $sp, $sp, -16 +; LA32D-ILP32D-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32D-ILP32D-NEXT: bl __truncdfbf2 +; LA32D-ILP32D-NEXT: movfr2gr.s $a0, $fa0 +; LA32D-ILP32D-NEXT: lu12i.w $a1, -16 +; LA32D-ILP32D-NEXT: or $a0, $a0, $a1 +; LA32D-ILP32D-NEXT: movgr2fr.w $fa0, $a0 +; LA32D-ILP32D-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32D-ILP32D-NEXT: addi.w $sp, $sp, 16 +; LA32D-ILP32D-NEXT: ret +; +; LA64F-LP64S-LABEL: double_to_bfloat: +; LA64F-LP64S: # %bb.0: +; LA64F-LP64S-NEXT: addi.d $sp, $sp, -16 +; LA64F-LP64S-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64F-LP64S-NEXT: pcaddu18i $ra, %call36(__truncdfbf2) +; LA64F-LP64S-NEXT: jirl $ra, $ra, 0 +; LA64F-LP64S-NEXT: lu12i.w $a1, -16 +; LA64F-LP64S-NEXT: or $a0, $a0, $a1 +; LA64F-LP64S-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64F-LP64S-NEXT: addi.d $sp, $sp, 16 +; LA64F-LP64S-NEXT: ret +; +; LA64F-LP64D-LABEL: double_to_bfloat: +; LA64F-LP64D: # %bb.0: +; LA64F-LP64D-NEXT: addi.d $sp, $sp, -16 +; LA64F-LP64D-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64F-LP64D-NEXT: pcaddu18i $ra, %call36(__truncdfbf2) +; LA64F-LP64D-NEXT: jirl $ra, $ra, 0 +; LA64F-LP64D-NEXT: movfr2gr.s $a0, $fa0 +; LA64F-LP64D-NEXT: lu12i.w $a1, -16 +; LA64F-LP64D-NEXT: or $a0, $a0, $a1 +; LA64F-LP64D-NEXT: movgr2fr.w $fa0, $a0 +; LA64F-LP64D-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64F-LP64D-NEXT: addi.d $sp, $sp, 16 +; LA64F-LP64D-NEXT: ret +; +; LA64D-LP64S-LABEL: double_to_bfloat: +; LA64D-LP64S: # %bb.0: +; LA64D-LP64S-NEXT: addi.d $sp, $sp, -16 +; LA64D-LP64S-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64D-LP64S-NEXT: pcaddu18i $ra, %call36(__truncdfbf2) +; LA64D-LP64S-NEXT: jirl $ra, $ra, 0 +; LA64D-LP64S-NEXT: lu12i.w $a1, -16 +; LA64D-LP64S-NEXT: or $a0, $a0, $a1 +; LA64D-LP64S-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64D-LP64S-NEXT: addi.d $sp, $sp, 16 +; LA64D-LP64S-NEXT: ret +; +; LA64D-LP64D-LABEL: double_to_bfloat: +; LA64D-LP64D: # %bb.0: +; LA64D-LP64D-NEXT: addi.d $sp, $sp, -16 +; LA64D-LP64D-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64D-LP64D-NEXT: pcaddu18i $ra, %call36(__truncdfbf2) +; LA64D-LP64D-NEXT: jirl $ra, $ra, 0 +; LA64D-LP64D-NEXT: movfr2gr.s $a0, $fa0 +; LA64D-LP64D-NEXT: lu12i.w $a1, -16 +; LA64D-LP64D-NEXT: or $a0, $a0, $a1 +; LA64D-LP64D-NEXT: movgr2fr.w $fa0, $a0 +; LA64D-LP64D-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64D-LP64D-NEXT: addi.d $sp, $sp, 16 +; LA64D-LP64D-NEXT: ret + %1 = fptrunc double %a to bfloat + ret bfloat %1 +} + +define float @bfloat_to_float(bfloat %a) nounwind { +; LA32-LABEL: bfloat_to_float: +; LA32: # %bb.0: +; LA32-NEXT: slli.w $a0, $a0, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: bfloat_to_float: +; LA64: # %bb.0: +; LA64-NEXT: movfr2gr.s $a0, $fa0 +; LA64-NEXT: slli.d $a0, $a0, 16 +; LA64-NEXT: movgr2fr.w $fa0, $a0 +; LA64-NEXT: ret +; +; LA32F-ILP32S-LABEL: bfloat_to_float: +; LA32F-ILP32S: # %bb.0: +; LA32F-ILP32S-NEXT: slli.w $a0, $a0, 16 +; LA32F-ILP32S-NEXT: ret +; +; LA32F-ILP32D-LABEL: bfloat_to_float: +; LA32F-ILP32D: # %bb.0: +; LA32F-ILP32D-NEXT: movfr2gr.s $a0, $fa0 +; LA32F-ILP32D-NEXT: slli.w $a0, $a0, 16 +; LA32F-ILP32D-NEXT: movgr2fr.w $fa0, $a0 +; LA32F-ILP32D-NEXT: ret +; +; LA32D-ILP32D-LABEL: bfloat_to_float: +; LA32D-ILP32D: # %bb.0: +; 
LA32D-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32D-ILP32D-NEXT: slli.w $a0, $a0, 16
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa0, $a0
+; LA32D-ILP32D-NEXT: ret
+;
+; LA64F-LP64S-LABEL: bfloat_to_float:
+; LA64F-LP64S: # %bb.0:
+; LA64F-LP64S-NEXT: slli.d $a0, $a0, 16
+; LA64F-LP64S-NEXT: ret
+;
+; LA64F-LP64D-LABEL: bfloat_to_float:
+; LA64F-LP64D: # %bb.0:
+; LA64F-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64F-LP64D-NEXT: slli.d $a0, $a0, 16
+; LA64F-LP64D-NEXT: movgr2fr.w $fa0, $a0
+; LA64F-LP64D-NEXT: ret
+;
+; LA64D-LP64S-LABEL: bfloat_to_float:
+; LA64D-LP64S: # %bb.0:
+; LA64D-LP64S-NEXT: slli.d $a0, $a0, 16
+; LA64D-LP64S-NEXT: ret
+;
+; LA64D-LP64D-LABEL: bfloat_to_float:
+; LA64D-LP64D: # %bb.0:
+; LA64D-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64D-LP64D-NEXT: slli.d $a0, $a0, 16
+; LA64D-LP64D-NEXT: movgr2fr.w $fa0, $a0
+; LA64D-LP64D-NEXT: ret
+  %1 = fpext bfloat %a to float
+  ret float %1
+}
+
+define double @bfloat_to_double(bfloat %a) nounwind {
+; LA32-LABEL: bfloat_to_double:
+; LA32: # %bb.0:
+; LA32-NEXT: addi.w $sp, $sp, -16
+; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32-NEXT: slli.w $a0, $a0, 16
+; LA32-NEXT: bl __extendsfdf2
+; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 16
+; LA32-NEXT: ret
+;
+; LA64-LABEL: bfloat_to_double:
+; LA64: # %bb.0:
+; LA64-NEXT: movfr2gr.s $a0, $fa0
+; LA64-NEXT: slli.d $a0, $a0, 16
+; LA64-NEXT: movgr2fr.w $fa0, $a0
+; LA64-NEXT: fcvt.d.s $fa0, $fa0
+; LA64-NEXT: ret
+;
+; LA32F-ILP32S-LABEL: bfloat_to_double:
+; LA32F-ILP32S: # %bb.0:
+; LA32F-ILP32S-NEXT: addi.w $sp, $sp, -16
+; LA32F-ILP32S-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32F-ILP32S-NEXT: slli.w $a0, $a0, 16
+; LA32F-ILP32S-NEXT: bl __extendsfdf2
+; LA32F-ILP32S-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32F-ILP32S-NEXT: addi.w $sp, $sp, 16
+; LA32F-ILP32S-NEXT: ret
+;
+; LA32F-ILP32D-LABEL: bfloat_to_double:
+; LA32F-ILP32D: # %bb.0:
+; LA32F-ILP32D-NEXT: addi.w $sp, $sp, -16
+; LA32F-ILP32D-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32F-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32F-ILP32D-NEXT: slli.w $a0, $a0, 16
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa0, $a0
+; LA32F-ILP32D-NEXT: bl __extendsfdf2
+; LA32F-ILP32D-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32F-ILP32D-NEXT: addi.w $sp, $sp, 16
+; LA32F-ILP32D-NEXT: ret
+;
+; LA32D-ILP32D-LABEL: bfloat_to_double:
+; LA32D-ILP32D: # %bb.0:
+; LA32D-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32D-ILP32D-NEXT: slli.w $a0, $a0, 16
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa0, $a0
+; LA32D-ILP32D-NEXT: fcvt.d.s $fa0, $fa0
+; LA32D-ILP32D-NEXT: ret
+;
+; LA64F-LP64S-LABEL: bfloat_to_double:
+; LA64F-LP64S: # %bb.0:
+; LA64F-LP64S-NEXT: slli.d $a0, $a0, 16
+; LA64F-LP64S-NEXT: movgr2fr.w $fa0, $a0
+; LA64F-LP64S-NEXT: fcvt.d.s $fa0, $fa0
+; LA64F-LP64S-NEXT: movfr2gr.d $a0, $fa0
+; LA64F-LP64S-NEXT: ret
+;
+; LA64F-LP64D-LABEL: bfloat_to_double:
+; LA64F-LP64D: # %bb.0:
+; LA64F-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64F-LP64D-NEXT: slli.d $a0, $a0, 16
+; LA64F-LP64D-NEXT: movgr2fr.w $fa0, $a0
+; LA64F-LP64D-NEXT: fcvt.d.s $fa0, $fa0
+; LA64F-LP64D-NEXT: ret
+;
+; LA64D-LP64S-LABEL: bfloat_to_double:
+; LA64D-LP64S: # %bb.0:
+; LA64D-LP64S-NEXT: slli.d $a0, $a0, 16
+; LA64D-LP64S-NEXT: movgr2fr.w $fa0, $a0
+; LA64D-LP64S-NEXT: fcvt.d.s $fa0, $fa0
+; LA64D-LP64S-NEXT: movfr2gr.d $a0, $fa0
+; LA64D-LP64S-NEXT: ret
+;
+; LA64D-LP64D-LABEL: bfloat_to_double:
+; LA64D-LP64D: # %bb.0:
+; LA64D-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64D-LP64D-NEXT: slli.d $a0, $a0, 16
+; LA64D-LP64D-NEXT: movgr2fr.w $fa0, $a0
+; LA64D-LP64D-NEXT: fcvt.d.s $fa0, $fa0
+; LA64D-LP64D-NEXT: ret
+  %1 = fpext bfloat %a to double
+  ret double %1
+}
+
+define bfloat @i16_to_bfloat(i16 %a) nounwind {
+; LA32-LABEL: i16_to_bfloat:
+; LA32: # %bb.0:
+; LA32-NEXT: ret
+;
+; LA64-LABEL: i16_to_bfloat:
+; LA64: # %bb.0:
+; LA64-NEXT: lu12i.w $a1, -16
+; LA64-NEXT: or $a0, $a0, $a1
+; LA64-NEXT: movgr2fr.w $fa0, $a0
+; LA64-NEXT: ret
+;
+; LA32F-ILP32S-LABEL: i16_to_bfloat:
+; LA32F-ILP32S: # %bb.0:
+; LA32F-ILP32S-NEXT: lu12i.w $a1, -16
+; LA32F-ILP32S-NEXT: or $a0, $a0, $a1
+; LA32F-ILP32S-NEXT: ret
+;
+; LA32F-ILP32D-LABEL: i16_to_bfloat:
+; LA32F-ILP32D: # %bb.0:
+; LA32F-ILP32D-NEXT: lu12i.w $a1, -16
+; LA32F-ILP32D-NEXT: or $a0, $a0, $a1
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa0, $a0
+; LA32F-ILP32D-NEXT: ret
+;
+; LA32D-ILP32D-LABEL: i16_to_bfloat:
+; LA32D-ILP32D: # %bb.0:
+; LA32D-ILP32D-NEXT: lu12i.w $a1, -16
+; LA32D-ILP32D-NEXT: or $a0, $a0, $a1
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa0, $a0
+; LA32D-ILP32D-NEXT: ret
+;
+; LA64F-LP64S-LABEL: i16_to_bfloat:
+; LA64F-LP64S: # %bb.0:
+; LA64F-LP64S-NEXT: lu12i.w $a1, -16
+; LA64F-LP64S-NEXT: or $a0, $a0, $a1
+; LA64F-LP64S-NEXT: ret
+;
+; LA64F-LP64D-LABEL: i16_to_bfloat:
+; LA64F-LP64D: # %bb.0:
+; LA64F-LP64D-NEXT: lu12i.w $a1, -16
+; LA64F-LP64D-NEXT: or $a0, $a0, $a1
+; LA64F-LP64D-NEXT: movgr2fr.w $fa0, $a0
+; LA64F-LP64D-NEXT: ret
+;
+; LA64D-LP64S-LABEL: i16_to_bfloat:
+; LA64D-LP64S: # %bb.0:
+; LA64D-LP64S-NEXT: lu12i.w $a1, -16
+; LA64D-LP64S-NEXT: or $a0, $a0, $a1
+; LA64D-LP64S-NEXT: ret
+;
+; LA64D-LP64D-LABEL: i16_to_bfloat:
+; LA64D-LP64D: # %bb.0:
+; LA64D-LP64D-NEXT: lu12i.w $a1, -16
+; LA64D-LP64D-NEXT: or $a0, $a0, $a1
+; LA64D-LP64D-NEXT: movgr2fr.w $fa0, $a0
+; LA64D-LP64D-NEXT: ret
+  %1 = bitcast i16 %a to bfloat
+  ret bfloat %1
+}
+
+define i16 @bfloat_to_i16(bfloat %a) nounwind {
+; LA32-LABEL: bfloat_to_i16:
+; LA32: # %bb.0:
+; LA32-NEXT: ret
+;
+; LA64-LABEL: bfloat_to_i16:
+; LA64: # %bb.0:
+; LA64-NEXT: movfr2gr.s $a0, $fa0
+; LA64-NEXT: ret
+;
+; LA32F-ILP32S-LABEL: bfloat_to_i16:
+; LA32F-ILP32S: # %bb.0:
+; LA32F-ILP32S-NEXT: ret
+;
+; LA32F-ILP32D-LABEL: bfloat_to_i16:
+; LA32F-ILP32D: # %bb.0:
+; LA32F-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32F-ILP32D-NEXT: ret
+;
+; LA32D-ILP32D-LABEL: bfloat_to_i16:
+; LA32D-ILP32D: # %bb.0:
+; LA32D-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32D-ILP32D-NEXT: ret
+;
+; LA64F-LP64S-LABEL: bfloat_to_i16:
+; LA64F-LP64S: # %bb.0:
+; LA64F-LP64S-NEXT: ret
+;
+; LA64F-LP64D-LABEL: bfloat_to_i16:
+; LA64F-LP64D: # %bb.0:
+; LA64F-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64F-LP64D-NEXT: ret
+;
+; LA64D-LP64S-LABEL: bfloat_to_i16:
+; LA64D-LP64S: # %bb.0:
+; LA64D-LP64S-NEXT: ret
+;
+; LA64D-LP64D-LABEL: bfloat_to_i16:
+; LA64D-LP64D: # %bb.0:
+; LA64D-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64D-LP64D-NEXT: ret
+  %1 = bitcast bfloat %a to i16
+  ret i16 %1
+}
+
+define bfloat @bfloat_add(bfloat %a, bfloat %b) nounwind {
+; LA32-LABEL: bfloat_add:
+; LA32: # %bb.0:
+; LA32-NEXT: addi.w $sp, $sp, -16
+; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32-NEXT: slli.w $a0, $a0, 16
+; LA32-NEXT: slli.w $a1, $a1, 16
+; LA32-NEXT: bl __addsf3
+; LA32-NEXT: bl __truncsfbf2
+; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 16
+; LA32-NEXT: ret
+;
+; LA64-LABEL: bfloat_add:
+; LA64: # %bb.0:
+; LA64-NEXT: addi.d $sp, $sp, -16
+; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64-NEXT: movfr2gr.s $a0, $fa0
+; LA64-NEXT: movfr2gr.s $a1, $fa1
+; LA64-NEXT: slli.d $a1, $a1, 16
+; LA64-NEXT: movgr2fr.w $fa0, $a1
+; LA64-NEXT: slli.d $a0, $a0, 16
+; LA64-NEXT: movgr2fr.w $fa1, $a0
+; LA64-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64-NEXT: jirl $ra, $ra, 0
+; LA64-NEXT: movfr2gr.s $a0, $fa0
+; LA64-NEXT: lu12i.w $a1, -16
+; LA64-NEXT: or $a0, $a0, $a1
+; LA64-NEXT: movgr2fr.w $fa0, $a0
+; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 16
+; LA64-NEXT: ret
+;
+; LA32F-ILP32S-LABEL: bfloat_add:
+; LA32F-ILP32S: # %bb.0:
+; LA32F-ILP32S-NEXT: addi.w $sp, $sp, -16
+; LA32F-ILP32S-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32F-ILP32S-NEXT: slli.w $a1, $a1, 16
+; LA32F-ILP32S-NEXT: movgr2fr.w $fa0, $a1
+; LA32F-ILP32S-NEXT: slli.w $a0, $a0, 16
+; LA32F-ILP32S-NEXT: movgr2fr.w $fa1, $a0
+; LA32F-ILP32S-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA32F-ILP32S-NEXT: movfr2gr.s $a0, $fa0
+; LA32F-ILP32S-NEXT: bl __truncsfbf2
+; LA32F-ILP32S-NEXT: lu12i.w $a1, -16
+; LA32F-ILP32S-NEXT: or $a0, $a0, $a1
+; LA32F-ILP32S-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32F-ILP32S-NEXT: addi.w $sp, $sp, 16
+; LA32F-ILP32S-NEXT: ret
+;
+; LA32F-ILP32D-LABEL: bfloat_add:
+; LA32F-ILP32D: # %bb.0:
+; LA32F-ILP32D-NEXT: addi.w $sp, $sp, -16
+; LA32F-ILP32D-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32F-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32F-ILP32D-NEXT: movfr2gr.s $a1, $fa1
+; LA32F-ILP32D-NEXT: slli.w $a1, $a1, 16
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa0, $a1
+; LA32F-ILP32D-NEXT: slli.w $a0, $a0, 16
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa1, $a0
+; LA32F-ILP32D-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA32F-ILP32D-NEXT: bl __truncsfbf2
+; LA32F-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32F-ILP32D-NEXT: lu12i.w $a1, -16
+; LA32F-ILP32D-NEXT: or $a0, $a0, $a1
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa0, $a0
+; LA32F-ILP32D-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32F-ILP32D-NEXT: addi.w $sp, $sp, 16
+; LA32F-ILP32D-NEXT: ret
+;
+; LA32D-ILP32D-LABEL: bfloat_add:
+; LA32D-ILP32D: # %bb.0:
+; LA32D-ILP32D-NEXT: addi.w $sp, $sp, -16
+; LA32D-ILP32D-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32D-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32D-ILP32D-NEXT: movfr2gr.s $a1, $fa1
+; LA32D-ILP32D-NEXT: slli.w $a1, $a1, 16
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa0, $a1
+; LA32D-ILP32D-NEXT: slli.w $a0, $a0, 16
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa1, $a0
+; LA32D-ILP32D-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA32D-ILP32D-NEXT: bl __truncsfbf2
+; LA32D-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32D-ILP32D-NEXT: lu12i.w $a1, -16
+; LA32D-ILP32D-NEXT: or $a0, $a0, $a1
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa0, $a0
+; LA32D-ILP32D-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32D-ILP32D-NEXT: addi.w $sp, $sp, 16
+; LA32D-ILP32D-NEXT: ret
+;
+; LA64F-LP64S-LABEL: bfloat_add:
+; LA64F-LP64S: # %bb.0:
+; LA64F-LP64S-NEXT: addi.d $sp, $sp, -16
+; LA64F-LP64S-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64F-LP64S-NEXT: slli.d $a1, $a1, 16
+; LA64F-LP64S-NEXT: movgr2fr.w $fa0, $a1
+; LA64F-LP64S-NEXT: slli.d $a0, $a0, 16
+; LA64F-LP64S-NEXT: movgr2fr.w $fa1, $a0
+; LA64F-LP64S-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64F-LP64S-NEXT: movfr2gr.s $a0, $fa0
+; LA64F-LP64S-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64F-LP64S-NEXT: jirl $ra, $ra, 0
+; LA64F-LP64S-NEXT: lu12i.w $a1, -16
+; LA64F-LP64S-NEXT: or $a0, $a0, $a1
+; LA64F-LP64S-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64F-LP64S-NEXT: addi.d $sp, $sp, 16
+; LA64F-LP64S-NEXT: ret
+;
+; LA64F-LP64D-LABEL: bfloat_add:
+; LA64F-LP64D: # %bb.0:
+; LA64F-LP64D-NEXT: addi.d $sp, $sp, -16
+; LA64F-LP64D-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64F-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64F-LP64D-NEXT: movfr2gr.s $a1, $fa1
+; LA64F-LP64D-NEXT: slli.d $a1, $a1, 16
+; LA64F-LP64D-NEXT: movgr2fr.w $fa0, $a1
+; LA64F-LP64D-NEXT: slli.d $a0, $a0, 16
+; LA64F-LP64D-NEXT: movgr2fr.w $fa1, $a0
+; LA64F-LP64D-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64F-LP64D-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64F-LP64D-NEXT: jirl $ra, $ra, 0
+; LA64F-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64F-LP64D-NEXT: lu12i.w $a1, -16
+; LA64F-LP64D-NEXT: or $a0, $a0, $a1
+; LA64F-LP64D-NEXT: movgr2fr.w $fa0, $a0
+; LA64F-LP64D-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64F-LP64D-NEXT: addi.d $sp, $sp, 16
+; LA64F-LP64D-NEXT: ret
+;
+; LA64D-LP64S-LABEL: bfloat_add:
+; LA64D-LP64S: # %bb.0:
+; LA64D-LP64S-NEXT: addi.d $sp, $sp, -16
+; LA64D-LP64S-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64D-LP64S-NEXT: slli.d $a1, $a1, 16
+; LA64D-LP64S-NEXT: movgr2fr.w $fa0, $a1
+; LA64D-LP64S-NEXT: slli.d $a0, $a0, 16
+; LA64D-LP64S-NEXT: movgr2fr.w $fa1, $a0
+; LA64D-LP64S-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64D-LP64S-NEXT: movfr2gr.s $a0, $fa0
+; LA64D-LP64S-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64D-LP64S-NEXT: jirl $ra, $ra, 0
+; LA64D-LP64S-NEXT: lu12i.w $a1, -16
+; LA64D-LP64S-NEXT: or $a0, $a0, $a1
+; LA64D-LP64S-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64D-LP64S-NEXT: addi.d $sp, $sp, 16
+; LA64D-LP64S-NEXT: ret
+;
+; LA64D-LP64D-LABEL: bfloat_add:
+; LA64D-LP64D: # %bb.0:
+; LA64D-LP64D-NEXT: addi.d $sp, $sp, -16
+; LA64D-LP64D-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64D-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64D-LP64D-NEXT: movfr2gr.s $a1, $fa1
+; LA64D-LP64D-NEXT: slli.d $a1, $a1, 16
+; LA64D-LP64D-NEXT: movgr2fr.w $fa0, $a1
+; LA64D-LP64D-NEXT: slli.d $a0, $a0, 16
+; LA64D-LP64D-NEXT: movgr2fr.w $fa1, $a0
+; LA64D-LP64D-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64D-LP64D-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64D-LP64D-NEXT: jirl $ra, $ra, 0
+; LA64D-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64D-LP64D-NEXT: lu12i.w $a1, -16
+; LA64D-LP64D-NEXT: or $a0, $a0, $a1
+; LA64D-LP64D-NEXT: movgr2fr.w $fa0, $a0
+; LA64D-LP64D-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64D-LP64D-NEXT: addi.d $sp, $sp, 16
+; LA64D-LP64D-NEXT: ret
+  %1 = fadd bfloat %a, %b
+  ret bfloat %1
+}
+
+define bfloat @bfloat_load(ptr %a) nounwind {
+; LA32-LABEL: bfloat_load:
+; LA32: # %bb.0:
+; LA32-NEXT: addi.w $sp, $sp, -16
+; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32-NEXT: ld.h $a1, $a0, 0
+; LA32-NEXT: ld.h $a2, $a0, 6
+; LA32-NEXT: slli.w $a0, $a1, 16
+; LA32-NEXT: slli.w $a1, $a2, 16
+; LA32-NEXT: bl __addsf3
+; LA32-NEXT: bl __truncsfbf2
+; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 16
+; LA32-NEXT: ret
+;
+; LA64-LABEL: bfloat_load:
+; LA64: # %bb.0:
+; LA64-NEXT: addi.d $sp, $sp, -16
+; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64-NEXT: ld.hu $a1, $a0, 6
+; LA64-NEXT: ld.hu $a0, $a0, 0
+; LA64-NEXT: slli.d $a1, $a1, 16
+; LA64-NEXT: movgr2fr.w $fa0, $a1
+; LA64-NEXT: slli.d $a0, $a0, 16
+; LA64-NEXT: movgr2fr.w $fa1, $a0
+; LA64-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64-NEXT: jirl $ra, $ra, 0
+; LA64-NEXT: movfr2gr.s $a0, $fa0
+; LA64-NEXT: lu12i.w $a1, -16
+; LA64-NEXT: or $a0, $a0, $a1
+; LA64-NEXT: movgr2fr.w $fa0, $a0
+; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 16
+; LA64-NEXT: ret
+;
+; LA32F-ILP32S-LABEL: bfloat_load:
+; LA32F-ILP32S: # %bb.0:
+; LA32F-ILP32S-NEXT: addi.w $sp, $sp, -16
+; LA32F-ILP32S-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32F-ILP32S-NEXT: ld.hu $a1, $a0, 6
+; LA32F-ILP32S-NEXT: ld.hu $a0, $a0, 0
+; LA32F-ILP32S-NEXT: slli.w $a1, $a1, 16
+; LA32F-ILP32S-NEXT: movgr2fr.w $fa0, $a1
+; LA32F-ILP32S-NEXT: slli.w $a0, $a0, 16
+; LA32F-ILP32S-NEXT: movgr2fr.w $fa1, $a0
+; LA32F-ILP32S-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA32F-ILP32S-NEXT: movfr2gr.s $a0, $fa0
+; LA32F-ILP32S-NEXT: bl __truncsfbf2
+; LA32F-ILP32S-NEXT: lu12i.w $a1, -16
+; LA32F-ILP32S-NEXT: or $a0, $a0, $a1
+; LA32F-ILP32S-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32F-ILP32S-NEXT: addi.w $sp, $sp, 16
+; LA32F-ILP32S-NEXT: ret
+;
+; LA32F-ILP32D-LABEL: bfloat_load:
+; LA32F-ILP32D: # %bb.0:
+; LA32F-ILP32D-NEXT: addi.w $sp, $sp, -16
+; LA32F-ILP32D-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32F-ILP32D-NEXT: ld.hu $a1, $a0, 6
+; LA32F-ILP32D-NEXT: ld.hu $a0, $a0, 0
+; LA32F-ILP32D-NEXT: slli.w $a1, $a1, 16
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa0, $a1
+; LA32F-ILP32D-NEXT: slli.w $a0, $a0, 16
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa1, $a0
+; LA32F-ILP32D-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA32F-ILP32D-NEXT: bl __truncsfbf2
+; LA32F-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32F-ILP32D-NEXT: lu12i.w $a1, -16
+; LA32F-ILP32D-NEXT: or $a0, $a0, $a1
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa0, $a0
+; LA32F-ILP32D-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32F-ILP32D-NEXT: addi.w $sp, $sp, 16
+; LA32F-ILP32D-NEXT: ret
+;
+; LA32D-ILP32D-LABEL: bfloat_load:
+; LA32D-ILP32D: # %bb.0:
+; LA32D-ILP32D-NEXT: addi.w $sp, $sp, -16
+; LA32D-ILP32D-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32D-ILP32D-NEXT: ld.hu $a1, $a0, 6
+; LA32D-ILP32D-NEXT: ld.hu $a0, $a0, 0
+; LA32D-ILP32D-NEXT: slli.w $a1, $a1, 16
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa0, $a1
+; LA32D-ILP32D-NEXT: slli.w $a0, $a0, 16
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa1, $a0
+; LA32D-ILP32D-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA32D-ILP32D-NEXT: bl __truncsfbf2
+; LA32D-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32D-ILP32D-NEXT: lu12i.w $a1, -16
+; LA32D-ILP32D-NEXT: or $a0, $a0, $a1
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa0, $a0
+; LA32D-ILP32D-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32D-ILP32D-NEXT: addi.w $sp, $sp, 16
+; LA32D-ILP32D-NEXT: ret
+;
+; LA64F-LP64S-LABEL: bfloat_load:
+; LA64F-LP64S: # %bb.0:
+; LA64F-LP64S-NEXT: addi.d $sp, $sp, -16
+; LA64F-LP64S-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64F-LP64S-NEXT: ld.hu $a1, $a0, 6
+; LA64F-LP64S-NEXT: ld.hu $a0, $a0, 0
+; LA64F-LP64S-NEXT: slli.d $a1, $a1, 16
+; LA64F-LP64S-NEXT: movgr2fr.w $fa0, $a1
+; LA64F-LP64S-NEXT: slli.d $a0, $a0, 16
+; LA64F-LP64S-NEXT: movgr2fr.w $fa1, $a0
+; LA64F-LP64S-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64F-LP64S-NEXT: movfr2gr.s $a0, $fa0
+; LA64F-LP64S-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64F-LP64S-NEXT: jirl $ra, $ra, 0
+; LA64F-LP64S-NEXT: lu12i.w $a1, -16
+; LA64F-LP64S-NEXT: or $a0, $a0, $a1
+; LA64F-LP64S-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64F-LP64S-NEXT: addi.d $sp, $sp, 16
+; LA64F-LP64S-NEXT: ret
+;
+; LA64F-LP64D-LABEL: bfloat_load:
+; LA64F-LP64D: # %bb.0:
+; LA64F-LP64D-NEXT: addi.d $sp, $sp, -16
+; LA64F-LP64D-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64F-LP64D-NEXT: ld.hu $a1, $a0, 6
+; LA64F-LP64D-NEXT: ld.hu $a0, $a0, 0
+; LA64F-LP64D-NEXT: slli.d $a1, $a1, 16
+; LA64F-LP64D-NEXT: movgr2fr.w $fa0, $a1
+; LA64F-LP64D-NEXT: slli.d $a0, $a0, 16
+; LA64F-LP64D-NEXT: movgr2fr.w $fa1, $a0
+; LA64F-LP64D-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64F-LP64D-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64F-LP64D-NEXT: jirl $ra, $ra, 0
+; LA64F-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64F-LP64D-NEXT: lu12i.w $a1, -16
+; LA64F-LP64D-NEXT: or $a0, $a0, $a1
+; LA64F-LP64D-NEXT: movgr2fr.w $fa0, $a0
+; LA64F-LP64D-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64F-LP64D-NEXT: addi.d $sp, $sp, 16
+; LA64F-LP64D-NEXT: ret
+;
+; LA64D-LP64S-LABEL: bfloat_load:
+; LA64D-LP64S: # %bb.0:
+; LA64D-LP64S-NEXT: addi.d $sp, $sp, -16
+; LA64D-LP64S-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64D-LP64S-NEXT: ld.hu $a1, $a0, 6
+; LA64D-LP64S-NEXT: ld.hu $a0, $a0, 0
+; LA64D-LP64S-NEXT: slli.d $a1, $a1, 16
+; LA64D-LP64S-NEXT: movgr2fr.w $fa0, $a1
+; LA64D-LP64S-NEXT: slli.d $a0, $a0, 16
+; LA64D-LP64S-NEXT: movgr2fr.w $fa1, $a0
+; LA64D-LP64S-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64D-LP64S-NEXT: movfr2gr.s $a0, $fa0
+; LA64D-LP64S-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64D-LP64S-NEXT: jirl $ra, $ra, 0
+; LA64D-LP64S-NEXT: lu12i.w $a1, -16
+; LA64D-LP64S-NEXT: or $a0, $a0, $a1
+; LA64D-LP64S-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64D-LP64S-NEXT: addi.d $sp, $sp, 16
+; LA64D-LP64S-NEXT: ret
+;
+; LA64D-LP64D-LABEL: bfloat_load:
+; LA64D-LP64D: # %bb.0:
+; LA64D-LP64D-NEXT: addi.d $sp, $sp, -16
+; LA64D-LP64D-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64D-LP64D-NEXT: ld.hu $a1, $a0, 6
+; LA64D-LP64D-NEXT: ld.hu $a0, $a0, 0
+; LA64D-LP64D-NEXT: slli.d $a1, $a1, 16
+; LA64D-LP64D-NEXT: movgr2fr.w $fa0, $a1
+; LA64D-LP64D-NEXT: slli.d $a0, $a0, 16
+; LA64D-LP64D-NEXT: movgr2fr.w $fa1, $a0
+; LA64D-LP64D-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64D-LP64D-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64D-LP64D-NEXT: jirl $ra, $ra, 0
+; LA64D-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64D-LP64D-NEXT: lu12i.w $a1, -16
+; LA64D-LP64D-NEXT: or $a0, $a0, $a1
+; LA64D-LP64D-NEXT: movgr2fr.w $fa0, $a0
+; LA64D-LP64D-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64D-LP64D-NEXT: addi.d $sp, $sp, 16
+; LA64D-LP64D-NEXT: ret
+  %1 = load bfloat, ptr %a
+  %2 = getelementptr bfloat, ptr %a, i32 3
+  %3 = load bfloat, ptr %2
+  %4 = fadd bfloat %1, %3
+  ret bfloat %4
+}
+
+define void @bfloat_store(ptr %a, bfloat %b, bfloat %c) nounwind {
+; LA32-LABEL: bfloat_store:
+; LA32: # %bb.0:
+; LA32-NEXT: addi.w $sp, $sp, -16
+; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 8 # 4-byte Folded Spill
+; LA32-NEXT: move $fp, $a0
+; LA32-NEXT: slli.w $a0, $a1, 16
+; LA32-NEXT: slli.w $a1, $a2, 16
+; LA32-NEXT: bl __addsf3
+; LA32-NEXT: bl __truncsfbf2
+; LA32-NEXT: st.h $a0, $fp, 0
+; LA32-NEXT: st.h $a0, $fp, 16
+; LA32-NEXT: ld.w $fp, $sp, 8 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 16
+; LA32-NEXT: ret
+;
+; LA64-LABEL: bfloat_store:
+; LA64: # %bb.0:
+; LA64-NEXT: addi.d $sp, $sp, -16
+; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill
+; LA64-NEXT: move $fp, $a0
+; LA64-NEXT: movfr2gr.s $a0, $fa0
+; LA64-NEXT: movfr2gr.s $a1, $fa1
+; LA64-NEXT: slli.d $a1, $a1, 16
+; LA64-NEXT: movgr2fr.w $fa0, $a1
+; LA64-NEXT: slli.d $a0, $a0, 16
+; LA64-NEXT: movgr2fr.w $fa1, $a0
+; LA64-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64-NEXT: jirl $ra, $ra, 0
+; LA64-NEXT: movfr2gr.s $a0, $fa0
+; LA64-NEXT: st.h $a0, $fp, 0
+; LA64-NEXT: st.h $a0, $fp, 16
+; LA64-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 16
+; LA64-NEXT: ret
+;
+; LA32F-ILP32S-LABEL: bfloat_store:
+; LA32F-ILP32S: # %bb.0:
+; LA32F-ILP32S-NEXT: addi.w $sp, $sp, -16
+; LA32F-ILP32S-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32F-ILP32S-NEXT: st.w $fp, $sp, 8 # 4-byte Folded Spill
+; LA32F-ILP32S-NEXT: move $fp, $a0
+; LA32F-ILP32S-NEXT: slli.w $a0, $a2, 16
+; LA32F-ILP32S-NEXT: movgr2fr.w $fa0, $a0
+; LA32F-ILP32S-NEXT: slli.w $a0, $a1, 16
+; LA32F-ILP32S-NEXT: movgr2fr.w $fa1, $a0
+; LA32F-ILP32S-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA32F-ILP32S-NEXT: movfr2gr.s $a0, $fa0
+; LA32F-ILP32S-NEXT: bl __truncsfbf2
+; LA32F-ILP32S-NEXT: st.h $a0, $fp, 0
+; LA32F-ILP32S-NEXT: st.h $a0, $fp, 16
+; LA32F-ILP32S-NEXT: ld.w $fp, $sp, 8 # 4-byte Folded Reload
+; LA32F-ILP32S-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32F-ILP32S-NEXT: addi.w $sp, $sp, 16
+; LA32F-ILP32S-NEXT: ret
+;
+; LA32F-ILP32D-LABEL: bfloat_store:
+; LA32F-ILP32D: # %bb.0:
+; LA32F-ILP32D-NEXT: addi.w $sp, $sp, -16
+; LA32F-ILP32D-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32F-ILP32D-NEXT: st.w $fp, $sp, 8 # 4-byte Folded Spill
+; LA32F-ILP32D-NEXT: move $fp, $a0
+; LA32F-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32F-ILP32D-NEXT: movfr2gr.s $a1, $fa1
+; LA32F-ILP32D-NEXT: slli.w $a1, $a1, 16
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa0, $a1
+; LA32F-ILP32D-NEXT: slli.w $a0, $a0, 16
+; LA32F-ILP32D-NEXT: movgr2fr.w $fa1, $a0
+; LA32F-ILP32D-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA32F-ILP32D-NEXT: bl __truncsfbf2
+; LA32F-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32F-ILP32D-NEXT: st.h $a0, $fp, 0
+; LA32F-ILP32D-NEXT: st.h $a0, $fp, 16
+; LA32F-ILP32D-NEXT: ld.w $fp, $sp, 8 # 4-byte Folded Reload
+; LA32F-ILP32D-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32F-ILP32D-NEXT: addi.w $sp, $sp, 16
+; LA32F-ILP32D-NEXT: ret
+;
+; LA32D-ILP32D-LABEL: bfloat_store:
+; LA32D-ILP32D: # %bb.0:
+; LA32D-ILP32D-NEXT: addi.w $sp, $sp, -16
+; LA32D-ILP32D-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32D-ILP32D-NEXT: st.w $fp, $sp, 8 # 4-byte Folded Spill
+; LA32D-ILP32D-NEXT: move $fp, $a0
+; LA32D-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32D-ILP32D-NEXT: movfr2gr.s $a1, $fa1
+; LA32D-ILP32D-NEXT: slli.w $a1, $a1, 16
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa0, $a1
+; LA32D-ILP32D-NEXT: slli.w $a0, $a0, 16
+; LA32D-ILP32D-NEXT: movgr2fr.w $fa1, $a0
+; LA32D-ILP32D-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA32D-ILP32D-NEXT: bl __truncsfbf2
+; LA32D-ILP32D-NEXT: movfr2gr.s $a0, $fa0
+; LA32D-ILP32D-NEXT: st.h $a0, $fp, 0
+; LA32D-ILP32D-NEXT: st.h $a0, $fp, 16
+; LA32D-ILP32D-NEXT: ld.w $fp, $sp, 8 # 4-byte Folded Reload
+; LA32D-ILP32D-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32D-ILP32D-NEXT: addi.w $sp, $sp, 16
+; LA32D-ILP32D-NEXT: ret
+;
+; LA64F-LP64S-LABEL: bfloat_store:
+; LA64F-LP64S: # %bb.0:
+; LA64F-LP64S-NEXT: addi.d $sp, $sp, -16
+; LA64F-LP64S-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64F-LP64S-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill
+; LA64F-LP64S-NEXT: move $fp, $a0
+; LA64F-LP64S-NEXT: slli.d $a0, $a2, 16
+; LA64F-LP64S-NEXT: movgr2fr.w $fa0, $a0
+; LA64F-LP64S-NEXT: slli.d $a0, $a1, 16
+; LA64F-LP64S-NEXT: movgr2fr.w $fa1, $a0
+; LA64F-LP64S-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64F-LP64S-NEXT: movfr2gr.s $a0, $fa0
+; LA64F-LP64S-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64F-LP64S-NEXT: jirl $ra, $ra, 0
+; LA64F-LP64S-NEXT: st.h $a0, $fp, 0
+; LA64F-LP64S-NEXT: st.h $a0, $fp, 16
+; LA64F-LP64S-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload
+; LA64F-LP64S-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64F-LP64S-NEXT: addi.d $sp, $sp, 16
+; LA64F-LP64S-NEXT: ret
+;
+; LA64F-LP64D-LABEL: bfloat_store:
+; LA64F-LP64D: # %bb.0:
+; LA64F-LP64D-NEXT: addi.d $sp, $sp, -16
+; LA64F-LP64D-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64F-LP64D-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill
+; LA64F-LP64D-NEXT: move $fp, $a0
+; LA64F-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64F-LP64D-NEXT: movfr2gr.s $a1, $fa1
+; LA64F-LP64D-NEXT: slli.d $a1, $a1, 16
+; LA64F-LP64D-NEXT: movgr2fr.w $fa0, $a1
+; LA64F-LP64D-NEXT: slli.d $a0, $a0, 16
+; LA64F-LP64D-NEXT: movgr2fr.w $fa1, $a0
+; LA64F-LP64D-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64F-LP64D-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64F-LP64D-NEXT: jirl $ra, $ra, 0
+; LA64F-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64F-LP64D-NEXT: st.h $a0, $fp, 0
+; LA64F-LP64D-NEXT: st.h $a0, $fp, 16
+; LA64F-LP64D-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload
+; LA64F-LP64D-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64F-LP64D-NEXT: addi.d $sp, $sp, 16
+; LA64F-LP64D-NEXT: ret
+;
+; LA64D-LP64S-LABEL: bfloat_store:
+; LA64D-LP64S: # %bb.0:
+; LA64D-LP64S-NEXT: addi.d $sp, $sp, -16
+; LA64D-LP64S-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64D-LP64S-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill
+; LA64D-LP64S-NEXT: move $fp, $a0
+; LA64D-LP64S-NEXT: slli.d $a0, $a2, 16
+; LA64D-LP64S-NEXT: movgr2fr.w $fa0, $a0
+; LA64D-LP64S-NEXT: slli.d $a0, $a1, 16
+; LA64D-LP64S-NEXT: movgr2fr.w $fa1, $a0
+; LA64D-LP64S-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64D-LP64S-NEXT: movfr2gr.s $a0, $fa0
+; LA64D-LP64S-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64D-LP64S-NEXT: jirl $ra, $ra, 0
+; LA64D-LP64S-NEXT: st.h $a0, $fp, 0
+; LA64D-LP64S-NEXT: st.h $a0, $fp, 16
+; LA64D-LP64S-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload
+; LA64D-LP64S-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64D-LP64S-NEXT: addi.d $sp, $sp, 16
+; LA64D-LP64S-NEXT: ret
+;
+; LA64D-LP64D-LABEL: bfloat_store:
+; LA64D-LP64D: # %bb.0:
+; LA64D-LP64D-NEXT: addi.d $sp, $sp, -16
+; LA64D-LP64D-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64D-LP64D-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill
+; LA64D-LP64D-NEXT: move $fp, $a0
+; LA64D-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64D-LP64D-NEXT: movfr2gr.s $a1, $fa1
+; LA64D-LP64D-NEXT: slli.d $a1, $a1, 16
+; LA64D-LP64D-NEXT: movgr2fr.w $fa0, $a1
+; LA64D-LP64D-NEXT: slli.d $a0, $a0, 16
+; LA64D-LP64D-NEXT: movgr2fr.w $fa1, $a0
+; LA64D-LP64D-NEXT: fadd.s $fa0, $fa1, $fa0
+; LA64D-LP64D-NEXT: pcaddu18i $ra, %call36(__truncsfbf2)
+; LA64D-LP64D-NEXT: jirl $ra, $ra, 0
+; LA64D-LP64D-NEXT: movfr2gr.s $a0, $fa0
+; LA64D-LP64D-NEXT: st.h $a0, $fp, 0
+; LA64D-LP64D-NEXT: st.h $a0, $fp, 16
+; LA64D-LP64D-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload
+; LA64D-LP64D-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64D-LP64D-NEXT: addi.d $sp, $sp, 16
+; LA64D-LP64D-NEXT: ret
+  %1 = fadd bfloat %b, %c
+  store bfloat %1, ptr %a
+  %2 = getelementptr bfloat, ptr %a, i32 8
+  store bfloat %1, ptr %2
+  ret void
+}