https://github.com/Wolfram70 updated https://github.com/llvm/llvm-project/pull/167641
>From e0f41d4498b87a558cf8fabd2a8ec5430c208fba Mon Sep 17 00:00:00 2001 From: Srinivasa Ravi <[email protected]> Date: Tue, 11 Nov 2025 17:19:12 +0000 Subject: [PATCH] [clang][NVPTX] Add remaining float to fp16 conversions This change adds intrinsics and clang builtins for the remaining float to fp16 conversions. This includes the following conversions: - float to bf16x2 - satfinite variants - float to f16x2 - satfinite variants - float to bf16 - satfinite variants - float to f16 - all variants Tests are added in `convert-sm80.ll` and `convert-sm80-sf.ll` for the intrinsics and in `builtins-nvptx.c` for the clang builtins. --- clang/include/clang/Basic/BuiltinsNVPTX.td | 21 ++ clang/test/CodeGen/builtins-nvptx.c | 49 ++++ llvm/include/llvm/IR/IntrinsicsNVVM.td | 21 +- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 14 ++ llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 30 ++- llvm/test/CodeGen/NVPTX/convert-sm80-sf.ll | 260 +++++++++++++++++++++ llvm/test/CodeGen/NVPTX/convert-sm80.ll | 65 ++++++ 7 files changed, 451 insertions(+), 9 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.td b/clang/include/clang/Basic/BuiltinsNVPTX.td index ad448766e665f..6fbd2222ab289 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.td +++ b/clang/include/clang/Basic/BuiltinsNVPTX.td @@ -579,6 +579,10 @@ def __nvvm_ff2bf16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float) def __nvvm_ff2bf16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float)", SM_80, PTX70>; def __nvvm_ff2bf16x2_rz : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float)", SM_80, PTX70>; def __nvvm_ff2bf16x2_rz_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float)", SM_80, PTX70>; +def __nvvm_ff2bf16x2_rn_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float)", SM_80, PTX81>; +def __nvvm_ff2bf16x2_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float)", SM_80, PTX81>; +def __nvvm_ff2bf16x2_rz_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float)", SM_80, PTX81>; +def __nvvm_ff2bf16x2_rz_relu_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float)", SM_80, PTX81>; def __nvvm_ff2bf16x2_rs : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(float, float, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; @@ -596,6 +600,10 @@ def __nvvm_ff2f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)" def __nvvm_ff2f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)", SM_80, PTX70>; def __nvvm_ff2f16x2_rz : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)", SM_80, PTX70>; def __nvvm_ff2f16x2_rz_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)", SM_80, PTX70>; +def __nvvm_ff2f16x2_rn_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)", SM_80, PTX81>; +def __nvvm_ff2f16x2_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)", SM_80, PTX81>; +def __nvvm_ff2f16x2_rz_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)", SM_80, PTX81>; +def __nvvm_ff2f16x2_rz_relu_satfinite : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float)", SM_80, PTX81>; def __nvvm_ff2f16x2_rs : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(float, float, uint32_t)", SM<"100a", [SM_103a]>, PTX87>; @@ -613,6 +621,19 @@ def __nvvm_f2bf16_rn : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>; def __nvvm_f2bf16_rn_relu : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>; def __nvvm_f2bf16_rz : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>; def __nvvm_f2bf16_rz_relu : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>; +def __nvvm_f2bf16_rn_satfinite : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX81>; +def __nvvm_f2bf16_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX81>; +def __nvvm_f2bf16_rz_satfinite : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX81>; +def __nvvm_f2bf16_rz_relu_satfinite : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX81>; + +def __nvvm_f2f16_rn : NVPTXBuiltinSMAndPTX<"__fp16(float)", SM_80, PTX70>; +def __nvvm_f2f16_rn_relu : NVPTXBuiltinSMAndPTX<"__fp16(float)", SM_80, PTX70>; +def __nvvm_f2f16_rz : NVPTXBuiltinSMAndPTX<"__fp16(float)", SM_80, PTX70>; +def __nvvm_f2f16_rz_relu : NVPTXBuiltinSMAndPTX<"__fp16(float)", SM_80, PTX70>; +def __nvvm_f2f16_rn_satfinite : NVPTXBuiltinSMAndPTX<"__fp16(float)", SM_80, PTX81>; +def __nvvm_f2f16_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"__fp16(float)", SM_80, PTX81>; +def __nvvm_f2f16_rz_satfinite : NVPTXBuiltinSMAndPTX<"__fp16(float)", SM_80, PTX81>; +def __nvvm_f2f16_rz_relu_satfinite : NVPTXBuiltinSMAndPTX<"__fp16(float)", SM_80, PTX81>; def __nvvm_f2tf32_rna : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_80, PTX70>; def __nvvm_f2tf32_rna_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_80, PTX81>; diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c index c0ed799970122..75f2588f4837b 100644 --- a/clang/test/CodeGen/builtins-nvptx.c +++ b/clang/test/CodeGen/builtins-nvptx.c @@ -1007,6 +1007,16 @@ __device__ void nvvm_cvt_sm80() { __nvvm_ff2bf16x2_rz(1, 1); // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.relu(float 1.000000e+00, float 1.000000e+00) __nvvm_ff2bf16x2_rz_relu(1, 1); + #if PTX >= 81 + // CHECK_PTX81_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn.satfinite(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff2bf16x2_rn_satfinite(1, 1); + // CHECK_PTX81_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn.relu.satfinite(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff2bf16x2_rn_relu_satfinite(1, 1); + // CHECK_PTX81_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.satfinite(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff2bf16x2_rz_satfinite(1, 1); + // CHECK_PTX81_SM80: call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.relu.satfinite(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff2bf16x2_rz_relu_satfinite(1, 1); + #endif // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rn(float 1.000000e+00, float 1.000000e+00) __nvvm_ff2f16x2_rn(1, 1); @@ -1016,6 +1026,16 @@ __device__ void nvvm_cvt_sm80() { __nvvm_ff2f16x2_rz(1, 1); // CHECK_PTX70_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rz.relu(float 1.000000e+00, float 1.000000e+00) __nvvm_ff2f16x2_rz_relu(1, 1); + #if PTX >= 81 + // CHECK_PTX81_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rn.satfinite(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff2f16x2_rn_satfinite(1, 1); + // CHECK_PTX81_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rn.relu.satfinite(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff2f16x2_rn_relu_satfinite(1, 1); + // CHECK_PTX81_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rz.satfinite(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff2f16x2_rz_satfinite(1, 1); + // CHECK_PTX81_SM80: call <2 x half> @llvm.nvvm.ff2f16x2.rz.relu.satfinite(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff2f16x2_rz_relu_satfinite(1, 1); + #endif // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.f2bf16.rn(float 1.000000e+00) __nvvm_f2bf16_rn(1); @@ -1025,6 +1045,35 @@ __device__ void nvvm_cvt_sm80() { __nvvm_f2bf16_rz(1); // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.f2bf16.rz.relu(float 1.000000e+00) __nvvm_f2bf16_rz_relu(1); + #if PTX >= 81 + // CHECK_PTX81_SM80: call bfloat @llvm.nvvm.f2bf16.rn.satfinite(float 1.000000e+00) + __nvvm_f2bf16_rn_satfinite(1); + // CHECK_PTX81_SM80: call bfloat @llvm.nvvm.f2bf16.rn.relu.satfinite(float 1.000000e+00) + __nvvm_f2bf16_rn_relu_satfinite(1); + // CHECK_PTX81_SM80: call bfloat @llvm.nvvm.f2bf16.rz.satfinite(float 1.000000e+00) + __nvvm_f2bf16_rz_satfinite(1); + // CHECK_PTX81_SM80: call bfloat @llvm.nvvm.f2bf16.rz.relu.satfinite(float 1.000000e+00) + __nvvm_f2bf16_rz_relu_satfinite(1); + #endif + + // CHECK_PTX70_SM80: call half @llvm.nvvm.f2f16.rn(float 1.000000e+00) + __nvvm_f2f16_rn(1); + // CHECK_PTX70_SM80: call half @llvm.nvvm.f2f16.rn.relu(float 1.000000e+00) + __nvvm_f2f16_rn_relu(1); + // CHECK_PTX70_SM80: call half @llvm.nvvm.f2f16.rz(float 1.000000e+00) + __nvvm_f2f16_rz(1); + // CHECK_PTX70_SM80: call half @llvm.nvvm.f2f16.rz.relu(float 1.000000e+00) + __nvvm_f2f16_rz_relu(1); + #if PTX >= 81 + // CHECK_PTX81_SM80: call half @llvm.nvvm.f2f16.rn.satfinite(float 1.000000e+00) + __nvvm_f2f16_rn_satfinite(1); + // CHECK_PTX81_SM80: call half @llvm.nvvm.f2f16.rn.relu.satfinite(float 1.000000e+00) + __nvvm_f2f16_rn_relu_satfinite(1); + // CHECK_PTX81_SM80: call half @llvm.nvvm.f2f16.rz.satfinite(float 1.000000e+00) + __nvvm_f2f16_rz_satfinite(1); + // CHECK_PTX81_SM80: call half @llvm.nvvm.f2f16.rz.relu.satfinite(float 1.000000e+00) + __nvvm_f2f16_rz_relu_satfinite(1); + #endif // CHECK_PTX70_SM80: call i32 @llvm.nvvm.f2tf32.rna(float 1.000000e+00) __nvvm_f2tf32_rna(1); diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 1b485dc8ccd1e..aef92206187d3 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -1566,14 +1566,19 @@ let TargetPrefix = "nvvm" in { foreach rnd = ["rn", "rz"] in { foreach relu = ["", "_relu"] in { - def int_nvvm_ff2bf16x2_ # rnd # relu : NVVMBuiltin, - PureIntrinsic<[llvm_v2bf16_ty], [llvm_float_ty, llvm_float_ty]>; - - def int_nvvm_ff2f16x2_ # rnd # relu : NVVMBuiltin, - PureIntrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty]>; - - def int_nvvm_f2bf16_ # rnd # relu : NVVMBuiltin, - PureIntrinsic<[llvm_bfloat_ty], [llvm_float_ty]>; + foreach satfinite = ["", "_satfinite"] in { + def int_nvvm_ff2bf16x2_ # rnd # relu # satfinite : NVVMBuiltin, + PureIntrinsic<[llvm_v2bf16_ty], [llvm_float_ty, llvm_float_ty]>; + + def int_nvvm_ff2f16x2_ # rnd # relu # satfinite : NVVMBuiltin, + PureIntrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty]>; + + def int_nvvm_f2bf16_ # rnd # relu # satfinite : NVVMBuiltin, + PureIntrinsic<[llvm_bfloat_ty], [llvm_float_ty]>; + + def int_nvvm_f2f16_ # rnd # relu # satfinite : NVVMBuiltin, + PureIntrinsic<[llvm_half_ty], [llvm_float_ty]>; + } } } diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index ff9d9723dddea..84cb39ba0d909 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -595,6 +595,15 @@ let hasSideEffects = false in { defm CVT_bf16 : CVT_FROM_ALL<"bf16", B16, [hasPTX<78>, hasSM<90>]>; defm CVT_f32 : CVT_FROM_ALL<"f32", B32>; defm CVT_f64 : CVT_FROM_ALL<"f64", B64>; + + multiclass CVT_FROM_FLOAT_SATFINITE<string ToName, RegisterClass RC> { + def _f32_sf : + BasicFlagsNVPTXInst<(outs RC:$dst), + (ins B32:$src), (ins CvtMode:$mode), + "cvt${mode:base}${mode:relu}.satfinite." # ToName # ".f32">; + } + defm CVT_bf16 : CVT_FROM_FLOAT_SATFINITE<"bf16", B16>; + defm CVT_f16 : CVT_FROM_FLOAT_SATFINITE<"f16", B16>; // These cvts are different from those above: The source and dest registers // are of the same type. @@ -611,6 +620,11 @@ let hasSideEffects = false in { (ins B32:$src1, B32:$src2), (ins CvtMode:$mode), "cvt${mode:base}${mode:relu}." # FromName # ".f32">, Requires<[hasPTX<70>, hasSM<80>]>; + + def _f32_sf : + BasicFlagsNVPTXInst<(outs RC:$dst), + (ins B32:$src1, B32:$src2), (ins CvtMode:$mode), + "cvt${mode:base}${mode:relu}.satfinite." # FromName # ".f32">; } defm CVT_f16x2 : CVT_FROM_FLOAT_V2_SM80<"f16x2", B32>; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index ea69a54e6db37..0430aa7723ceb 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1917,7 +1917,12 @@ def : Pat<(int_nvvm_ff2bf16x2_rn f32:$a, f32:$b), (CVT_bf16x2_f32 $a, $b, C def : Pat<(int_nvvm_ff2bf16x2_rn_relu f32:$a, f32:$b), (CVT_bf16x2_f32 $a, $b, CvtRN_RELU)>; def : Pat<(int_nvvm_ff2bf16x2_rz f32:$a, f32:$b), (CVT_bf16x2_f32 $a, $b, CvtRZ)>; def : Pat<(int_nvvm_ff2bf16x2_rz_relu f32:$a, f32:$b), (CVT_bf16x2_f32 $a, $b, CvtRZ_RELU)>; - +let Predicates = [hasPTX<81>, hasSM<80>] in { + def : Pat<(int_nvvm_ff2bf16x2_rn_satfinite f32:$a, f32:$b), (CVT_bf16x2_f32_sf $a, $b, CvtRN)>; + def : Pat<(int_nvvm_ff2bf16x2_rn_relu_satfinite f32:$a, f32:$b), (CVT_bf16x2_f32_sf $a, $b, CvtRN_RELU)>; + def : Pat<(int_nvvm_ff2bf16x2_rz_satfinite f32:$a, f32:$b), (CVT_bf16x2_f32_sf $a, $b, CvtRZ)>; + def : Pat<(int_nvvm_ff2bf16x2_rz_relu_satfinite f32:$a, f32:$b), (CVT_bf16x2_f32_sf $a, $b, CvtRZ_RELU)>; +} let Predicates = [hasPTX<87>, hasSM100aOrSM103a] in { def : Pat<(int_nvvm_ff2bf16x2_rs f32:$a, f32:$b, i32:$c), (CVT_bf16x2_f32_rs $a, $b, $c, CvtRS)>; @@ -1933,6 +1938,12 @@ def : Pat<(int_nvvm_ff2f16x2_rn f32:$a, f32:$b), (CVT_f16x2_f32 $a, $b, Cvt def : Pat<(int_nvvm_ff2f16x2_rn_relu f32:$a, f32:$b), (CVT_f16x2_f32 $a, $b, CvtRN_RELU)>; def : Pat<(int_nvvm_ff2f16x2_rz f32:$a, f32:$b), (CVT_f16x2_f32 $a, $b, CvtRZ)>; def : Pat<(int_nvvm_ff2f16x2_rz_relu f32:$a, f32:$b), (CVT_f16x2_f32 $a, $b, CvtRZ_RELU)>; +let Predicates = [hasPTX<81>, hasSM<80>] in { + def : Pat<(int_nvvm_ff2f16x2_rn_satfinite f32:$a, f32:$b), (CVT_f16x2_f32_sf $a, $b, CvtRN)>; + def : Pat<(int_nvvm_ff2f16x2_rn_relu_satfinite f32:$a, f32:$b), (CVT_f16x2_f32_sf $a, $b, CvtRN_RELU)>; + def : Pat<(int_nvvm_ff2f16x2_rz_satfinite f32:$a, f32:$b), (CVT_f16x2_f32_sf $a, $b, CvtRZ)>; + def : Pat<(int_nvvm_ff2f16x2_rz_relu_satfinite f32:$a, f32:$b), (CVT_f16x2_f32_sf $a, $b, CvtRZ_RELU)>; +} let Predicates = [hasPTX<87>, hasSM100aOrSM103a] in { def : Pat<(int_nvvm_ff2f16x2_rs f32:$a, f32:$b, i32:$c), @@ -1948,6 +1959,23 @@ def : Pat<(int_nvvm_f2bf16_rn f32:$a), (CVT_bf16_f32 $a, CvtRN)>; def : Pat<(int_nvvm_f2bf16_rn_relu f32:$a), (CVT_bf16_f32 $a, CvtRN_RELU)>; def : Pat<(int_nvvm_f2bf16_rz f32:$a), (CVT_bf16_f32 $a, CvtRZ)>; def : Pat<(int_nvvm_f2bf16_rz_relu f32:$a), (CVT_bf16_f32 $a, CvtRZ_RELU)>; +let Predicates = [hasPTX<81>, hasSM<80>] in { + def : Pat<(int_nvvm_f2bf16_rz_satfinite f32:$a), (CVT_bf16_f32_sf $a, CvtRZ)>; + def : Pat<(int_nvvm_f2bf16_rz_relu_satfinite f32:$a), (CVT_bf16_f32_sf $a, CvtRZ_RELU)>; + def : Pat<(int_nvvm_f2bf16_rn_satfinite f32:$a), (CVT_bf16_f32_sf $a, CvtRN)>; + def : Pat<(int_nvvm_f2bf16_rn_relu_satfinite f32:$a), (CVT_bf16_f32_sf $a, CvtRN_RELU)>; +} + +def : Pat<(int_nvvm_f2f16_rn f32:$a), (CVT_f16_f32 $a, CvtRN)>; +def : Pat<(int_nvvm_f2f16_rn_relu f32:$a), (CVT_f16_f32 $a, CvtRN_RELU)>; +def : Pat<(int_nvvm_f2f16_rz f32:$a), (CVT_f16_f32 $a, CvtRZ)>; +def : Pat<(int_nvvm_f2f16_rz_relu f32:$a), (CVT_f16_f32 $a, CvtRZ_RELU)>; +let Predicates = [hasPTX<81>, hasSM<80>] in { + def : Pat<(int_nvvm_f2f16_rz_satfinite f32:$a), (CVT_f16_f32_sf $a, CvtRZ)>; + def : Pat<(int_nvvm_f2f16_rz_relu_satfinite f32:$a), (CVT_f16_f32_sf $a, CvtRZ_RELU)>; + def : Pat<(int_nvvm_f2f16_rn_satfinite f32:$a), (CVT_f16_f32_sf $a, CvtRN)>; + def : Pat<(int_nvvm_f2f16_rn_relu_satfinite f32:$a), (CVT_f16_f32_sf $a, CvtRN_RELU)>; +} def : Pat<(int_nvvm_lohi_i2d i32:$a, i32:$b), (V2I32toI64 $a, $b)>; def : Pat<(int_nvvm_d2i_lo f64:$a), (I64toI32L $a)>; diff --git a/llvm/test/CodeGen/NVPTX/convert-sm80-sf.ll b/llvm/test/CodeGen/NVPTX/convert-sm80-sf.ll index f47c2f2a85156..b773c8d11248a 100644 --- a/llvm/test/CodeGen/NVPTX/convert-sm80-sf.ll +++ b/llvm/test/CodeGen/NVPTX/convert-sm80-sf.ll @@ -16,3 +16,263 @@ define i32 @cvt_rna_satfinite_tf32_f32(float %f1) { %val = call i32 @llvm.nvvm.f2tf32.rna.satfinite(float %f1) ret i32 %val } + +define <2 x bfloat> @cvt_rn_bf16x2_f32_sf(float %f1, float %f2) { +; CHECK-LABEL: cvt_rn_bf16x2_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_bf16x2_f32_sf_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rn_bf16x2_f32_sf_param_1]; +; CHECK-NEXT: cvt.rn.satfinite.bf16x2.f32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %val = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn.satfinite(float %f1, float %f2) + ret <2 x bfloat> %val +} + +define <2 x bfloat> @cvt_rn_relu_bf16x2_f32_sf(float %f1, float %f2) { +; CHECK-LABEL: cvt_rn_relu_bf16x2_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_relu_bf16x2_f32_sf_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rn_relu_bf16x2_f32_sf_param_1]; +; CHECK-NEXT: cvt.rn.relu.satfinite.bf16x2.f32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %val = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn.relu.satfinite(float %f1, float %f2) + ret <2 x bfloat> %val +} + +define <2 x bfloat> @cvt_rz_bf16x2_f32_sf(float %f1, float %f2) { +; CHECK-LABEL: cvt_rz_bf16x2_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_bf16x2_f32_sf_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rz_bf16x2_f32_sf_param_1]; +; CHECK-NEXT: cvt.rz.satfinite.bf16x2.f32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %val = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.satfinite(float %f1, float %f2) + ret <2 x bfloat> %val +} + +define <2 x bfloat> @cvt_rz_relu_bf16x2_f32_sf(float %f1, float %f2) { +; CHECK-LABEL: cvt_rz_relu_bf16x2_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_relu_bf16x2_f32_sf_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rz_relu_bf16x2_f32_sf_param_1]; +; CHECK-NEXT: cvt.rz.relu.satfinite.bf16x2.f32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %val = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.relu.satfinite(float %f1, float %f2) + ret <2 x bfloat> %val +} + +declare <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn.satfinite(float, float) +declare <2 x bfloat> @llvm.nvvm.ff2bf16x2.rn.relu.satfinite(float, float) +declare <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.satfinite(float, float) +declare <2 x bfloat> @llvm.nvvm.ff2bf16x2.rz.relu.satfinite(float, float) + +define <2 x half> @cvt_rn_f16x2_f32_sf(float %f1, float %f2) { +; CHECK-LABEL: cvt_rn_f16x2_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_f16x2_f32_sf_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rn_f16x2_f32_sf_param_1]; +; CHECK-NEXT: cvt.rn.satfinite.f16x2.f32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %val = call <2 x half> @llvm.nvvm.ff2f16x2.rn.satfinite(float %f1, float %f2) + ret <2 x half> %val +} + +define <2 x half> @cvt_rn_relu_f16x2_f32_sf(float %f1, float %f2) { +; CHECK-LABEL: cvt_rn_relu_f16x2_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_relu_f16x2_f32_sf_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rn_relu_f16x2_f32_sf_param_1]; +; CHECK-NEXT: cvt.rn.relu.satfinite.f16x2.f32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %val = call <2 x half> @llvm.nvvm.ff2f16x2.rn.relu.satfinite(float %f1, float %f2) + ret <2 x half> %val +} + +define <2 x half> @cvt_rz_f16x2_f32_sf(float %f1, float %f2) { +; CHECK-LABEL: cvt_rz_f16x2_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_f16x2_f32_sf_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rz_f16x2_f32_sf_param_1]; +; CHECK-NEXT: cvt.rz.satfinite.f16x2.f32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %val = call <2 x half> @llvm.nvvm.ff2f16x2.rz.satfinite(float %f1, float %f2) + ret <2 x half> %val +} + +define <2 x half> @cvt_rz_relu_f16x2_f32_sf(float %f1, float %f2) { +; CHECK-LABEL: cvt_rz_relu_f16x2_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_relu_f16x2_f32_sf_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [cvt_rz_relu_f16x2_f32_sf_param_1]; +; CHECK-NEXT: cvt.rz.relu.satfinite.f16x2.f32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %val = call <2 x half> @llvm.nvvm.ff2f16x2.rz.relu.satfinite(float %f1, float %f2) + ret <2 x half> %val +} + +declare <2 x half> @llvm.nvvm.ff2f16x2.rn.satfinite(float, float) +declare <2 x half> @llvm.nvvm.ff2f16x2.rn.relu.satfinite(float, float) +declare <2 x half> @llvm.nvvm.ff2f16x2.rz.satfinite(float, float) +declare <2 x half> @llvm.nvvm.ff2f16x2.rz.relu.satfinite(float, float) + +define bfloat @cvt_rn_bf16_f32_sf(float %f1) { +; CHECK-LABEL: cvt_rn_bf16_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_bf16_f32_sf_param_0]; +; CHECK-NEXT: cvt.rn.satfinite.bf16.f32 %rs1, %r1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; + %val = call bfloat @llvm.nvvm.f2bf16.rn.satfinite(float %f1) + ret bfloat %val +} + +define bfloat @cvt_rn_relu_bf16_f32_sf(float %f1) { +; CHECK-LABEL: cvt_rn_relu_bf16_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_relu_bf16_f32_sf_param_0]; +; CHECK-NEXT: cvt.rn.relu.satfinite.bf16.f32 %rs1, %r1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; + %val = call bfloat @llvm.nvvm.f2bf16.rn.relu.satfinite(float %f1) + ret bfloat %val +} + +define bfloat @cvt_rz_bf16_f32_sf(float %f1) { +; CHECK-LABEL: cvt_rz_bf16_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_bf16_f32_sf_param_0]; +; CHECK-NEXT: cvt.rz.satfinite.bf16.f32 %rs1, %r1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; + %val = call bfloat @llvm.nvvm.f2bf16.rz.satfinite(float %f1) + ret bfloat %val +} + +define bfloat @cvt_rz_relu_bf16_f32_sf(float %f1) { +; CHECK-LABEL: cvt_rz_relu_bf16_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_relu_bf16_f32_sf_param_0]; +; CHECK-NEXT: cvt.rz.relu.satfinite.bf16.f32 %rs1, %r1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; + %val = call bfloat @llvm.nvvm.f2bf16.rz.relu.satfinite(float %f1) + ret bfloat %val +} + +declare bfloat @llvm.nvvm.f2bf16.rn.satfinite(float) +declare bfloat @llvm.nvvm.f2bf16.rn.relu.satfinite(float) +declare bfloat @llvm.nvvm.f2bf16.rz.satfinite(float) +declare bfloat @llvm.nvvm.f2bf16.rz.relu.satfinite(float) + +define half @cvt_rn_f16_f32_sf(float %f1) { +; CHECK-LABEL: cvt_rn_f16_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_f16_f32_sf_param_0]; +; CHECK-NEXT: cvt.rn.satfinite.f16.f32 %rs1, %r1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; + %val = call half @llvm.nvvm.f2f16.rn.satfinite(float %f1) + ret half %val +} + +define half @cvt_rn_relu_f16_f32_sf(float %f1) { +; CHECK-LABEL: cvt_rn_relu_f16_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_relu_f16_f32_sf_param_0]; +; CHECK-NEXT: cvt.rn.relu.satfinite.f16.f32 %rs1, %r1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; + %val = call half @llvm.nvvm.f2f16.rn.relu.satfinite(float %f1) + ret half %val +} + +define half @cvt_rz_f16_f32_sf(float %f1) { +; CHECK-LABEL: cvt_rz_f16_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_f16_f32_sf_param_0]; +; CHECK-NEXT: cvt.rz.satfinite.f16.f32 %rs1, %r1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; + %val = call half @llvm.nvvm.f2f16.rz.satfinite(float %f1) + ret half %val +} + +define half @cvt_rz_relu_f16_f32_sf(float %f1) { +; CHECK-LABEL: cvt_rz_relu_f16_f32_sf( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_relu_f16_f32_sf_param_0]; +; CHECK-NEXT: cvt.rz.relu.satfinite.f16.f32 %rs1, %r1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; + %val = call half @llvm.nvvm.f2f16.rz.relu.satfinite(float %f1) + ret half %val +} + +declare half @llvm.nvvm.f2f16.rn.satfinite(float) +declare half @llvm.nvvm.f2f16.rn.relu.satfinite(float) +declare half @llvm.nvvm.f2f16.rz.satfinite(float) +declare half @llvm.nvvm.f2f16.rz.relu.satfinite(float) diff --git a/llvm/test/CodeGen/NVPTX/convert-sm80.ll b/llvm/test/CodeGen/NVPTX/convert-sm80.ll index edf1739ae9928..a47bbabdd448c 100644 --- a/llvm/test/CodeGen/NVPTX/convert-sm80.ll +++ b/llvm/test/CodeGen/NVPTX/convert-sm80.ll @@ -198,6 +198,71 @@ declare bfloat @llvm.nvvm.f2bf16.rn.relu(float) declare bfloat @llvm.nvvm.f2bf16.rz(float) declare bfloat @llvm.nvvm.f2bf16.rz.relu(float) +define half @cvt_rn_f16_f32(float %f1) { +; CHECK-LABEL: cvt_rn_f16_f32( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_f16_f32_param_0]; +; CHECK-NEXT: cvt.rn.f16.f32 %rs1, %r1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; + %val = call half @llvm.nvvm.f2f16.rn(float %f1) + ret half %val +} + +define half @cvt_rn_relu_f16_f32(float %f1) { +; CHECK-LABEL: cvt_rn_relu_f16_f32( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rn_relu_f16_f32_param_0]; +; CHECK-NEXT: cvt.rn.relu.f16.f32 %rs1, %r1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; + %val = call half @llvm.nvvm.f2f16.rn.relu(float %f1) + ret half %val +} + +define half @cvt_rz_f16_f32(float %f1) { +; CHECK-LABEL: cvt_rz_f16_f32( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_f16_f32_param_0]; +; CHECK-NEXT: cvt.rz.f16.f32 %rs1, %r1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; + %val = call half @llvm.nvvm.f2f16.rz(float %f1) + ret half %val +} + +define half @cvt_rz_relu_f16_f32(float %f1) { +; CHECK-LABEL: cvt_rz_relu_f16_f32( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [cvt_rz_relu_f16_f32_param_0]; +; CHECK-NEXT: cvt.rz.relu.f16.f32 %rs1, %r1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; +; CHECK-NEXT: ret; + %val = call half @llvm.nvvm.f2f16.rz.relu(float %f1) + ret half %val +} + +declare half @llvm.nvvm.f2f16.rn(float) +declare half @llvm.nvvm.f2f16.rn.relu(float) +declare half @llvm.nvvm.f2f16.rz(float) +declare half @llvm.nvvm.f2f16.rz.relu(float) + define i32 @cvt_rna_tf32_f32(float %f1) { ; CHECK-LABEL: cvt_rna_tf32_f32( ; CHECK: { _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
