llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-clang Author: Srinivasa Ravi (Wolfram70) <details> <summary>Changes</summary> This change adds NVVM intrinsics and clang builtins for mixed-precision FP arithmetic instructions. Tests are added in `mixed-precision-fp.ll` and `builtins-nvptx.c` and verified through `ptxas-13.0`. PTX Spec Reference: https://docs.nvidia.com/cuda/parallel-thread-execution/#mixed-precision-floating-point-instructions --- Patch is 37.10 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/168359.diff 6 Files Affected: - (modified) clang/include/clang/Basic/BuiltinsNVPTX.td (+64) - (modified) clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp (+123) - (modified) clang/test/CodeGen/builtins-nvptx.c (+133) - (modified) llvm/include/llvm/IR/IntrinsicsNVVM.td (+25) - (modified) llvm/lib/Target/NVPTX/NVPTXIntrinsics.td (+44) - (added) llvm/test/CodeGen/NVPTX/mixed-precision-fp.ll (+225) ``````````diff diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.td b/clang/include/clang/Basic/BuiltinsNVPTX.td index d923d2a90e908..47ba12bef058c 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.td +++ b/clang/include/clang/Basic/BuiltinsNVPTX.td @@ -401,6 +401,24 @@ def __nvvm_fma_rz_d : NVPTXBuiltin<"double(double, double, double)">; def __nvvm_fma_rm_d : NVPTXBuiltin<"double(double, double, double)">; def __nvvm_fma_rp_d : NVPTXBuiltin<"double(double, double, double)">; +def __nvvm_fma_mixed_rn_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, __fp16, float)", SM_100, PTX86>; +def __nvvm_fma_mixed_rz_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, __fp16, float)", SM_100, PTX86>; +def __nvvm_fma_mixed_rm_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, __fp16, float)", SM_100, PTX86>; +def __nvvm_fma_mixed_rp_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, __fp16, float)", SM_100, PTX86>; +def __nvvm_fma_mixed_rn_sat_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, __fp16, float)", SM_100, PTX86>; +def __nvvm_fma_mixed_rz_sat_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, __fp16, float)", SM_100, PTX86>; +def __nvvm_fma_mixed_rm_sat_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, __fp16, float)", SM_100, PTX86>; +def __nvvm_fma_mixed_rp_sat_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, __fp16, float)", SM_100, PTX86>; + +def __nvvm_fma_mixed_rn_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, __bf16, float)", SM_100, PTX86>; +def __nvvm_fma_mixed_rz_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, __bf16, float)", SM_100, PTX86>; +def __nvvm_fma_mixed_rm_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, __bf16, float)", SM_100, PTX86>; +def __nvvm_fma_mixed_rp_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, __bf16, float)", SM_100, PTX86>; +def __nvvm_fma_mixed_rn_sat_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, __bf16, float)", SM_100, PTX86>; +def __nvvm_fma_mixed_rz_sat_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, __bf16, float)", SM_100, PTX86>; +def __nvvm_fma_mixed_rm_sat_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, __bf16, float)", SM_100, PTX86>; +def __nvvm_fma_mixed_rp_sat_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, __bf16, float)", SM_100, PTX86>; + // Rcp def __nvvm_rcp_rn_ftz_f : NVPTXBuiltin<"float(float)">; @@ -460,6 +478,52 @@ def __nvvm_add_rz_d : NVPTXBuiltin<"double(double, double)">; def __nvvm_add_rm_d : NVPTXBuiltin<"double(double, double)">; def __nvvm_add_rp_d : NVPTXBuiltin<"double(double, double)">; +def __nvvm_add_mixed_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, float)", SM_100, PTX86>; +def __nvvm_add_mixed_rn_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, float)", SM_100, PTX86>; +def __nvvm_add_mixed_rz_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, float)", SM_100, PTX86>; +def __nvvm_add_mixed_rm_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, float)", SM_100, PTX86>; +def __nvvm_add_mixed_rp_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, float)", SM_100, PTX86>; +def __nvvm_add_mixed_sat_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, float)", SM_100, PTX86>; +def __nvvm_add_mixed_rn_sat_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, float)", SM_100, PTX86>; +def __nvvm_add_mixed_rz_sat_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, float)", SM_100, PTX86>; +def __nvvm_add_mixed_rm_sat_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, float)", SM_100, PTX86>; +def __nvvm_add_mixed_rp_sat_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, float)", SM_100, PTX86>; + +def __nvvm_add_mixed_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, float)", SM_100, PTX86>; +def __nvvm_add_mixed_rn_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, float)", SM_100, PTX86>; +def __nvvm_add_mixed_rz_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, float)", SM_100, PTX86>; +def __nvvm_add_mixed_rm_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, float)", SM_100, PTX86>; +def __nvvm_add_mixed_rp_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, float)", SM_100, PTX86>; +def __nvvm_add_mixed_sat_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, float)", SM_100, PTX86>; +def __nvvm_add_mixed_rn_sat_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, float)", SM_100, PTX86>; +def __nvvm_add_mixed_rz_sat_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, float)", SM_100, PTX86>; +def __nvvm_add_mixed_rm_sat_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, float)", SM_100, PTX86>; +def __nvvm_add_mixed_rp_sat_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, float)", SM_100, PTX86>; + +// Sub + +def __nvvm_sub_mixed_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, float)", SM_100, PTX86>; +def __nvvm_sub_mixed_rn_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, float)", SM_100, PTX86>; +def __nvvm_sub_mixed_rz_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, float)", SM_100, PTX86>; +def __nvvm_sub_mixed_rm_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, float)", SM_100, PTX86>; +def __nvvm_sub_mixed_rp_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, float)", SM_100, PTX86>; +def __nvvm_sub_mixed_sat_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, float)", SM_100, PTX86>; +def __nvvm_sub_mixed_rn_sat_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, float)", SM_100, PTX86>; +def __nvvm_sub_mixed_rz_sat_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, float)", SM_100, PTX86>; +def __nvvm_sub_mixed_rm_sat_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, float)", SM_100, PTX86>; +def __nvvm_sub_mixed_rp_sat_f16_f32 : NVPTXBuiltinSMAndPTX<"float(__fp16, float)", SM_100, PTX86>; + +def __nvvm_sub_mixed_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, float)", SM_100, PTX86>; +def __nvvm_sub_mixed_rn_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, float)", SM_100, PTX86>; +def __nvvm_sub_mixed_rz_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, float)", SM_100, PTX86>; +def __nvvm_sub_mixed_rm_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, float)", SM_100, PTX86>; +def __nvvm_sub_mixed_rp_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, float)", SM_100, PTX86>; +def __nvvm_sub_mixed_sat_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, float)", SM_100, PTX86>; +def __nvvm_sub_mixed_rn_sat_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, float)", SM_100, PTX86>; +def __nvvm_sub_mixed_rz_sat_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, float)", SM_100, PTX86>; +def __nvvm_sub_mixed_rm_sat_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, float)", SM_100, PTX86>; +def __nvvm_sub_mixed_rp_sat_bf16_f32 : NVPTXBuiltinSMAndPTX<"float(__bf16, float)", SM_100, PTX86>; + // Convert def __nvvm_d2f_rn_ftz : NVPTXBuiltin<"float(double)">; diff --git a/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp b/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp index 8a1cab3417d98..6f57620f0fb00 100644 --- a/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp @@ -415,6 +415,17 @@ static Value *MakeHalfType(unsigned IntrinsicID, unsigned BuiltinID, return MakeHalfType(CGF.CGM.getIntrinsic(IntrinsicID), BuiltinID, E, CGF); } +static Value *MakeMixedPrecisionFPArithmetic(unsigned IntrinsicID, + const CallExpr *E, + CodeGenFunction &CGF) { + SmallVector<llvm::Value *, 3> Args; + for (unsigned i = 0; i < E->getNumArgs(); ++i) { + Args.push_back(CGF.EmitScalarExpr(E->getArg(i))); + } + return CGF.Builder.CreateCall( + CGF.CGM.getIntrinsic(IntrinsicID, {Args[0]->getType()}), Args); +} + } // namespace Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, @@ -1197,6 +1208,118 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, return Builder.CreateCall( CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_count), {EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1))}); + case NVPTX::BI__nvvm_add_mixed_f16_f32: + case NVPTX::BI__nvvm_add_mixed_bf16_f32: + return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_add_mixed_f32, E, + *this); + case NVPTX::BI__nvvm_add_mixed_rn_f16_f32: + case NVPTX::BI__nvvm_add_mixed_rn_bf16_f32: + return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_add_mixed_rn_f32, E, + *this); + case NVPTX::BI__nvvm_add_mixed_rz_f16_f32: + case NVPTX::BI__nvvm_add_mixed_rz_bf16_f32: + return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_add_mixed_rz_f32, E, + *this); + case NVPTX::BI__nvvm_add_mixed_rm_f16_f32: + case NVPTX::BI__nvvm_add_mixed_rm_bf16_f32: + return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_add_mixed_rm_f32, E, + *this); + case NVPTX::BI__nvvm_add_mixed_rp_f16_f32: + case NVPTX::BI__nvvm_add_mixed_rp_bf16_f32: + return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_add_mixed_rp_f32, E, + *this); + case NVPTX::BI__nvvm_add_mixed_sat_f16_f32: + case NVPTX::BI__nvvm_add_mixed_sat_bf16_f32: + return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_add_mixed_sat_f32, E, + *this); + case NVPTX::BI__nvvm_add_mixed_rn_sat_f16_f32: + case NVPTX::BI__nvvm_add_mixed_rn_sat_bf16_f32: + return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_add_mixed_rn_sat_f32, + E, *this); + case NVPTX::BI__nvvm_add_mixed_rz_sat_f16_f32: + case NVPTX::BI__nvvm_add_mixed_rz_sat_bf16_f32: + return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_add_mixed_rz_sat_f32, + E, *this); + case NVPTX::BI__nvvm_add_mixed_rm_sat_f16_f32: + case NVPTX::BI__nvvm_add_mixed_rm_sat_bf16_f32: + return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_add_mixed_rm_sat_f32, + E, *this); + case NVPTX::BI__nvvm_add_mixed_rp_sat_f16_f32: + case NVPTX::BI__nvvm_add_mixed_rp_sat_bf16_f32: + return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_add_mixed_rp_sat_f32, + E, *this); + case NVPTX::BI__nvvm_sub_mixed_f16_f32: + case NVPTX::BI__nvvm_sub_mixed_bf16_f32: + return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_sub_mixed_f32, E, + *this); + case NVPTX::BI__nvvm_sub_mixed_rn_f16_f32: + case NVPTX::BI__nvvm_sub_mixed_rn_bf16_f32: + return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_sub_mixed_rn_f32, E, + *this); + case NVPTX::BI__nvvm_sub_mixed_rz_f16_f32: + case NVPTX::BI__nvvm_sub_mixed_rz_bf16_f32: + return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_sub_mixed_rz_f32, E, + *this); + case NVPTX::BI__nvvm_sub_mixed_rm_f16_f32: + case NVPTX::BI__nvvm_sub_mixed_rm_bf16_f32: + return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_sub_mixed_rm_f32, E, + *this); + case NVPTX::BI__nvvm_sub_mixed_rp_f16_f32: + case NVPTX::BI__nvvm_sub_mixed_rp_bf16_f32: + return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_sub_mixed_rp_f32, E, + *this); + case NVPTX::BI__nvvm_sub_mixed_sat_f16_f32: + case NVPTX::BI__nvvm_sub_mixed_sat_bf16_f32: + return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_sub_mixed_sat_f32, E, + *this); + case NVPTX::BI__nvvm_sub_mixed_rn_sat_f16_f32: + case NVPTX::BI__nvvm_sub_mixed_rn_sat_bf16_f32: + return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_sub_mixed_rn_sat_f32, + E, *this); + case NVPTX::BI__nvvm_sub_mixed_rz_sat_f16_f32: + case NVPTX::BI__nvvm_sub_mixed_rz_sat_bf16_f32: + return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_sub_mixed_rz_sat_f32, + E, *this); + case NVPTX::BI__nvvm_sub_mixed_rm_sat_f16_f32: + case NVPTX::BI__nvvm_sub_mixed_rm_sat_bf16_f32: + return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_sub_mixed_rm_sat_f32, + E, *this); + case NVPTX::BI__nvvm_sub_mixed_rp_sat_f16_f32: + case NVPTX::BI__nvvm_sub_mixed_rp_sat_bf16_f32: + return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_sub_mixed_rp_sat_f32, + E, *this); + case NVPTX::BI__nvvm_fma_mixed_rn_f16_f32: + case NVPTX::BI__nvvm_fma_mixed_rn_bf16_f32: + return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_fma_mixed_rn_f32, E, + *this); + case NVPTX::BI__nvvm_fma_mixed_rz_f16_f32: + case NVPTX::BI__nvvm_fma_mixed_rz_bf16_f32: + return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_fma_mixed_rz_f32, E, + *this); + case NVPTX::BI__nvvm_fma_mixed_rm_f16_f32: + case NVPTX::BI__nvvm_fma_mixed_rm_bf16_f32: + return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_fma_mixed_rm_f32, E, + *this); + case NVPTX::BI__nvvm_fma_mixed_rp_f16_f32: + case NVPTX::BI__nvvm_fma_mixed_rp_bf16_f32: + return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_fma_mixed_rp_f32, E, + *this); + case NVPTX::BI__nvvm_fma_mixed_rn_sat_f16_f32: + case NVPTX::BI__nvvm_fma_mixed_rn_sat_bf16_f32: + return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_fma_mixed_rn_sat_f32, + E, *this); + case NVPTX::BI__nvvm_fma_mixed_rz_sat_f16_f32: + case NVPTX::BI__nvvm_fma_mixed_rz_sat_bf16_f32: + return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_fma_mixed_rz_sat_f32, + E, *this); + case NVPTX::BI__nvvm_fma_mixed_rm_sat_f16_f32: + case NVPTX::BI__nvvm_fma_mixed_rm_sat_bf16_f32: + return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_fma_mixed_rm_sat_f32, + E, *this); + case NVPTX::BI__nvvm_fma_mixed_rp_sat_f16_f32: + case NVPTX::BI__nvvm_fma_mixed_rp_sat_bf16_f32: + return MakeMixedPrecisionFPArithmetic(Intrinsic::nvvm_fma_mixed_rp_sat_f32, + E, *this); default: return nullptr; } diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c index e3be262622844..1753b4c7767e9 100644 --- a/clang/test/CodeGen/builtins-nvptx.c +++ b/clang/test/CodeGen/builtins-nvptx.c @@ -1466,3 +1466,136 @@ __device__ void nvvm_min_max_sm86() { #endif // CHECK: ret void } + +#define F16 (__fp16)0.1f +#define F16_2 (__fp16)0.2f + +__device__ void nvvm_add_mixed_precision_sm100() { +#if __CUDA_ARCH__ >= 1000 + // CHECK_PTX86_SM100: call float @llvm.nvvm.add.mixed.f32.f16(half 0xH2E66, float 1.000000e+00) + __nvvm_add_mixed_f16_f32(F16, 1.0f); + // CHECK_PTX86_SM100: call float @llvm.nvvm.add.mixed.rn.f32.f16(half 0xH2E66, float 1.000000e+00) + __nvvm_add_mixed_rn_f16_f32(F16, 1.0f); + // CHECK_PTX86_SM100: call float @llvm.nvvm.add.mixed.rz.f32.f16(half 0xH2E66, float 1.000000e+00) + __nvvm_add_mixed_rz_f16_f32(F16, 1.0f); + // CHECK_PTX86_SM100: call float @llvm.nvvm.add.mixed.rm.f32.f16(half 0xH2E66, float 1.000000e+00) + __nvvm_add_mixed_rm_f16_f32(F16, 1.0f); + // CHECK_PTX86_SM100: call float @llvm.nvvm.add.mixed.rp.f32.f16(half 0xH2E66, float 1.000000e+00) + __nvvm_add_mixed_rp_f16_f32(F16, 1.0f); + // CHECK_PTX86_SM100: call float @llvm.nvvm.add.mixed.sat.f32.f16(half 0xH2E66, float 1.000000e+00) + __nvvm_add_mixed_sat_f16_f32(F16, 1.0f); + // CHECK_PTX86_SM100: call float @llvm.nvvm.add.mixed.rn.sat.f32.f16(half 0xH2E66, float 1.000000e+00) + __nvvm_add_mixed_rn_sat_f16_f32(F16, 1.0f); + // CHECK_PTX86_SM100: call float @llvm.nvvm.add.mixed.rz.sat.f32.f16(half 0xH2E66, float 1.000000e+00) + __nvvm_add_mixed_rz_sat_f16_f32(F16, 1.0f); + // CHECK_PTX86_SM100: call float @llvm.nvvm.add.mixed.rm.sat.f32.f16(half 0xH2E66, float 1.000000e+00) + __nvvm_add_mixed_rm_sat_f16_f32(F16, 1.0f); + // CHECK_PTX86_SM100: call float @llvm.nvvm.add.mixed.rp.sat.f32.f16(half 0xH2E66, float 1.000000e+00) + __nvvm_add_mixed_rp_sat_f16_f32(F16, 1.0f); + + // CHECK_PTX86_SM100: call float @llvm.nvvm.add.mixed.f32.bf16(bfloat 0xR3DCD, float 1.000000e+00) + __nvvm_add_mixed_bf16_f32(BF16, 1.0f); + // CHECK_PTX86_SM100: call float @llvm.nvvm.add.mixed.rn.f32.bf16(bfloat 0xR3DCD, float 1.000000e+00) + __nvvm_add_mixed_rn_bf16_f32(BF16, 1.0f); + // CHECK_PTX86_SM100: call float @llvm.nvvm.add.mixed.rz.f32.bf16(bfloat 0xR3DCD, float 1.000000e+00) + __nvvm_add_mixed_rz_bf16_f32(BF16, 1.0f); + // CHECK_PTX86_SM100: call float @llvm.nvvm.add.mixed.rm.f32.bf16(bfloat 0xR3DCD, float 1.000000e+00) + __nvvm_add_mixed_rm_bf16_f32(BF16, 1.0f); + // CHECK_PTX86_SM100: call float @llvm.nvvm.add.mixed.rp.f32.bf16(bfloat 0xR3DCD, float 1.000000e+00) + __nvvm_add_mixed_rp_bf16_f32(BF16, 1.0f); + // CHECK_PTX86_SM100: call float @llvm.nvvm.add.mixed.sat.f32.bf16(bfloat 0xR3DCD, float 1.000000e+00) + __nvvm_add_mixed_sat_bf16_f32(BF16, 1.0f); + // CHECK_PTX86_SM100: call float @llvm.nvvm.add.mixed.rn.sat.f32.bf16(bfloat 0xR3DCD, float 1.000000e+00) + __nvvm_add_mixed_rn_sat_bf16_f32(BF16, 1.0f); + // CHECK_PTX86_SM100: call float @llvm.nvvm.add.mixed.rz.sat.f32.bf16(bfloat 0xR3DCD, float 1.000000e+00) + __nvvm_add_mixed_rz_sat_bf16_f32(BF16, 1.0f); + // CHECK_PTX86_SM100: call float @llvm.nvvm.add.mixed.rm.sat.f32.bf16(bfloat 0xR3DCD, float 1.000000e+00) + __nvvm_add_mixed_rm_sat_bf16_f32(BF16, 1.0f); + // CHECK_PTX86_SM100: call float @llvm.nvvm.add.mixed.rp.sat.f32.bf16(bfloat 0xR3DCD, float 1.000000e+00) + __nvvm_add_mixed_rp_sat_bf16_f32(BF16, 1.0f); +#endif +} + +__device__ void nvvm_sub_mixed_precision_sm100() { +#if __CUDA_ARCH__ >= 1000 + // CHECK_PTX86_SM100: call float @llvm.nvvm.sub.mixed.f32.f16(half 0xH2E66, float 1.000000e+00) + __nvvm_sub_mixed_f16_f32(F16, 1.0f); + // CHECK_PTX86_SM100: call float @llvm.nvvm.sub.mixed.rn.f32.f16(half 0xH2E66, float 1.000000e+00) + __nvvm_sub_mixed_rn_f16_f32(F16, 1.0f); + // CHECK_PTX86_SM100: call float @llvm.nvvm.sub.mixed.rz.f32.f16(half 0xH2E66, float 1.000000e+00) + __nvvm_sub_mixed_rz_f16_f32(F16, 1.0f); + // CHECK_PTX86_SM100: call float @llvm.nvvm.sub.mixed.rm.f32.f16(half 0xH2E66, float 1.000000e+00) + __nvvm_sub_mixed_rm_f16_f32(F16, 1.0f); + // CHECK_PTX86_SM100: call float @llvm.nvvm.sub.mixed.rp.f32.f16(half 0xH2E66, float 1.000000e+00) + __nvvm_sub_mixed_rp_f16_f32(F16, 1.0f); + // CHECK_PTX86_SM100: call float @llvm.nvvm.sub.mixed.sat.f32.f16(half 0xH2E66, float 1.000000e+00) + __nvvm_sub_mixed_sat_f16_f32(F16, 1.0f); + // CHECK_PTX86_SM100: call float @llvm.nvvm.sub.mixed.rn.sat.f32.f16(half 0xH2E66, float 1.000000e+00) + __nvvm_sub_mixed_rn_sat_f16_f32(F16, 1.0f); + // CHECK_PTX86_SM100: call float @llvm.nvvm.sub.mixed.rz.sat.f32.f16(half 0xH2E66, float 1.000000e+00) + __nvvm_sub_mixed_rz_sat_f16_f32(F16, 1.0f); + // CHECK_PTX86_SM100: call float @llvm.nvvm.sub.mixed.rm.sat.f32.f16(half 0xH2E66, float 1.000000e+00) + __nvvm_sub_mixed_rm_sat_f16_f32(F16, 1.0f); + // CHECK_PTX86_SM100: call float @llvm.nvvm.sub.mixed.rp.sat.f32.f16(half 0xH2E66, float 1.000000e+00) + __nvvm_sub_mixed_rp_sat_f16_f32(F16, 1.0f); + + // CHECK_PTX86_SM100: call float @llvm.nvvm.sub.mixed.f32.bf16(bfloat 0xR3DCD, float 1.000000e+00) + __nvvm_sub_mixed_bf16_f32(BF16, 1.0f); + // CHECK_PTX86_SM100: call float @llvm.nvvm.sub.mixed.rn.f32.bf16(bfloat 0xR3DCD, float 1.000000e+00) + __nvvm_sub_mixed_rn_bf16_f32(BF16, 1.0f); + // CHECK_PTX86_SM100: call float @llvm.nvvm.sub.mixed.rz.f32.bf16(bfloat 0xR3DCD, float 1.000000e+00) + __nvvm_sub_mixed_rz_bf16_f32(BF16, 1.0f); + // CHECK_PTX86_SM100: call float @llvm.nvvm.sub.mixed.rm.f32.bf16(bfloat 0xR3DCD, float 1.000000e+00) + __nvvm_sub_mixed_rm_bf16_f32(BF16, 1.0f); + // CHECK_PTX86_SM100: call float @llvm.nvvm.sub.mixed.rp.f32.bf16(bfloat 0xR3DCD, float 1.000000e+00) + __nvvm_sub_mixed_rp_bf16_f32(BF16, 1.0f); + // CHECK_PTX86_SM100: call float @llvm.nvvm.sub.mixed.sat.f32.bf16(bfloat 0xR3DCD, float 1.000000e+00) + __nvvm_sub_mixed_sat_bf16_f32(BF16, 1.0f); + // CHECK_PTX86_SM100: call float @llvm.nvvm.sub.mixed.rn.sat.f32.bf16(bfloat 0xR3DCD, float 1.000000e+00) + __nvvm_sub_mixed_rn_sat_bf16_f32(BF16, 1.0f); + // CHECK_PTX86_SM100: call float @llvm.nvvm.sub.mixed.rz.sat.f32.bf16(bfloat 0xR3DCD, float 1.000000e+00) + __nvvm_sub_mixed_rz_sat_bf16_f32(BF16, 1.0f); + // CHECK_PTX86_SM100: call float @llvm.nvvm.sub.mixed.rm.sat.f32.bf16(bfloat 0xR3DCD, float 1.000000e+00) + _... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/168359 _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
