https://github.com/Wolfram70 updated https://github.com/llvm/llvm-project/pull/178237
>From 868c76297d3bb5185d47fa0ddcd4a21b76a3a45f Mon Sep 17 00:00:00 2001 From: Srinivasa Ravi <[email protected]> Date: Tue, 27 Jan 2026 11:27:40 +0000 Subject: [PATCH 1/2] [clang][NVPTX] Add f16(x2) add/mul FTZ intrinsics This change adds `llvm.nvvm.{add/mul}.rn.ftz.{f16/f16x2}` intrinsics and corresponding clang builtins. These variants were missed in https://github.com/llvm/llvm-project/pull/170079 which added half-precision arithmetic intrinsics. PTX Spec Reference: https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions --- clang/include/clang/Basic/BuiltinsNVPTX.td | 4 +++ clang/test/CodeGen/builtins-nvptx.c | 10 +++++- llvm/docs/NVPTXUsage.rst | 4 +++ llvm/include/llvm/IR/IntrinsicsNVVM.td | 12 +++++++ llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 5 +++ llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 7 +++++ llvm/test/CodeGen/NVPTX/f16-add-ftz.ll | 33 +++++++++++++++++++ llvm/test/CodeGen/NVPTX/f16-mul-ftz.ll | 33 +++++++++++++++++++ llvm/test/CodeGen/NVPTX/f16-sub-ftz.ll | 35 +++++++++++++++++++++ 9 files changed, 142 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/NVPTX/f16-add-ftz.ll create mode 100644 llvm/test/CodeGen/NVPTX/f16-mul-ftz.ll create mode 100644 llvm/test/CodeGen/NVPTX/f16-sub-ftz.ll diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.td b/clang/include/clang/Basic/BuiltinsNVPTX.td index 821c362d100c5..59d96551cc250 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.td +++ b/clang/include/clang/Basic/BuiltinsNVPTX.td @@ -467,8 +467,10 @@ def __nvvm_rsqrt_approx_d : NVPTXBuiltin<"double(double)">; // Add def __nvvm_add_rn_sat_f16 : NVPTXBuiltinSMAndPTX<"__fp16(__fp16, __fp16)", SM_53, PTX42>; +def __nvvm_add_rn_ftz_f16 : NVPTXBuiltinSMAndPTX<"__fp16(__fp16, __fp16)", SM_53, PTX42>; def __nvvm_add_rn_ftz_sat_f16 : NVPTXBuiltinSMAndPTX<"__fp16(__fp16, __fp16)", SM_53, PTX42>; def __nvvm_add_rn_sat_v2f16 : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(_Vector<2, __fp16>, _Vector<2, __fp16>)", SM_53, PTX42>; +def __nvvm_add_rn_ftz_v2f16 : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(_Vector<2, __fp16>, _Vector<2, __fp16>)", SM_53, PTX42>; def __nvvm_add_rn_ftz_sat_v2f16 : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(_Vector<2, __fp16>, _Vector<2, __fp16>)", SM_53, PTX42>; def __nvvm_add_rn_ftz_f : NVPTXBuiltin<"float(float, float)">; @@ -496,8 +498,10 @@ def __nvvm_add_rp_d : NVPTXBuiltin<"double(double, double)">; // Mul def __nvvm_mul_rn_sat_f16 : NVPTXBuiltinSMAndPTX<"__fp16(__fp16, __fp16)", SM_53, PTX42>; +def __nvvm_mul_rn_ftz_f16 : NVPTXBuiltinSMAndPTX<"__fp16(__fp16, __fp16)", SM_53, PTX42>; def __nvvm_mul_rn_ftz_sat_f16 : NVPTXBuiltinSMAndPTX<"__fp16(__fp16, __fp16)", SM_53, PTX42>; def __nvvm_mul_rn_sat_v2f16 : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(_Vector<2, __fp16>, _Vector<2, __fp16>)", SM_53, PTX42>; +def __nvvm_mul_rn_ftz_v2f16 : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(_Vector<2, __fp16>, _Vector<2, __fp16>)", SM_53, PTX42>; def __nvvm_mul_rn_ftz_sat_v2f16 : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(_Vector<2, __fp16>, _Vector<2, __fp16>)", SM_53, PTX42>; // Convert diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c index a739b66042f19..8271b29c18968 100644 --- a/clang/test/CodeGen/builtins-nvptx.c +++ b/clang/test/CodeGen/builtins-nvptx.c @@ -1589,22 +1589,30 @@ __device__ void nvvm_add_fma_f32_sat() { #define F16X2_2 {(__fp16)0.2f, (__fp16)0.2f} // CHECK-LABEL: nvvm_add_mul_f16_sat -__device__ void nvvm_add_mul_f16_sat() { +__device__ void nvvm_add_mul_f16_sat_ftz() { // CHECK: call half @llvm.nvvm.add.rn.sat.f16 __nvvm_add_rn_sat_f16(F16, F16_2); + // CHECK: call half @llvm.nvvm.add.rn.ftz.f16 + __nvvm_add_rn_ftz_f16(F16, F16_2); // CHECK: call half @llvm.nvvm.add.rn.ftz.sat.f16 __nvvm_add_rn_ftz_sat_f16(F16, F16_2); // CHECK: call <2 x half> @llvm.nvvm.add.rn.sat.v2f16 __nvvm_add_rn_sat_v2f16(F16X2, F16X2_2); + // CHECK: call <2 x half> @llvm.nvvm.add.rn.ftz.v2f16 + __nvvm_add_rn_ftz_v2f16(F16X2, F16X2_2); // CHECK: call <2 x half> @llvm.nvvm.add.rn.ftz.sat.v2f16 __nvvm_add_rn_ftz_sat_v2f16(F16X2, F16X2_2); // CHECK: call half @llvm.nvvm.mul.rn.sat.f16 __nvvm_mul_rn_sat_f16(F16, F16_2); + // CHECK: call half @llvm.nvvm.mul.rn.ftz.f16 + __nvvm_mul_rn_ftz_f16(F16, F16_2); // CHECK: call half @llvm.nvvm.mul.rn.ftz.sat.f16 __nvvm_mul_rn_ftz_sat_f16(F16, F16_2); // CHECK: call <2 x half> @llvm.nvvm.mul.rn.sat.v2f16 __nvvm_mul_rn_sat_v2f16(F16X2, F16X2_2); + // CHECK: call <2 x half> @llvm.nvvm.mul.rn.ftz.v2f16 + __nvvm_mul_rn_ftz_v2f16(F16X2, F16X2_2); // CHECK: call <2 x half> @llvm.nvvm.mul.rn.ftz.sat.v2f16 __nvvm_mul_rn_ftz_sat_v2f16(F16X2, F16X2_2); diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index 0e7e21ad46b8d..d712b2d548f81 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -1201,9 +1201,11 @@ Syntax: .. code-block:: llvm declare half @llvm.nvvm.add.rn.sat.f16(half %a, half %b) + declare half @llvm.nvvm.add.rn.ftz.f16(half %a, half %b) declare <2 x half> @llvm.nvvm.add.rn.sat.v2f16(<2 x half> %a, <2 x half> %b) declare half @llvm.nvvm.add.rn.ftz.sat.f16(half %a, half %b) + declare <2 x half> @llvm.nvvm.add.rn.ftz.v2f16(<2 x half> %a, <2 x half> %b) declare <2 x half> @llvm.nvvm.add.rn.ftz.sat.v2f16(<2 x half> %a, <2 x half> %b) Overview: @@ -1229,9 +1231,11 @@ Syntax: .. code-block:: llvm declare half @llvm.nvvm.mul.rn.sat.f16(half %a, half %b) + declare half @llvm.nvvm.mul.rn.ftz.f16(half %a, half %b) declare <2 x half> @llvm.nvvm.mul.rn.sat.v2f16(<2 x half> %a, <2 x half> %b) declare half @llvm.nvvm.mul.rn.ftz.sat.f16(half %a, half %b) + declare <2 x half> @llvm.nvvm.mul.rn.ftz.v2f16(<2 x half> %a, <2 x half> %b) declare <2 x half> @llvm.nvvm.mul.rn.ftz.sat.v2f16(<2 x half> %a, <2 x half> %b) Overview: diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index e5e08aacd2535..7ff7515db82c5 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -1373,6 +1373,12 @@ let TargetPrefix = "nvvm" in { def int_nvvm_mul_rn # ftz # _sat_v2f16 : NVVMBuiltin, DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_v2f16_ty, llvm_v2f16_ty]>; } // ftz + + def int_nvvm_mul_rn_ftz_f16 : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_half_ty], [llvm_half_ty, llvm_half_ty]>; + + def int_nvvm_mul_rn_ftz_v2f16 : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_v2f16_ty, llvm_v2f16_ty]>; } // @@ -1612,6 +1618,12 @@ let TargetPrefix = "nvvm" in { DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_v2f16_ty, llvm_v2f16_ty]>; } // ftz + + def int_nvvm_add_rn_ftz_f16 : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_half_ty], [llvm_half_ty, llvm_half_ty]>; + + def int_nvvm_add_rn_ftz_v2f16 : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_v2f16_ty, llvm_v2f16_ty]>; } // diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 1be35a1c67457..5b62b9d073c19 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -6857,6 +6857,9 @@ static unsigned getF16SubOpc(Intrinsic::ID AddIntrinsicID) { case Intrinsic::nvvm_add_rn_sat_f16: case Intrinsic::nvvm_add_rn_sat_v2f16: return NVPTXISD::SUB_RN_SAT; + case Intrinsic::nvvm_add_rn_ftz_f16: + case Intrinsic::nvvm_add_rn_ftz_v2f16: + return NVPTXISD::SUB_RN_FTZ; case Intrinsic::nvvm_add_rn_ftz_sat_f16: case Intrinsic::nvvm_add_rn_ftz_sat_v2f16: return NVPTXISD::SUB_RN_FTZ_SAT; @@ -6895,8 +6898,10 @@ static SDValue combineIntrinsicWOChain(SDNode *N, default: break; case Intrinsic::nvvm_add_rn_sat_f16: + case Intrinsic::nvvm_add_rn_ftz_f16: case Intrinsic::nvvm_add_rn_ftz_sat_f16: case Intrinsic::nvvm_add_rn_sat_v2f16: + case Intrinsic::nvvm_add_rn_ftz_v2f16: case Intrinsic::nvvm_add_rn_ftz_sat_v2f16: return combineF16AddWithNeg(N, DCI.DAG, IID); } diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index ad5dd356ee90f..64fba7339ad0a 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1504,8 +1504,10 @@ def INT_NVVM_MUL24_I : F_MATH_2<"mul24.lo.s32", B32, B32, B32, int_nvvm_mul24_i> def INT_NVVM_MUL24_UI : F_MATH_2<"mul24.lo.u32", B32, B32, B32, int_nvvm_mul24_ui>; def INT_NVVM_MUL_RN_SAT_F16 : F_MATH_2<"mul.rn.sat.f16", B16, B16, B16, int_nvvm_mul_rn_sat_f16>; +def INT_NVVM_MUL_RN_FTZ_F16 : F_MATH_2<"mul.rn.ftz.f16", B16, B16, B16, int_nvvm_mul_rn_ftz_f16>; def INT_NVVM_MUL_RN_FTZ_SAT_F16 : F_MATH_2<"mul.rn.ftz.sat.f16", B16, B16, B16, int_nvvm_mul_rn_ftz_sat_f16>; def INT_NVVM_MUL_RN_SAT_F16X2 : F_MATH_2<"mul.rn.sat.f16x2", B32, B32, B32, int_nvvm_mul_rn_sat_v2f16>; +def INT_NVVM_MUL_RN_FTZ_F16X2 : F_MATH_2<"mul.rn.ftz.f16x2", B32, B32, B32, int_nvvm_mul_rn_ftz_v2f16>; def INT_NVVM_MUL_RN_FTZ_SAT_F16X2 : F_MATH_2<"mul.rn.ftz.sat.f16x2", B32, B32, B32, int_nvvm_mul_rn_ftz_sat_v2f16>; // @@ -1876,8 +1878,10 @@ let Predicates = [doRsqrtOpt] in { // def INT_NVVM_ADD_RN_SAT_F16 : F_MATH_2<"add.rn.sat.f16", B16, B16, B16, int_nvvm_add_rn_sat_f16>; +def INT_NVVM_ADD_RN_FTZ_F16 : F_MATH_2<"add.rn.ftz.f16", B16, B16, B16, int_nvvm_add_rn_ftz_f16>; def INT_NVVM_ADD_RN_FTZ_SAT_F16 : F_MATH_2<"add.rn.ftz.sat.f16", B16, B16, B16, int_nvvm_add_rn_ftz_sat_f16>; def INT_NVVM_ADD_RN_SAT_F16X2 : F_MATH_2<"add.rn.sat.f16x2", B32, B32, B32, int_nvvm_add_rn_sat_v2f16>; +def INT_NVVM_ADD_RN_FTZ_F16X2 : F_MATH_2<"add.rn.ftz.f16x2", B32, B32, B32, int_nvvm_add_rn_ftz_v2f16>; def INT_NVVM_ADD_RN_FTZ_SAT_F16X2 : F_MATH_2<"add.rn.ftz.sat.f16x2", B32, B32, B32, int_nvvm_add_rn_ftz_sat_v2f16>; def INT_NVVM_ADD_RN_FTZ_F : F_MATH_2<"add.rn.ftz.f32", B32, B32, B32, int_nvvm_add_rn_ftz_f>; @@ -1930,6 +1934,7 @@ let Predicates = [hasSM<100>, hasPTX<86>, doNoF32FTZ] in { // def sub_rn_sat : SDNode<"NVPTXISD::SUB_RN_SAT", SDTFPBinOp>; +def sub_rn_ftz : SDNode<"NVPTXISD::SUB_RN_FTZ", SDTFPBinOp>; def sub_rn_ftz_sat : SDNode<"NVPTXISD::SUB_RN_FTZ_SAT", SDTFPBinOp>; @@ -1940,8 +1945,10 @@ class INT_NVVM_SUB_RN<RegTyInfo TyInfo, string variant> : (!cast<SDNode>("sub_rn" # variant) TyInfo.Ty:$a, TyInfo.Ty:$b))]>; def INT_NVVM_SUB_RN_SAT_F16 : INT_NVVM_SUB_RN<F16RT, "_sat">; +def INT_NVVM_SUB_RN_FTZ_F16 : INT_NVVM_SUB_RN<F16RT, "_ftz">; def INT_NVVM_SUB_RN_FTZ_SAT_F16 : INT_NVVM_SUB_RN<F16RT, "_ftz_sat">; def INT_NVVM_SUB_RN_SAT_F16X2 : INT_NVVM_SUB_RN<F16X2RT, "_sat">; +def INT_NVVM_SUB_RN_FTZ_F16X2 : INT_NVVM_SUB_RN<F16X2RT, "_ftz">; def INT_NVVM_SUB_RN_FTZ_SAT_F16X2 : INT_NVVM_SUB_RN<F16X2RT, "_ftz_sat">; foreach rnd = ["_rn", "_rz", "_rm", "_rp"] in { diff --git a/llvm/test/CodeGen/NVPTX/f16-add-ftz.ll b/llvm/test/CodeGen/NVPTX/f16-add-ftz.ll new file mode 100644 index 0000000000000..fbe348fcc966f --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/f16-add-ftz.ll @@ -0,0 +1,33 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_53 -mattr=+ptx42 | FileCheck %s +; RUN: %if ptxas-isa-4.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_53 -mattr=+ptx42 | %ptxas-verify%} + +define half @add_rn_ftz_f16(half %a, half %b) { +; CHECK-LABEL: add_rn_ftz_f16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rs1, [add_rn_ftz_f16_param_0]; +; CHECK-NEXT: ld.param.b16 %rs2, [add_rn_ftz_f16_param_1]; +; CHECK-NEXT: add.rn.ftz.f16 %rs3, %rs1, %rs2; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-NEXT: ret; + %f1 = call half @llvm.nvvm.add.rn.ftz.f16(half %a, half %b) + ret half %f1 +} + +define <2 x half> @add_rn_ftz_f16x2(<2 x half> %a, <2 x half> %b) { +; CHECK-LABEL: add_rn_ftz_f16x2( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [add_rn_ftz_f16x2_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [add_rn_ftz_f16x2_param_1]; +; CHECK-NEXT: add.rn.ftz.f16x2 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %f1 = call <2 x half> @llvm.nvvm.add.rn.ftz.v2f16(<2 x half> %a, <2 x half> %b) + ret <2 x half> %f1 +} diff --git a/llvm/test/CodeGen/NVPTX/f16-mul-ftz.ll b/llvm/test/CodeGen/NVPTX/f16-mul-ftz.ll new file mode 100644 index 0000000000000..c2ebf8fd49db3 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/f16-mul-ftz.ll @@ -0,0 +1,33 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_53 -mattr=+ptx42 | FileCheck %s +; RUN: %if ptxas-isa-4.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_53 -mattr=+ptx42 | %ptxas-verify%} + +define half @mul_rn_ftz_f16(half %a, half %b) { +; CHECK-LABEL: mul_rn_ftz_f16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rs1, [mul_rn_ftz_f16_param_0]; +; CHECK-NEXT: ld.param.b16 %rs2, [mul_rn_ftz_f16_param_1]; +; CHECK-NEXT: mul.rn.ftz.f16 %rs3, %rs1, %rs2; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-NEXT: ret; + %f1 = call half @llvm.nvvm.mul.rn.ftz.f16(half %a, half %b) + ret half %f1 +} + +define <2 x half> @mul_rn_ftz_f16x2(<2 x half> %a, <2 x half> %b) { +; CHECK-LABEL: mul_rn_ftz_f16x2( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [mul_rn_ftz_f16x2_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [mul_rn_ftz_f16x2_param_1]; +; CHECK-NEXT: mul.rn.ftz.f16x2 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %f1 = call <2 x half> @llvm.nvvm.mul.rn.ftz.v2f16(<2 x half> %a, <2 x half> %b) + ret <2 x half> %f1 +} diff --git a/llvm/test/CodeGen/NVPTX/f16-sub-ftz.ll b/llvm/test/CodeGen/NVPTX/f16-sub-ftz.ll new file mode 100644 index 0000000000000..7164924caf620 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/f16-sub-ftz.ll @@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_53 -mattr=+ptx42 | FileCheck %s +; RUN: %if ptxas-isa-4.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_53 -mattr=+ptx42 | %ptxas-verify%} + +define half @sub_rn_ftz_f16(half %a, half %b) { +; CHECK-LABEL: sub_rn_ftz_f16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rs1, [sub_rn_ftz_f16_param_0]; +; CHECK-NEXT: ld.param.b16 %rs2, [sub_rn_ftz_f16_param_1]; +; CHECK-NEXT: sub.rn.ftz.f16 %rs3, %rs1, %rs2; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs3; +; CHECK-NEXT: ret; + %f0 = fneg half %b + %f1 = call half @llvm.nvvm.add.rn.ftz.f16(half %a, half %f0) + ret half %f1 +} + +define <2 x half> @sub_rn_ftz_f16x2(<2 x half> %a, <2 x half> %b) { +; CHECK-LABEL: sub_rn_ftz_f16x2( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [sub_rn_ftz_f16x2_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [sub_rn_ftz_f16x2_param_1]; +; CHECK-NEXT: sub.rn.ftz.f16x2 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %f0 = fneg <2 x half> %b + %f1 = call <2 x half> @llvm.nvvm.add.rn.ftz.v2f16(<2 x half> %a, <2 x half> %f0) + ret <2 x half> %f1 +} >From fd4fd4f9e645246a609a1d09dc9ce770bc689b55 Mon Sep 17 00:00:00 2001 From: Srinivasa Ravi <[email protected]> Date: Tue, 27 Jan 2026 21:23:40 +0530 Subject: [PATCH 2/2] update check line in builtins-nvptx.c --- clang/test/CodeGen/builtins-nvptx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c index 8271b29c18968..9ea2f416be293 100644 --- a/clang/test/CodeGen/builtins-nvptx.c +++ b/clang/test/CodeGen/builtins-nvptx.c @@ -1588,7 +1588,7 @@ __device__ void nvvm_add_fma_f32_sat() { #define F16X2 {(__fp16)0.1f, (__fp16)0.1f} #define F16X2_2 {(__fp16)0.2f, (__fp16)0.2f} -// CHECK-LABEL: nvvm_add_mul_f16_sat +// CHECK-LABEL: nvvm_add_mul_f16_sat_ftz __device__ void nvvm_add_mul_f16_sat_ftz() { // CHECK: call half @llvm.nvvm.add.rn.sat.f16 __nvvm_add_rn_sat_f16(F16, F16_2); _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
