https://github.com/YonahGoldberg created https://github.com/llvm/llvm-project/pull/174005
Currently CUDA fp16 is implemented as inline PTX in `cuda_fp16.hpp` in the CTK. In CUDA 13.3, we are moving the implementations to libdevice. This PR declares the new libdevice functions, which operate on LLVM `half`, and adds wrappers for the CUDA `__half` and `__half2` types, which are just `unsigned short` and `unsigned int` under the hood. >From 566d592134cfbbe03c03a6d003c9b045ac7109ec Mon Sep 17 00:00:00 2001 From: Yonah Goldberg <[email protected]> Date: Tue, 30 Dec 2025 16:40:44 +0000 Subject: [PATCH] fp16 in libdevice --- .../Headers/__clang_cuda_device_functions.h | 748 +++++++++++++++++- .../Headers/__clang_cuda_libdevice_declares.h | 190 +++++ 2 files changed, 934 insertions(+), 4 deletions(-) diff --git a/clang/lib/Headers/__clang_cuda_device_functions.h b/clang/lib/Headers/__clang_cuda_device_functions.h index 0226fe95abab6..e31cb87a25140 100644 --- a/clang/lib/Headers/__clang_cuda_device_functions.h +++ b/clang/lib/Headers/__clang_cuda_device_functions.h @@ -1086,7 +1086,6 @@ __DEVICE__ unsigned int __vabsdiffs2(unsigned int __a, unsigned int __b) { : "r"(__a), "r"(__b), "r"(0)); return r; } - __DEVICE__ unsigned int __vabsdiffs4(unsigned int __a, unsigned int __b) { unsigned int r; __asm__("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;" @@ -1404,7 +1403,6 @@ __DEVICE__ unsigned int __vhaddu2(unsigned int __a, unsigned int __b) { __DEVICE__ unsigned int __vhaddu4(unsigned int __a, unsigned int __b) { return (((__a ^ __b) >> 1) & ~0x80808080u) + (__a & __b); } - __DEVICE__ unsigned int __vmaxs2(unsigned int __a, unsigned int __b) { unsigned int r; if ((__a & 0x8000) && (__b & 0x8000)) { @@ -1496,7 +1494,6 @@ __DEVICE__ unsigned int __vsadu4(unsigned int __a, unsigned int __b) { : "r"(__a), "r"(__b), "r"(0)); return r; } - __DEVICE__ unsigned int __vsub2(unsigned int __a, unsigned int __b) { unsigned int r; __asm__("vsub2.u32.u32.u32 %0,%1,%2,%3;" @@ -1505,7 +1502,6 @@ __DEVICE__ unsigned int __vsub2(unsigned int __a, unsigned int __b) { return r; } __DEVICE__ unsigned int __vneg2(unsigned int __a) { return __vsub2(0, __a); } - __DEVICE__ unsigned int __vsub4(unsigned int __a, unsigned int __b) { unsigned int r; __asm__("vsub4.u32.u32.u32 %0,%1,%2,%3;" @@ -1550,6 +1546,750 @@ __DEVICE__ unsigned int __vsubus4(unsigned int __a, unsigned int __b) { } #endif // CUDA_VERSION >= 9020 +#if CUDA_VERSION >= 13030 +typedef _Float16 _Float16x2 __attribute__((ext_vector_type(2))); + +// fp16 conversion functions +__DEVICE__ unsigned short __f16_double2half(double __a) { + return __builtin_bit_cast(unsigned short, __nv_f16_double2half(__a)); +} +__DEVICE__ unsigned short __f16_float2half_rn(float __a) { + return __builtin_bit_cast(unsigned short, __nv_f16_float2half_rn(__a)); +} +__DEVICE__ unsigned short __f16_float2half_rz(float __a) { + return __builtin_bit_cast(unsigned short, __nv_f16_float2half_rz(__a)); +} +__DEVICE__ unsigned short __f16_float2half_rd(float __a) { + return __builtin_bit_cast(unsigned short, __nv_f16_float2half_rd(__a)); +} +__DEVICE__ unsigned short __f16_float2half_ru(float __a) { + return __builtin_bit_cast(unsigned short, __nv_f16_float2half_ru(__a)); +} +__DEVICE__ unsigned int __f16_float2half2_rn(float __a) { + return __builtin_bit_cast(unsigned int, __nv_f16_float2half2_rn(__a)); +} +__DEVICE__ unsigned int __f16_floats2half2_rn(float __a, float __b) { + return __builtin_bit_cast(unsigned int, __nv_f16_floats2half2_rn(__a, __b)); +} +__DEVICE__ float __f16_half2float(unsigned short __a) { + return __nv_f16_half2float(__builtin_bit_cast(_Float16, __a)); +} +__DEVICE__ float __f16_low2float(unsigned int __a) { + return __nv_f16_low2float(__builtin_bit_cast(_Float16x2, __a)); +} +__DEVICE__ float __f16_high2float(unsigned int __a) { + return __nv_f16_high2float(__builtin_bit_cast(_Float16x2, __a)); +} +__DEVICE__ char __f16_half2char_rz(unsigned short __h) { + return __nv_f16_half2char_rz(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ unsigned char __f16_half2uchar_rz(unsigned short __h) { + return __nv_f16_half2uchar_rz(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ short __f16_half2short_rz(unsigned short __h) { + return __nv_f16_half2short_rz(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ unsigned short __f16_half2ushort_rz(unsigned short __h) { + return __nv_f16_half2ushort_rz(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ int __f16_half2int_rz(unsigned short __h) { + return __nv_f16_half2int_rz(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ unsigned int __f16_half2uint_rz(unsigned short __h) { + return __nv_f16_half2uint_rz(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ long long __f16_half2ll_rz(unsigned short __h) { + return __nv_f16_half2ll_rz(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ unsigned long long __f16_half2ull_rz(unsigned short __h) { + return __nv_f16_half2ull_rz(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ float2 __f16_half22float2(unsigned int __a) { + return __nv_f16_half22float2(__builtin_bit_cast(_Float16x2, __a)); +} +__DEVICE__ int __f16_half2int_rn(unsigned short __h) { + return __nv_f16_half2int_rn(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ int __f16_half2int_rd(unsigned short __h) { + return __nv_f16_half2int_rd(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ int __f16_half2int_ru(unsigned short __h) { + return __nv_f16_half2int_ru(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ unsigned short __f16_int2half_rn(int __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_int2half_rn(__i)); +} +__DEVICE__ unsigned short __f16_int2half_rz(int __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_int2half_rz(__i)); +} +__DEVICE__ unsigned short __f16_int2half_rd(int __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_int2half_rd(__i)); +} +__DEVICE__ unsigned short __f16_int2half_ru(int __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_int2half_ru(__i)); +} +__DEVICE__ short __f16_half2short_rn(unsigned short __h) { + return __nv_f16_half2short_rn(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ short __f16_half2short_rd(unsigned short __h) { + return __nv_f16_half2short_rd(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ short __f16_half2short_ru(unsigned short __h) { + return __nv_f16_half2short_ru(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ unsigned short __f16_short2half_rn(short __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_short2half_rn(__i)); +} +__DEVICE__ unsigned short __f16_short2half_rz(short __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_short2half_rz(__i)); +} +__DEVICE__ unsigned short __f16_short2half_rd(short __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_short2half_rd(__i)); +} +__DEVICE__ unsigned short __f16_short2half_ru(short __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_short2half_ru(__i)); +} +__DEVICE__ unsigned int __f16_half2uint_rn(unsigned short __h) { + return __nv_f16_half2uint_rn(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ unsigned int __f16_half2uint_rd(unsigned short __h) { + return __nv_f16_half2uint_rd(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ unsigned int __f16_half2uint_ru(unsigned short __h) { + return __nv_f16_half2uint_ru(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ unsigned short __f16_uint2half_rn(unsigned int __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_uint2half_rn(__i)); +} +__DEVICE__ unsigned short __f16_uint2half_rz(unsigned int __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_uint2half_rz(__i)); +} +__DEVICE__ unsigned short __f16_uint2half_rd(unsigned int __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_uint2half_rd(__i)); +} +__DEVICE__ unsigned short __f16_uint2half_ru(unsigned int __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_uint2half_ru(__i)); +} +__DEVICE__ unsigned short __f16_half2ushort_rn(unsigned short __h) { + return __nv_f16_half2ushort_rn(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ unsigned short __f16_half2ushort_rd(unsigned short __h) { + return __nv_f16_half2ushort_rd(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ unsigned short __f16_half2ushort_ru(unsigned short __h) { + return __nv_f16_half2ushort_ru(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ unsigned short __f16_ushort2half_rn(unsigned short __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_ushort2half_rn(__i)); +} +__DEVICE__ unsigned short __f16_ushort2half_rz(unsigned short __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_ushort2half_rz(__i)); +} +__DEVICE__ unsigned short __f16_ushort2half_rd(unsigned short __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_ushort2half_rd(__i)); +} +__DEVICE__ unsigned short __f16_ushort2half_ru(unsigned short __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_ushort2half_ru(__i)); +} +__DEVICE__ unsigned long long __f16_half2ull_rn(unsigned short __h) { + return __nv_f16_half2ull_rn(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ unsigned long long __f16_half2ull_rd(unsigned short __h) { + return __nv_f16_half2ull_rd(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ unsigned long long __f16_half2ull_ru(unsigned short __h) { + return __nv_f16_half2ull_ru(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ unsigned short __f16_ull2half_rn(unsigned long long __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_ull2half_rn(__i)); +} +__DEVICE__ unsigned short __f16_ull2half_rz(unsigned long long __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_ull2half_rz(__i)); +} +__DEVICE__ unsigned short __f16_ull2half_rd(unsigned long long __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_ull2half_rd(__i)); +} +__DEVICE__ unsigned short __f16_ull2half_ru(unsigned long long __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_ull2half_ru(__i)); +} +__DEVICE__ long long __f16_half2ll_rn(unsigned short __h) { + return __nv_f16_half2ll_rn(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ long long __f16_half2ll_rd(unsigned short __h) { + return __nv_f16_half2ll_rd(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ long long __f16_half2ll_ru(unsigned short __h) { + return __nv_f16_half2ll_ru(__builtin_bit_cast(_Float16, __h)); +} +__DEVICE__ unsigned short __f16_ll2half_rn(long long __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_ll2half_rn(__i)); +} +__DEVICE__ unsigned short __f16_ll2half_rz(long long __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_ll2half_rz(__i)); +} +__DEVICE__ unsigned short __f16_ll2half_rd(long long __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_ll2half_rd(__i)); +} +__DEVICE__ unsigned short __f16_ll2half_ru(long long __i) { + return __builtin_bit_cast(unsigned short, __nv_f16_ll2half_ru(__i)); +} + +// fp16 rounding functions +__DEVICE__ unsigned short __f16_trunc(unsigned short __x) { + return __builtin_bit_cast(unsigned short, + __nv_f16_trunc(__builtin_bit_cast(_Float16, __x))); +} +__DEVICE__ unsigned short __f16_ceil(unsigned short __x) { + return __builtin_bit_cast(unsigned short, + __nv_f16_ceil(__builtin_bit_cast(_Float16, __x))); +} +__DEVICE__ unsigned short __f16_floor(unsigned short __x) { + return __builtin_bit_cast(unsigned short, + __nv_f16_floor(__builtin_bit_cast(_Float16, __x))); +} +__DEVICE__ unsigned short __f16_rint(unsigned short __x) { + return __builtin_bit_cast(unsigned short, + __nv_f16_rint(__builtin_bit_cast(_Float16, __x))); +} +__DEVICE__ unsigned int __f16x2_trunc(unsigned int __x) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_trunc(__builtin_bit_cast(_Float16x2, __x))); +} +__DEVICE__ unsigned int __f16x2_floor(unsigned int __x) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_floor(__builtin_bit_cast(_Float16x2, __x))); +} +__DEVICE__ unsigned int __f16x2_ceil(unsigned int __x) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_ceil(__builtin_bit_cast(_Float16x2, __x))); +} +__DEVICE__ unsigned int __f16x2_rint(unsigned int __x) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_rint(__builtin_bit_cast(_Float16x2, __x))); +} + +// half2 utilities +__DEVICE__ unsigned int __f16_lows2half2(unsigned int __a, unsigned int __b) { + return __builtin_bit_cast( + unsigned int, __nv_f16_lows2half2(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b))); +} +__DEVICE__ unsigned int __f16_highs2half2(unsigned int __a, unsigned int __b) { + return __builtin_bit_cast( + unsigned int, __nv_f16_highs2half2(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b))); +} +__DEVICE__ unsigned short __f16_low2half(unsigned int __a) { + return __builtin_bit_cast( + unsigned short, __nv_f16_low2half(__builtin_bit_cast(_Float16x2, __a))); +} +__DEVICE__ unsigned int __f16_low2half2(unsigned int __a) { + return __builtin_bit_cast( + unsigned int, __nv_f16_low2half2(__builtin_bit_cast(_Float16x2, __a))); +} +__DEVICE__ unsigned int __f16_high2half2(unsigned int __a) { + return __builtin_bit_cast( + unsigned int, __nv_f16_high2half2(__builtin_bit_cast(_Float16x2, __a))); +} +__DEVICE__ unsigned short __f16_high2half(unsigned int __a) { + return __builtin_bit_cast( + unsigned short, __nv_f16_high2half(__builtin_bit_cast(_Float16x2, __a))); +} +__DEVICE__ unsigned int __f16_halves2half2(unsigned short __a, + unsigned short __b) { + return __builtin_bit_cast( + unsigned int, __nv_f16_halves2half2(__builtin_bit_cast(_Float16, __a), + __builtin_bit_cast(_Float16, __b))); +} +__DEVICE__ unsigned int __f16_half2half2(unsigned short __a) { + return __builtin_bit_cast( + unsigned int, __nv_f16_half2half2(__builtin_bit_cast(_Float16, __a))); +} +__DEVICE__ unsigned int __f16_lowhigh2highlow(unsigned int __a) { + return __builtin_bit_cast( + unsigned int, + __nv_f16_lowhigh2highlow(__builtin_bit_cast(_Float16x2, __a))); +} + +// fp16 comparison functions +__DEVICE__ unsigned short __f16_max(unsigned short __x, unsigned short __y) { + return __builtin_bit_cast(unsigned short, + __nv_f16_max(__builtin_bit_cast(_Float16, __x), + __builtin_bit_cast(_Float16, __y))); +} +__DEVICE__ unsigned short __f16_min(unsigned short __x, unsigned short __y) { + return __builtin_bit_cast(unsigned short, + __nv_f16_min(__builtin_bit_cast(_Float16, __x), + __builtin_bit_cast(_Float16, __y))); +} +__DEVICE__ unsigned int __f16x2_max(unsigned int __x, unsigned int __y) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_max(__builtin_bit_cast(_Float16x2, __x), + __builtin_bit_cast(_Float16x2, __y))); +} +__DEVICE__ unsigned int __f16x2_min(unsigned int __x, unsigned int __y) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_min(__builtin_bit_cast(_Float16x2, __x), + __builtin_bit_cast(_Float16x2, __y))); +} +__DEVICE__ unsigned int __f16x2_eq(unsigned int __a, unsigned int __b) { + return __builtin_bit_cast(unsigned int, + __nv_f16x2_eq(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b))); +} +__DEVICE__ unsigned int __f16x2_ne(unsigned int __a, unsigned int __b) { + return __builtin_bit_cast(unsigned int, + __nv_f16x2_ne(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b))); +} +__DEVICE__ unsigned int __f16x2_le(unsigned int __a, unsigned int __b) { + return __builtin_bit_cast(unsigned int, + __nv_f16x2_le(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b))); +} +__DEVICE__ unsigned int __f16x2_ge(unsigned int __a, unsigned int __b) { + return __builtin_bit_cast(unsigned int, + __nv_f16x2_ge(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b))); +} +__DEVICE__ unsigned int __f16x2_lt(unsigned int __a, unsigned int __b) { + return __builtin_bit_cast(unsigned int, + __nv_f16x2_lt(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b))); +} +__DEVICE__ unsigned int __f16x2_gt(unsigned int __a, unsigned int __b) { + return __builtin_bit_cast(unsigned int, + __nv_f16x2_gt(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b))); +} +__DEVICE__ unsigned int __f16x2_equ(unsigned int __a, unsigned int __b) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_equ(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b))); +} +__DEVICE__ unsigned int __f16x2_neu(unsigned int __a, unsigned int __b) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_neu(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b))); +} +__DEVICE__ unsigned int __f16x2_leu(unsigned int __a, unsigned int __b) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_leu(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b))); +} +__DEVICE__ unsigned int __f16x2_geu(unsigned int __a, unsigned int __b) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_geu(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b))); +} +__DEVICE__ unsigned int __f16x2_ltu(unsigned int __a, unsigned int __b) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_ltu(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b))); +} +__DEVICE__ unsigned int __f16x2_gtu(unsigned int __a, unsigned int __b) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_gtu(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b))); +} +__DEVICE__ unsigned int __f16x2_eq_mask(unsigned int __a, unsigned int __b) { + return __nv_f16x2_eq_mask(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b)); +} +__DEVICE__ unsigned int __f16x2_ne_mask(unsigned int __a, unsigned int __b) { + return __nv_f16x2_ne_mask(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b)); +} +__DEVICE__ unsigned int __f16x2_le_mask(unsigned int __a, unsigned int __b) { + return __nv_f16x2_le_mask(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b)); +} +__DEVICE__ unsigned int __f16x2_ge_mask(unsigned int __a, unsigned int __b) { + return __nv_f16x2_ge_mask(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b)); +} +__DEVICE__ unsigned int __f16x2_lt_mask(unsigned int __a, unsigned int __b) { + return __nv_f16x2_lt_mask(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b)); +} +__DEVICE__ unsigned int __f16x2_gt_mask(unsigned int __a, unsigned int __b) { + return __nv_f16x2_gt_mask(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b)); +} +__DEVICE__ unsigned int __f16x2_equ_mask(unsigned int __a, unsigned int __b) { + return __nv_f16x2_equ_mask(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b)); +} +__DEVICE__ unsigned int __f16x2_neu_mask(unsigned int __a, unsigned int __b) { + return __nv_f16x2_neu_mask(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b)); +} +__DEVICE__ unsigned int __f16x2_leu_mask(unsigned int __a, unsigned int __b) { + return __nv_f16x2_leu_mask(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b)); +} +__DEVICE__ unsigned int __f16x2_geu_mask(unsigned int __a, unsigned int __b) { + return __nv_f16x2_geu_mask(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b)); +} +__DEVICE__ unsigned int __f16x2_ltu_mask(unsigned int __a, unsigned int __b) { + return __nv_f16x2_ltu_mask(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b)); +} +__DEVICE__ unsigned int __f16x2_gtu_mask(unsigned int __a, unsigned int __b) { + return __nv_f16x2_gtu_mask(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b)); +} +__DEVICE__ unsigned short __f16_eq(unsigned short __a, unsigned short __b) { + return __nv_f16_eq(__builtin_bit_cast(_Float16, __a), + __builtin_bit_cast(_Float16, __b)); +} +__DEVICE__ unsigned short __f16_ne(unsigned short __a, unsigned short __b) { + return __nv_f16_ne(__builtin_bit_cast(_Float16, __a), + __builtin_bit_cast(_Float16, __b)); +} +__DEVICE__ unsigned short __f16_le(unsigned short __a, unsigned short __b) { + return __nv_f16_le(__builtin_bit_cast(_Float16, __a), + __builtin_bit_cast(_Float16, __b)); +} +__DEVICE__ unsigned short __f16_ge(unsigned short __a, unsigned short __b) { + return __nv_f16_ge(__builtin_bit_cast(_Float16, __a), + __builtin_bit_cast(_Float16, __b)); +} +__DEVICE__ unsigned short __f16_lt(unsigned short __a, unsigned short __b) { + return __nv_f16_lt(__builtin_bit_cast(_Float16, __a), + __builtin_bit_cast(_Float16, __b)); +} +__DEVICE__ unsigned short __f16_gt(unsigned short __a, unsigned short __b) { + return __nv_f16_gt(__builtin_bit_cast(_Float16, __a), + __builtin_bit_cast(_Float16, __b)); +} +__DEVICE__ unsigned short __f16_equ(unsigned short __a, unsigned short __b) { + return __nv_f16_equ(__builtin_bit_cast(_Float16, __a), + __builtin_bit_cast(_Float16, __b)); +} +__DEVICE__ unsigned short __f16_neu(unsigned short __a, unsigned short __b) { + return __nv_f16_neu(__builtin_bit_cast(_Float16, __a), + __builtin_bit_cast(_Float16, __b)); +} +__DEVICE__ unsigned short __f16_leu(unsigned short __a, unsigned short __b) { + return __nv_f16_leu(__builtin_bit_cast(_Float16, __a), + __builtin_bit_cast(_Float16, __b)); +} +__DEVICE__ unsigned short __f16_geu(unsigned short __a, unsigned short __b) { + return __nv_f16_geu(__builtin_bit_cast(_Float16, __a), + __builtin_bit_cast(_Float16, __b)); +} +__DEVICE__ unsigned short __f16_ltu(unsigned short __a, unsigned short __b) { + return __nv_f16_ltu(__builtin_bit_cast(_Float16, __a), + __builtin_bit_cast(_Float16, __b)); +} +__DEVICE__ unsigned short __f16_gtu(unsigned short __a, unsigned short __b) { + return __nv_f16_gtu(__builtin_bit_cast(_Float16, __a), + __builtin_bit_cast(_Float16, __b)); +} + +// fp16 arithmetic functions +__DEVICE__ unsigned int __f16x2_add(unsigned int __a, unsigned int __b) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_add(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b))); +} +__DEVICE__ unsigned int __f16x2_sub(unsigned int __a, unsigned int __b) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_sub(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b))); +} +__DEVICE__ unsigned int __f16x2_mul(unsigned int __a, unsigned int __b) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_mul(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b))); +} +__DEVICE__ unsigned int __f16x2_add_sat(unsigned int __a, unsigned int __b) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_add_sat(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b))); +} +__DEVICE__ unsigned int __f16x2_sub_sat(unsigned int __a, unsigned int __b) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_sub_sat(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b))); +} +__DEVICE__ unsigned int __f16x2_mul_sat(unsigned int __a, unsigned int __b) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_mul_sat(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b))); +} +__DEVICE__ unsigned int __f16x2_add_rn(unsigned int __a, unsigned int __b) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_add_rn(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b))); +} +__DEVICE__ unsigned int __f16x2_sub_rn(unsigned int __a, unsigned int __b) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_sub_rn(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b))); +} +__DEVICE__ unsigned int __f16x2_mul_rn(unsigned int __a, unsigned int __b) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_mul_rn(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b))); +} +__DEVICE__ unsigned int __f16x2_fma(unsigned int __a, unsigned int __b, + unsigned int __c) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_fma(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b), + __builtin_bit_cast(_Float16x2, __c))); +} +__DEVICE__ unsigned int __f16x2_fma_sat(unsigned int __a, unsigned int __b, + unsigned int __c) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_fma_sat(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b), + __builtin_bit_cast(_Float16x2, __c))); +} +__DEVICE__ unsigned int __f16x2_div(unsigned int __a, unsigned int __b) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_div(__builtin_bit_cast(_Float16x2, __a), + __builtin_bit_cast(_Float16x2, __b))); +} +__DEVICE__ unsigned short __f16_add(unsigned short __a, unsigned short __b) { + return __builtin_bit_cast(unsigned short, + __nv_f16_add(__builtin_bit_cast(_Float16, __a), + __builtin_bit_cast(_Float16, __b))); +} +__DEVICE__ unsigned short __f16_sub(unsigned short __a, unsigned short __b) { + return __builtin_bit_cast(unsigned short, + __nv_f16_sub(__builtin_bit_cast(_Float16, __a), + __builtin_bit_cast(_Float16, __b))); +} +__DEVICE__ unsigned short __f16_mul(unsigned short __a, unsigned short __b) { + return __builtin_bit_cast(unsigned short, + __nv_f16_mul(__builtin_bit_cast(_Float16, __a), + __builtin_bit_cast(_Float16, __b))); +} +__DEVICE__ unsigned short __f16_add_sat(unsigned short __a, + unsigned short __b) { + return __builtin_bit_cast( + unsigned short, __nv_f16_add_sat(__builtin_bit_cast(_Float16, __a), + __builtin_bit_cast(_Float16, __b))); +} +__DEVICE__ unsigned short __f16_sub_sat(unsigned short __a, + unsigned short __b) { + return __builtin_bit_cast( + unsigned short, __nv_f16_sub_sat(__builtin_bit_cast(_Float16, __a), + __builtin_bit_cast(_Float16, __b))); +} +__DEVICE__ unsigned short __f16_mul_sat(unsigned short __a, + unsigned short __b) { + return __builtin_bit_cast( + unsigned short, __nv_f16_mul_sat(__builtin_bit_cast(_Float16, __a), + __builtin_bit_cast(_Float16, __b))); +} +__DEVICE__ unsigned short __f16_add_rn(unsigned short __a, unsigned short __b) { + return __builtin_bit_cast(unsigned short, + __nv_f16_add_rn(__builtin_bit_cast(_Float16, __a), + __builtin_bit_cast(_Float16, __b))); +} +__DEVICE__ unsigned short __f16_sub_rn(unsigned short __a, unsigned short __b) { + return __builtin_bit_cast(unsigned short, + __nv_f16_sub_rn(__builtin_bit_cast(_Float16, __a), + __builtin_bit_cast(_Float16, __b))); +} +__DEVICE__ unsigned short __f16_mul_rn(unsigned short __a, unsigned short __b) { + return __builtin_bit_cast(unsigned short, + __nv_f16_mul_rn(__builtin_bit_cast(_Float16, __a), + __builtin_bit_cast(_Float16, __b))); +} +__DEVICE__ unsigned short __f16_fma(unsigned short __a, unsigned short __b, + unsigned short __c) { + return __builtin_bit_cast(unsigned short, + __nv_f16_fma(__builtin_bit_cast(_Float16, __a), + __builtin_bit_cast(_Float16, __b), + __builtin_bit_cast(_Float16, __c))); +} +__DEVICE__ unsigned short __f16_div(unsigned short __a, unsigned short __b) { + return __builtin_bit_cast(unsigned short, + __nv_f16_div(__builtin_bit_cast(_Float16, __a), + __builtin_bit_cast(_Float16, __b))); +} +__DEVICE__ unsigned short __f16_fma_sat(unsigned short __a, unsigned short __b, + unsigned short __c) { + return __builtin_bit_cast( + unsigned short, __nv_f16_fma_sat(__builtin_bit_cast(_Float16, __a), + __builtin_bit_cast(_Float16, __b), + __builtin_bit_cast(_Float16, __c))); +} + +// fp16 transcendental functions +__DEVICE__ unsigned short __f16_sin(unsigned short __x) { + return __builtin_bit_cast(unsigned short, + __nv_f16_sin(__builtin_bit_cast(_Float16, __x))); +} +__DEVICE__ unsigned int __f16x2_sin(unsigned int __x) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_sin(__builtin_bit_cast(_Float16x2, __x))); +} +__DEVICE__ unsigned short __f16_cos(unsigned short __x) { + return __builtin_bit_cast(unsigned short, + __nv_f16_cos(__builtin_bit_cast(_Float16, __x))); +} +__DEVICE__ unsigned int __f16x2_cos(unsigned int __x) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_cos(__builtin_bit_cast(_Float16x2, __x))); +} +__DEVICE__ unsigned short __f16_exp(unsigned short __x) { + return __builtin_bit_cast(unsigned short, + __nv_f16_exp(__builtin_bit_cast(_Float16, __x))); +} +__DEVICE__ unsigned int __f16x2_exp(unsigned int __x) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_exp(__builtin_bit_cast(_Float16x2, __x))); +} +__DEVICE__ unsigned short __f16_tanh(unsigned short __x) { + return __builtin_bit_cast(unsigned short, + __nv_f16_tanh(__builtin_bit_cast(_Float16, __x))); +} +__DEVICE__ unsigned int __f16x2_tanh(unsigned int __x) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_tanh(__builtin_bit_cast(_Float16x2, __x))); +} +__DEVICE__ unsigned short __f16_tanh_approx(unsigned short __x) { + return __builtin_bit_cast( + unsigned short, __nv_f16_tanh_approx(__builtin_bit_cast(_Float16, __x))); +} +__DEVICE__ unsigned int __f16x2_tanh_approx(unsigned int __x) { + return __builtin_bit_cast( + unsigned int, + __nv_f16x2_tanh_approx(__builtin_bit_cast(_Float16x2, __x))); +} +__DEVICE__ unsigned short __f16_exp2(unsigned short __x) { + return __builtin_bit_cast(unsigned short, + __nv_f16_exp2(__builtin_bit_cast(_Float16, __x))); +} +__DEVICE__ unsigned int __f16x2_exp2(unsigned int __x) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_exp2(__builtin_bit_cast(_Float16x2, __x))); +} +__DEVICE__ unsigned short __f16_exp10(unsigned short __x) { + return __builtin_bit_cast(unsigned short, + __nv_f16_exp10(__builtin_bit_cast(_Float16, __x))); +} +__DEVICE__ unsigned int __f16x2_exp10(unsigned int __x) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_exp10(__builtin_bit_cast(_Float16x2, __x))); +} +__DEVICE__ unsigned short __f16_log2(unsigned short __x) { + return __builtin_bit_cast(unsigned short, + __nv_f16_log2(__builtin_bit_cast(_Float16, __x))); +} +__DEVICE__ unsigned int __f16x2_log2(unsigned int __x) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_log2(__builtin_bit_cast(_Float16x2, __x))); +} +__DEVICE__ unsigned short __f16_log(unsigned short __x) { + return __builtin_bit_cast(unsigned short, + __nv_f16_log(__builtin_bit_cast(_Float16, __x))); +} +__DEVICE__ unsigned int __f16x2_log(unsigned int __x) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_log(__builtin_bit_cast(_Float16x2, __x))); +} +__DEVICE__ unsigned short __f16_log10(unsigned short __x) { + return __builtin_bit_cast(unsigned short, + __nv_f16_log10(__builtin_bit_cast(_Float16, __x))); +} +__DEVICE__ unsigned int __f16x2_log10(unsigned int __x) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_log10(__builtin_bit_cast(_Float16x2, __x))); +} +__DEVICE__ unsigned int __f16x2_rcp(unsigned int __x) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_rcp(__builtin_bit_cast(_Float16x2, __x))); +} +__DEVICE__ unsigned short __f16_rcp(unsigned short __x) { + return __builtin_bit_cast(unsigned short, + __nv_f16_rcp(__builtin_bit_cast(_Float16, __x))); +} +__DEVICE__ unsigned short __f16_rsqrt(unsigned short __x) { + return __builtin_bit_cast(unsigned short, + __nv_f16_rsqrt(__builtin_bit_cast(_Float16, __x))); +} +__DEVICE__ unsigned int __f16x2_rsqrt(unsigned int __x) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_rsqrt(__builtin_bit_cast(_Float16x2, __x))); +} +__DEVICE__ unsigned int __f16x2_sqrt(unsigned int __x) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_sqrt(__builtin_bit_cast(_Float16x2, __x))); +} +__DEVICE__ unsigned short __f16_sqrt(unsigned short __x) { + return __builtin_bit_cast(unsigned short, + __nv_f16_sqrt(__builtin_bit_cast(_Float16, __x))); +} +__DEVICE__ unsigned int __f16x2_isnan(unsigned int __x) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_isnan(__builtin_bit_cast(_Float16x2, __x))); +} +__DEVICE__ unsigned short __f16_isnan(unsigned short __x) { + return __builtin_bit_cast(unsigned short, + __nv_f16_isnan(__builtin_bit_cast(_Float16, __x))); +} +__DEVICE__ unsigned int __f16x2_neg(unsigned int __x) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_neg(__builtin_bit_cast(_Float16x2, __x))); +} +__DEVICE__ unsigned short __f16_neg(unsigned short __x) { + return __builtin_bit_cast(unsigned short, + __nv_f16_neg(__builtin_bit_cast(_Float16, __x))); +} +__DEVICE__ unsigned int __f16x2_abs(unsigned int __x) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_abs(__builtin_bit_cast(_Float16x2, __x))); +} +__DEVICE__ unsigned short __f16_abs(unsigned short __x) { + return __builtin_bit_cast(unsigned short, + __nv_f16_abs(__builtin_bit_cast(_Float16, __x))); +} +__DEVICE__ unsigned short __f16_max_nan(unsigned short __x, + unsigned short __y) { + return __builtin_bit_cast( + unsigned short, __nv_f16_max_nan(__builtin_bit_cast(_Float16, __x), + __builtin_bit_cast(_Float16, __y))); +} +__DEVICE__ unsigned short __f16_min_nan(unsigned short __x, + unsigned short __y) { + return __builtin_bit_cast( + unsigned short, __nv_f16_min_nan(__builtin_bit_cast(_Float16, __x), + __builtin_bit_cast(_Float16, __y))); +} +__DEVICE__ unsigned short __f16_fma_relu(unsigned short __x, unsigned short __y, + unsigned short __z) { + return __builtin_bit_cast( + unsigned short, __nv_f16_fma_relu(__builtin_bit_cast(_Float16, __x), + __builtin_bit_cast(_Float16, __y), + __builtin_bit_cast(_Float16, __z))); +} +__DEVICE__ unsigned int __f16x2_max_nan(unsigned int __x, unsigned int __y) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_max_nan(__builtin_bit_cast(_Float16x2, __x), + __builtin_bit_cast(_Float16x2, __y))); +} +__DEVICE__ unsigned int __f16x2_min_nan(unsigned int __x, unsigned int __y) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_min_nan(__builtin_bit_cast(_Float16x2, __x), + __builtin_bit_cast(_Float16x2, __y))); +} +__DEVICE__ unsigned int __f16x2_fma_relu(unsigned int __x, unsigned int __y, + unsigned int __z) { + return __builtin_bit_cast( + unsigned int, __nv_f16x2_fma_relu(__builtin_bit_cast(_Float16x2, __x), + __builtin_bit_cast(_Float16x2, __y), + __builtin_bit_cast(_Float16x2, __z))); +} +#endif // CUDA_VERSION >= 13030 + // For OpenMP we require the user to include <time.h> as we need to know what // clock_t is on the system. #ifndef __OPENMP_NVPTX__ diff --git a/clang/lib/Headers/__clang_cuda_libdevice_declares.h b/clang/lib/Headers/__clang_cuda_libdevice_declares.h index ded0382a7ddce..1f237b431ebf9 100644 --- a/clang/lib/Headers/__clang_cuda_libdevice_declares.h +++ b/clang/lib/Headers/__clang_cuda_libdevice_declares.h @@ -458,6 +458,196 @@ __DEVICE__ float __nv_y1f(float __a); __DEVICE__ float __nv_ynf(int __a, float __b); __DEVICE__ double __nv_yn(int __a, double __b); +#if CUDA_VERSION >= 13030 +typedef _Float16 _Float16x2 __attribute__((ext_vector_type(2))); + +__device__ _Float16 __nv_f16_double2half(double __a); +__device__ _Float16 __nv_f16_float2half_rn(float __a); +__device__ _Float16 __nv_f16_float2half_rz(float __a); +__device__ _Float16 __nv_f16_float2half_rd(float __a); +__device__ _Float16 __nv_f16_float2half_ru(float __a); +__device__ _Float16x2 __nv_f16_float2half2_rn(float __a); +__device__ _Float16x2 __nv_f16_floats2half2_rn(float __a, float __b); +__device__ float __nv_f16_half2float(_Float16 __a); +__device__ float __nv_f16_low2float(_Float16x2 __a); +__device__ float __nv_f16_high2float(_Float16x2 __a); +__device__ char __nv_f16_half2char_rz(_Float16 __a); +__device__ unsigned char __nv_f16_half2uchar_rz(_Float16 __a); +__device__ short __nv_f16_half2short_rz(_Float16 __a); +__device__ unsigned short __nv_f16_half2ushort_rz(_Float16 __a); +__device__ int __nv_f16_half2int_rz(_Float16 __a); +__device__ unsigned int __nv_f16_half2uint_rz(_Float16 __a); +__device__ long long __nv_f16_half2ll_rz(_Float16 __a); +__device__ unsigned long long __nv_f16_half2ull_rz(_Float16 __a); +__device__ float2 __nv_f16_half22float2(_Float16x2 __a); +__device__ int __nv_f16_half2int_rn(_Float16 __a); +__device__ int __nv_f16_half2int_rd(_Float16 __a); +__device__ int __nv_f16_half2int_ru(_Float16 __a); +__device__ _Float16 __nv_f16_int2half_rn(int __a); +__device__ _Float16 __nv_f16_int2half_rz(int __a); +__device__ _Float16 __nv_f16_int2half_rd(int __a); +__device__ _Float16 __nv_f16_int2half_ru(int __a); +__device__ short __nv_f16_half2short_rn(_Float16 __a); +__device__ short __nv_f16_half2short_rd(_Float16 __a); +__device__ short __nv_f16_half2short_ru(_Float16 __a); +__device__ _Float16 __nv_f16_short2half_rn(short __a); +__device__ _Float16 __nv_f16_short2half_rz(short __a); +__device__ _Float16 __nv_f16_short2half_rd(short __a); +__device__ _Float16 __nv_f16_short2half_ru(short __a); +__device__ unsigned int __nv_f16_half2uint_rn(_Float16 __a); +__device__ unsigned int __nv_f16_half2uint_rd(_Float16 __a); +__device__ unsigned int __nv_f16_half2uint_ru(_Float16 __a); +__device__ _Float16 __nv_f16_uint2half_rn(unsigned int __a); +__device__ _Float16 __nv_f16_uint2half_rz(unsigned int __a); +__device__ _Float16 __nv_f16_uint2half_rd(unsigned int __a); +__device__ _Float16 __nv_f16_uint2half_ru(unsigned int __a); +__device__ unsigned short __nv_f16_half2ushort_rn(_Float16 __a); +__device__ unsigned short __nv_f16_half2ushort_rd(_Float16 __a); +__device__ unsigned short __nv_f16_half2ushort_ru(_Float16 __a); +__device__ _Float16 __nv_f16_ushort2half_rn(unsigned short __a); +__device__ _Float16 __nv_f16_ushort2half_rz(unsigned short __a); +__device__ _Float16 __nv_f16_ushort2half_rd(unsigned short __a); +__device__ _Float16 __nv_f16_ushort2half_ru(unsigned short __a); +__device__ unsigned long long __nv_f16_half2ull_rn(_Float16 __a); +__device__ unsigned long long __nv_f16_half2ull_rd(_Float16 __a); +__device__ unsigned long long __nv_f16_half2ull_ru(_Float16 __a); +__device__ _Float16 __nv_f16_ull2half_rn(unsigned long long __a); +__device__ _Float16 __nv_f16_ull2half_rz(unsigned long long __a); +__device__ _Float16 __nv_f16_ull2half_rd(unsigned long long __a); +__device__ _Float16 __nv_f16_ull2half_ru(unsigned long long __a); +__device__ long long __nv_f16_half2ll_rn(_Float16 __a); +__device__ long long __nv_f16_half2ll_rd(_Float16 __a); +__device__ long long __nv_f16_half2ll_ru(_Float16 __a); +__device__ _Float16 __nv_f16_ll2half_rn(long long __a); +__device__ _Float16 __nv_f16_ll2half_rz(long long __a); +__device__ _Float16 __nv_f16_ll2half_rd(long long __a); +__device__ _Float16 __nv_f16_ll2half_ru(long long __a); +__device__ _Float16 __nv_f16_trunc(_Float16 __a); +__device__ _Float16 __nv_f16_ceil(_Float16 __a); +__device__ _Float16 __nv_f16_floor(_Float16 __a); +__device__ _Float16 __nv_f16_rint(_Float16 __a); +__device__ _Float16x2 __nv_f16x2_trunc(_Float16x2 __a); +__device__ _Float16x2 __nv_f16x2_floor(_Float16x2 __a); +__device__ _Float16x2 __nv_f16x2_ceil(_Float16x2 __a); +__device__ _Float16x2 __nv_f16x2_rint(_Float16x2 __a); +__device__ _Float16x2 __nv_f16_lows2half2(_Float16x2 __a, _Float16x2 __b); +__device__ _Float16x2 __nv_f16_highs2half2(_Float16x2 __a, _Float16x2 __b); +__device__ _Float16 __nv_f16_low2half(_Float16x2 __a); +__device__ _Float16x2 __nv_f16_low2half2(_Float16x2 __a); +__device__ _Float16x2 __nv_f16_high2half2(_Float16x2 __a); +__device__ _Float16 __nv_f16_high2half(_Float16x2 __a); +__device__ _Float16x2 __nv_f16_halves2half2(_Float16 __a, _Float16 __b); +__device__ _Float16x2 __nv_f16_half2half2(_Float16 __a); +__device__ _Float16x2 __nv_f16_lowhigh2highlow(_Float16x2 __a); +__device__ _Float16 __nv_f16_max(_Float16 __a, _Float16 __b); +__device__ _Float16 __nv_f16_min(_Float16 __a, _Float16 __b); +__device__ _Float16x2 __nv_f16x2_max(_Float16x2 __a, _Float16x2 __b); +__device__ _Float16x2 __nv_f16x2_min(_Float16x2 __a, _Float16x2 __b); +__device__ _Float16x2 __nv_f16x2_eq(_Float16x2 __a, _Float16x2 __b); +__device__ _Float16x2 __nv_f16x2_ne(_Float16x2 __a, _Float16x2 __b); +__device__ _Float16x2 __nv_f16x2_le(_Float16x2 __a, _Float16x2 __b); +__device__ _Float16x2 __nv_f16x2_ge(_Float16x2 __a, _Float16x2 __b); +__device__ _Float16x2 __nv_f16x2_lt(_Float16x2 __a, _Float16x2 __b); +__device__ _Float16x2 __nv_f16x2_gt(_Float16x2 __a, _Float16x2 __b); +__device__ _Float16x2 __nv_f16x2_equ(_Float16x2 __a, _Float16x2 __b); +__device__ _Float16x2 __nv_f16x2_neu(_Float16x2 __a, _Float16x2 __b); +__device__ _Float16x2 __nv_f16x2_leu(_Float16x2 __a, _Float16x2 __b); +__device__ _Float16x2 __nv_f16x2_geu(_Float16x2 __a, _Float16x2 __b); +__device__ _Float16x2 __nv_f16x2_ltu(_Float16x2 __a, _Float16x2 __b); +__device__ _Float16x2 __nv_f16x2_gtu(_Float16x2 __a, _Float16x2 __b); +__device__ unsigned int __nv_f16x2_eq_mask(_Float16x2 __a, _Float16x2 __b); +__device__ unsigned int __nv_f16x2_ne_mask(_Float16x2 __a, _Float16x2 __b); +__device__ unsigned int __nv_f16x2_le_mask(_Float16x2 __a, _Float16x2 __b); +__device__ unsigned int __nv_f16x2_ge_mask(_Float16x2 __a, _Float16x2 __b); +__device__ unsigned int __nv_f16x2_lt_mask(_Float16x2 __a, _Float16x2 __b); +__device__ unsigned int __nv_f16x2_gt_mask(_Float16x2 __a, _Float16x2 __b); +__device__ unsigned int __nv_f16x2_equ_mask(_Float16x2 __a, _Float16x2 __b); +__device__ unsigned int __nv_f16x2_neu_mask(_Float16x2 __a, _Float16x2 __b); +__device__ unsigned int __nv_f16x2_leu_mask(_Float16x2 __a, _Float16x2 __b); +__device__ unsigned int __nv_f16x2_geu_mask(_Float16x2 __a, _Float16x2 __b); +__device__ unsigned int __nv_f16x2_ltu_mask(_Float16x2 __a, _Float16x2 __b); +__device__ unsigned int __nv_f16x2_gtu_mask(_Float16x2 __a, _Float16x2 __b); +__device__ unsigned short __nv_f16_eq(_Float16 __a, _Float16 __b); +__device__ unsigned short __nv_f16_ne(_Float16 __a, _Float16 __b); +__device__ unsigned short __nv_f16_le(_Float16 __a, _Float16 __b); +__device__ unsigned short __nv_f16_ge(_Float16 __a, _Float16 __b); +__device__ unsigned short __nv_f16_lt(_Float16 __a, _Float16 __b); +__device__ unsigned short __nv_f16_gt(_Float16 __a, _Float16 __b); +__device__ unsigned short __nv_f16_equ(_Float16 __a, _Float16 __b); +__device__ unsigned short __nv_f16_neu(_Float16 __a, _Float16 __b); +__device__ unsigned short __nv_f16_leu(_Float16 __a, _Float16 __b); +__device__ unsigned short __nv_f16_geu(_Float16 __a, _Float16 __b); +__device__ unsigned short __nv_f16_ltu(_Float16 __a, _Float16 __b); +__device__ unsigned short __nv_f16_gtu(_Float16 __a, _Float16 __b); +__device__ _Float16x2 __nv_f16x2_add(_Float16x2 __a, _Float16x2 __b); +__device__ _Float16x2 __nv_f16x2_sub(_Float16x2 __a, _Float16x2 __b); +__device__ _Float16x2 __nv_f16x2_mul(_Float16x2 __a, _Float16x2 __b); +__device__ _Float16x2 __nv_f16x2_add_sat(_Float16x2 __a, _Float16x2 __b); +__device__ _Float16x2 __nv_f16x2_sub_sat(_Float16x2 __a, _Float16x2 __b); +__device__ _Float16x2 __nv_f16x2_mul_sat(_Float16x2 __a, _Float16x2 __b); +__device__ _Float16x2 __nv_f16x2_add_rn(_Float16x2 __a, _Float16x2 __b); +__device__ _Float16x2 __nv_f16x2_sub_rn(_Float16x2 __a, _Float16x2 __b); +__device__ _Float16x2 __nv_f16x2_mul_rn(_Float16x2 __a, _Float16x2 __b); +__device__ _Float16x2 __nv_f16x2_fma(_Float16x2 __a, _Float16x2 __b, + _Float16x2 __c); +__device__ _Float16x2 __nv_f16x2_fma_sat(_Float16x2 __a, _Float16x2 __b, + _Float16x2 __c); +__device__ _Float16x2 __nv_f16x2_div(_Float16x2 __a, _Float16x2 __b); +__device__ _Float16 __nv_f16_add(_Float16 __a, _Float16 __b); +__device__ _Float16 __nv_f16_sub(_Float16 __a, _Float16 __b); +__device__ _Float16 __nv_f16_mul(_Float16 __a, _Float16 __b); +__device__ _Float16 __nv_f16_add_sat(_Float16 __a, _Float16 __b); +__device__ _Float16 __nv_f16_sub_sat(_Float16 __a, _Float16 __b); +__device__ _Float16 __nv_f16_mul_sat(_Float16 __a, _Float16 __b); +__device__ _Float16 __nv_f16_add_rn(_Float16 __a, _Float16 __b); +__device__ _Float16 __nv_f16_sub_rn(_Float16 __a, _Float16 __b); +__device__ _Float16 __nv_f16_mul_rn(_Float16 __a, _Float16 __b); +__device__ _Float16 __nv_f16_fma(_Float16 __a, _Float16 __b, _Float16 __c); +__device__ _Float16 __nv_f16_div(_Float16 __a, _Float16 __b); +__device__ _Float16 __nv_f16_fma_sat(_Float16 __a, _Float16 __b, _Float16 __c); +__device__ _Float16 __nv_f16_sin(_Float16 __a); +__device__ _Float16x2 __nv_f16x2_sin(_Float16x2 __a); +__device__ _Float16 __nv_f16_cos(_Float16 __a); +__device__ _Float16x2 __nv_f16x2_cos(_Float16x2 __a); +__device__ _Float16 __nv_f16_exp(_Float16 __a); +__device__ _Float16x2 __nv_f16x2_exp(_Float16x2 __a); +__device__ _Float16 __nv_f16_tanh(_Float16 __a); +__device__ _Float16x2 __nv_f16x2_tanh(_Float16x2 __a); +__device__ _Float16 __nv_f16_tanh_approx(_Float16 __a); +__device__ _Float16x2 __nv_f16x2_tanh_approx(_Float16x2 __a); +__device__ _Float16 __nv_f16_exp2(_Float16 __a); +__device__ _Float16x2 __nv_f16x2_exp2(_Float16x2 __a); +__device__ _Float16 __nv_f16_exp10(_Float16 __a); +__device__ _Float16x2 __nv_f16x2_exp10(_Float16x2 __a); +__device__ _Float16 __nv_f16_log2(_Float16 __a); +__device__ _Float16x2 __nv_f16x2_log2(_Float16x2 __a); +__device__ _Float16 __nv_f16_log(_Float16 __a); +__device__ _Float16x2 __nv_f16x2_log(_Float16x2 __a); +__device__ _Float16 __nv_f16_log10(_Float16 __a); +__device__ _Float16x2 __nv_f16x2_log10(_Float16x2 __a); +__device__ _Float16x2 __nv_f16x2_rcp(_Float16x2 __a); +__device__ _Float16 __nv_f16_rcp(_Float16 __a); +__device__ _Float16 __nv_f16_rsqrt(_Float16 __a); +__device__ _Float16x2 __nv_f16x2_rsqrt(_Float16x2 __a); +__device__ _Float16x2 __nv_f16x2_sqrt(_Float16x2 __a); +__device__ _Float16 __nv_f16_sqrt(_Float16 __a); +__device__ _Float16x2 __nv_f16x2_isnan(_Float16x2 __a); +__device__ _Float16 __nv_f16_isnan(_Float16 __a); +__device__ _Float16x2 __nv_f16x2_neg(_Float16x2 __a); +__device__ _Float16 __nv_f16_neg(_Float16 __a); +__device__ _Float16x2 __nv_f16x2_abs(_Float16x2 __a); +__device__ _Float16 __nv_f16_abs(_Float16 __a); +__device__ _Float16 __nv_f16_max_nan(_Float16 __a, _Float16 __b); +__device__ _Float16 __nv_f16_min_nan(_Float16 __a, _Float16 __b); +__device__ _Float16 __nv_f16_fma_relu(_Float16 __a, _Float16 __b, _Float16 __c); +__device__ _Float16x2 __nv_f16x2_max_nan(_Float16x2 __a, _Float16x2 __b); +__device__ _Float16x2 __nv_f16x2_min_nan(_Float16x2 __a, _Float16x2 __b); +__device__ _Float16x2 __nv_f16x2_fma_relu(_Float16x2 __a, _Float16x2 __b, + _Float16x2 __c); +__device__ _Float16x2 __nv_f16x2_atomic_add(_Float16x2 *__a, _Float16x2 __b); +__device__ _Float16 __nv_f16_atomic_add(_Float16 *__a, _Float16 __b); +#endif // CUDA_VERSION >= 13030 + #if defined(__OPENMP_NVPTX__) #pragma omp end assumes ext_spmd_amenable no_openmp #endif _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
