Author: Rana Pratap Reddy Date: 2026-03-10T10:20:40+05:30 New Revision: a17289b76ae31efdd5b6ce0ed8da04b1b9185a33
URL: https://github.com/llvm/llvm-project/commit/a17289b76ae31efdd5b6ce0ed8da04b1b9185a33 DIFF: https://github.com/llvm/llvm-project/commit/a17289b76ae31efdd5b6ce0ed8da04b1b9185a33.diff LOG: [Clang][AMDGPU] Change __fp16 to _Float16 in builtin definitions (#185446) Change the type signature of `SWMMAC, load, cvt` builtins from `__fp16` to `_Float16` in the tablegen builtin definitions. Added: clang/test/CodeGenHIP/builtins-amdgcn-f16-misc.hip clang/test/CodeGenHIP/builtins-amdgcn-gfx12-f16-w32.hip clang/test/CodeGenHIP/builtins-amdgcn-gfx12-f16-w64.hip clang/test/CodeGenHIP/builtins-amdgcn-gfx1250-f16-misc.hip clang/test/CodeGenHIP/builtins-amdgcn-gfx950-f16.hip Modified: clang/include/clang/Basic/BuiltinsAMDGPU.td clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-load-tr.cl clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-read-tr.cl clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl clang/test/SemaHIP/hip-builtin-lvalue-to-rvalue.hip Removed: ################################################################################ diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td index acd0a34a79253..18aebdc38bcfc 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.td +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td @@ -223,7 +223,7 @@ def __builtin_amdgcn_alignbit : AMDGPUBuiltin<"unsigned int(unsigned int, unsign def __builtin_amdgcn_alignbyte : AMDGPUBuiltin<"unsigned int(unsigned int, unsigned int, unsigned int)", [Const]>; def __builtin_amdgcn_ubfe : AMDGPUBuiltin<"unsigned int(unsigned int, unsigned int, unsigned int)", [Const]>; def __builtin_amdgcn_sbfe : AMDGPUBuiltin<"unsigned int(unsigned int, unsigned int, unsigned int)", 
[Const]>; -def __builtin_amdgcn_cvt_pkrtz : AMDGPUBuiltin<"_ExtVector<2, __fp16>(float, float)", [Const]>; +def __builtin_amdgcn_cvt_pkrtz : AMDGPUBuiltin<"_ExtVector<2, _Float16>(float, float)", [Const]>; def __builtin_amdgcn_cvt_pknorm_i16 : AMDGPUBuiltin<"_ExtVector<2, short>(float, float)", [Const], "cvt-pknorm-vop2-insts">; def __builtin_amdgcn_cvt_pknorm_u16 : AMDGPUBuiltin<"_ExtVector<2, unsigned short>(float, float)", [Const], "cvt-pknorm-vop2-insts">; def __builtin_amdgcn_cvt_pk_i16 : AMDGPUBuiltin<"_ExtVector<2, short>(int, int)", [Const]>; @@ -319,7 +319,7 @@ def __builtin_amdgcn_ds_gws_sema_release_all : AMDGPUBuiltin<"void(unsigned int) // Interpolation builtins. //===----------------------------------------------------------------------===// def __builtin_amdgcn_interp_p1_f16 : AMDGPUBuiltin<"float(float, unsigned int, unsigned int, bool, unsigned int)", [Const]>; -def __builtin_amdgcn_interp_p2_f16 : AMDGPUBuiltin<"__fp16(float, float, unsigned int, unsigned int, bool, unsigned int)", [Const]>; +def __builtin_amdgcn_interp_p2_f16 : AMDGPUBuiltin<"_Float16(float, float, unsigned int, unsigned int, bool, unsigned int)", [Const]>; def __builtin_amdgcn_interp_p1 : AMDGPUBuiltin<"float(float, unsigned int, unsigned int, unsigned int)", [Const]>; def __builtin_amdgcn_interp_p2 : AMDGPUBuiltin<"float(float, float, unsigned int, unsigned int, unsigned int)", [Const]>; def __builtin_amdgcn_interp_mov : AMDGPUBuiltin<"float(unsigned int, unsigned int, unsigned int, unsigned int)", [Const]>; @@ -349,7 +349,7 @@ def __builtin_amdgcn_perm : AMDGPUBuiltin<"unsigned int(unsigned int, unsigned i // GFX9+ only builtins. 
//===----------------------------------------------------------------------===// -def __builtin_amdgcn_fmed3h : AMDGPUBuiltin<"__fp16(__fp16, __fp16, __fp16)", [Const], "gfx9-insts">; +def __builtin_amdgcn_fmed3h : AMDGPUBuiltin<"_Float16(_Float16, _Float16, _Float16)", [Const], "gfx9-insts">; def __builtin_amdgcn_global_atomic_fadd_f64 : AMDGPUBuiltin<"double(double address_space<1> *, double)", [], "gfx90a-insts">; def __builtin_amdgcn_global_atomic_fadd_f32 : AMDGPUBuiltin<"float(float address_space<1> *, float)", [], "atomic-fadd-rtn-insts">; @@ -669,7 +669,7 @@ def __builtin_amdgcn_ds_read_tr4_b64_v2i32 : AMDGPUBuiltin<"_ExtVector<2, int>(_ def __builtin_amdgcn_ds_read_tr6_b96_v3i32 : AMDGPUBuiltin<"_ExtVector<3, int>(_ExtVector<3, int> address_space<3> *)", [Const], "gfx950-insts">; def __builtin_amdgcn_ds_read_tr8_b64_v2i32 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int> address_space<3> *)", [Const], "gfx950-insts">; def __builtin_amdgcn_ds_read_tr16_b64_v4i16 : AMDGPUBuiltin<"_ExtVector<4, short>(_ExtVector<4, short> address_space<3> *)", [Const], "gfx950-insts">; -def __builtin_amdgcn_ds_read_tr16_b64_v4f16 : AMDGPUBuiltin<"_ExtVector<4, __fp16>(_ExtVector<4, __fp16> address_space<3> *)", [Const], "gfx950-insts">; +def __builtin_amdgcn_ds_read_tr16_b64_v4f16 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(_ExtVector<4, _Float16> address_space<3> *)", [Const], "gfx950-insts">; def __builtin_amdgcn_ds_read_tr16_b64_v4bf16 : AMDGPUBuiltin<"_ExtVector<4, __bf16>(_ExtVector<4, __bf16> address_space<3> *)", [Const], "gfx950-insts">; def __builtin_amdgcn_ashr_pk_i8_i32 : AMDGPUBuiltin<"unsigned short(unsigned int, unsigned int, unsigned int)", [Const], "ashr-pk-insts">; @@ -699,11 +699,11 @@ def __builtin_amdgcn_s_buffer_prefetch_data : AMDGPUBuiltin<"void(__amdgpu_buffe def __builtin_amdgcn_global_load_tr_b64_v2i32 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int> address_space<1> *)", [Const], "gfx12-insts,wavefrontsize32">; def 
__builtin_amdgcn_global_load_tr_b128_v8i16 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<8, short> address_space<1> *)", [Const], "gfx12-insts,wavefrontsize32">; -def __builtin_amdgcn_global_load_tr_b128_v8f16 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<8, __fp16> address_space<1> *)", [Const], "gfx12-insts,wavefrontsize32">; +def __builtin_amdgcn_global_load_tr_b128_v8f16 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<8, _Float16> address_space<1> *)", [Const], "gfx12-insts,wavefrontsize32">; def __builtin_amdgcn_global_load_tr_b128_v8bf16 : AMDGPUBuiltin<"_ExtVector<8, __bf16>(_ExtVector<8, __bf16> address_space<1> *)", [Const], "gfx12-insts,wavefrontsize32">; def __builtin_amdgcn_global_load_tr_b64_i32 : AMDGPUBuiltin<"int(int address_space<1> *)", [Const], "gfx12-insts,wavefrontsize64">; def __builtin_amdgcn_global_load_tr_b128_v4i16 : AMDGPUBuiltin<"_ExtVector<4, short>(_ExtVector<4, short> address_space<1> *)", [Const], "gfx12-insts,wavefrontsize64">; -def __builtin_amdgcn_global_load_tr_b128_v4f16 : AMDGPUBuiltin<"_ExtVector<4, __fp16>(_ExtVector<4, __fp16> address_space<1> *)", [Const], "gfx12-insts,wavefrontsize64">; +def __builtin_amdgcn_global_load_tr_b128_v4f16 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(_ExtVector<4, _Float16> address_space<1> *)", [Const], "gfx12-insts,wavefrontsize64">; def __builtin_amdgcn_global_load_tr_b128_v4bf16 : AMDGPUBuiltin<"_ExtVector<4, __bf16>(_ExtVector<4, __bf16> address_space<1> *)", [Const], "gfx12-insts,wavefrontsize64">; def __builtin_amdgcn_ds_bpermute_fi_b32 : AMDGPUBuiltin<"int(int, int)", [Const], "gfx12-insts">; @@ -828,9 +828,9 @@ def __builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12 : AMDGPUBuiltin<"_ExtVector let ArgNames = ["a_sign", "a", "b_sign", "b", "c", "clamp"]; } -def __builtin_amdgcn_swmmac_f32_16x16x32_f16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, __fp16>, _ExtVector<16, __fp16>, _ExtVector<8, float>, int)", [Const], "wmma-128b-insts,wavefrontsize32">; +def 
__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, _Float16>, _ExtVector<16, _Float16>, _ExtVector<8, float>, int)", [Const], "wmma-128b-insts,wavefrontsize32">; def __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, short>, _ExtVector<16, short>, _ExtVector<8, float>, int)", [Const], "wmma-128b-insts,wavefrontsize32">; -def __builtin_amdgcn_swmmac_f16_16x16x32_f16_w32 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<8, __fp16>, _ExtVector<16, __fp16>, _ExtVector<8, __fp16>, int)", [Const], "wmma-128b-insts,wavefrontsize32">; +def __builtin_amdgcn_swmmac_f16_16x16x32_f16_w32 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<8, _Float16>, _ExtVector<16, _Float16>, _ExtVector<8, _Float16>, int)", [Const], "wmma-128b-insts,wavefrontsize32">; def __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<8, short>, _ExtVector<16, short>, _ExtVector<8, short>, int)", [Const], "wmma-128b-insts,wavefrontsize32">; def __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<4, int>, _ExtVector<8, int>, int, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize32">; def __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, int, _Constant bool, _ExtVector<2, int>, _ExtVector<8, int>, int, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize32">; @@ -840,9 +840,9 @@ def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32 : AMDGPUBuiltin<"_ExtVector def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", [Const], "wmma-128b-insts,wavefrontsize32">; def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", 
[Const], "wmma-128b-insts,wavefrontsize32">; -def __builtin_amdgcn_swmmac_f32_16x16x32_f16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, __fp16>, _ExtVector<8, __fp16>, _ExtVector<4, float>, int)", [Const], "wmma-128b-insts,wavefrontsize64">; +def __builtin_amdgcn_swmmac_f32_16x16x32_f16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, _Float16>, _ExtVector<8, _Float16>, _ExtVector<4, float>, int)", [Const], "wmma-128b-insts,wavefrontsize64">; def __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, short>, _ExtVector<8, short>, _ExtVector<4, float>, int)", [Const], "wmma-128b-insts,wavefrontsize64">; -def __builtin_amdgcn_swmmac_f16_16x16x32_f16_w64 : AMDGPUBuiltin<"_ExtVector<4, __fp16>(_ExtVector<4, __fp16>, _ExtVector<8, __fp16>, _ExtVector<4, __fp16>, int)", [Const], "wmma-128b-insts,wavefrontsize64">; +def __builtin_amdgcn_swmmac_f16_16x16x32_f16_w64 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(_ExtVector<4, _Float16>, _ExtVector<8, _Float16>, _ExtVector<4, _Float16>, int)", [Const], "wmma-128b-insts,wavefrontsize64">; def __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64 : AMDGPUBuiltin<"_ExtVector<4, short>(_ExtVector<4, short>, _ExtVector<8, short>, _ExtVector<4, short>, int)", [Const], "wmma-128b-insts,wavefrontsize64">; def __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, _ExtVector<2, int>, _ExtVector<4, int>, int, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize64">; def __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, int, _ExtVector<4, int>, int, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize64">; @@ -947,13 +947,13 @@ def __builtin_amdgcn_global_load_tr4_b64_v2i32 : AMDGPUBuiltin<"_ExtVector<2, in def __builtin_amdgcn_global_load_tr8_b64_v2i32 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int> address_space<1> *)", [Const], 
"gfx1250-insts,wavefrontsize32">; def __builtin_amdgcn_global_load_tr6_b96_v3i32 : AMDGPUBuiltin<"_ExtVector<3, int>(_ExtVector<3, int> address_space<1> *)", [Const], "transpose-load-f4f6-insts,wavefrontsize32">; def __builtin_amdgcn_global_load_tr16_b128_v8i16 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<8, short> address_space<1> *)", [Const], "gfx1250-insts,wavefrontsize32">; -def __builtin_amdgcn_global_load_tr16_b128_v8f16 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<8, __fp16> address_space<1> *)", [Const], "gfx1250-insts,wavefrontsize32">; +def __builtin_amdgcn_global_load_tr16_b128_v8f16 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<8, _Float16> address_space<1> *)", [Const], "gfx1250-insts,wavefrontsize32">; def __builtin_amdgcn_global_load_tr16_b128_v8bf16 : AMDGPUBuiltin<"_ExtVector<8, __bf16>(_ExtVector<8, __bf16> address_space<1> *)", [Const], "gfx1250-insts,wavefrontsize32">; def __builtin_amdgcn_ds_load_tr4_b64_v2i32 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int> address_space<3> *)", [Const], "transpose-load-f4f6-insts,wavefrontsize32">; def __builtin_amdgcn_ds_load_tr8_b64_v2i32 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int> address_space<3> *)", [Const], "gfx1250-insts,wavefrontsize32">; def __builtin_amdgcn_ds_load_tr6_b96_v3i32 : AMDGPUBuiltin<"_ExtVector<3, int>(_ExtVector<3, int> address_space<3> *)", [Const], "transpose-load-f4f6-insts,wavefrontsize32">; def __builtin_amdgcn_ds_load_tr16_b128_v8i16 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<8, short> address_space<3> *)", [Const], "gfx1250-insts,wavefrontsize32">; -def __builtin_amdgcn_ds_load_tr16_b128_v8f16 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<8, __fp16> address_space<3> *)", [Const], "gfx1250-insts,wavefrontsize32">; +def __builtin_amdgcn_ds_load_tr16_b128_v8f16 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<8, _Float16> address_space<3> *)", [Const], "gfx1250-insts,wavefrontsize32">; def __builtin_amdgcn_ds_load_tr16_b128_v8bf16 : 
AMDGPUBuiltin<"_ExtVector<8, __bf16>(_ExtVector<8, __bf16> address_space<3> *)", [Const], "gfx1250-insts,wavefrontsize32">; def __builtin_amdgcn_s_setprio_inc_wg : AMDGPUBuiltin<"void(_Constant short)", [], "setprio-inc-wg-inst">; @@ -964,7 +964,7 @@ def __builtin_amdgcn_s_wait_asynccnt : AMDGPUBuiltin<"void(_Constant unsigned sh def __builtin_amdgcn_s_wait_tensorcnt : AMDGPUBuiltin<"void(_Constant unsigned short)", [], "gfx1250-insts">; def __builtin_amdgcn_tanhf : AMDGPUBuiltin<"float(float)", [Const], "tanh-insts">; -def __builtin_amdgcn_tanhh : AMDGPUBuiltin<"__fp16(__fp16)", [Const], "tanh-insts">; +def __builtin_amdgcn_tanhh : AMDGPUBuiltin<"_Float16(_Float16)", [Const], "tanh-insts">; def __builtin_amdgcn_tanh_bf16 : AMDGPUBuiltin<"__bf16(__bf16)", [Const], "bf16-trans-insts">; def __builtin_amdgcn_rcp_bf16 : AMDGPUBuiltin<"__bf16(__bf16)", [Const], "bf16-trans-insts">; def __builtin_amdgcn_sqrt_bf16 : AMDGPUBuiltin<"__bf16(__bf16)", [Const], "bf16-trans-insts">; diff --git a/clang/test/CodeGenHIP/builtins-amdgcn-f16-misc.hip b/clang/test/CodeGenHIP/builtins-amdgcn-f16-misc.hip new file mode 100644 index 0000000000000..fc3bf9a87e282 --- /dev/null +++ b/clang/test/CodeGenHIP/builtins-amdgcn-f16-misc.hip @@ -0,0 +1,88 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm -fcuda-is-device -o - %s | FileCheck %s + +#define __device__ __attribute__((device)) + +typedef _Float16 v2h __attribute__((ext_vector_type(2))); + +// cvt_pkrtz: _ExtVector<2, _Float16>(float, float) +// CHECK-LABEL: define dso_local void @_Z14test_cvt_pkrtzPDv2_DF16_ff( +// CHECK-SAME: ptr noundef [[OUT:%.*]], float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca float, align 
4, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store float [[A]], ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[B]], ptr [[B_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[B_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = call contract <2 x half> @llvm.amdgcn.cvt.pkrtz(float [[TMP0]], float [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x half> [[TMP2]], ptr [[TMP3]], align 4 +// CHECK-NEXT: ret void +// +__device__ void test_cvt_pkrtz(v2h *out, float a, float b) { + *out = __builtin_amdgcn_cvt_pkrtz(a, b); +} + +// interp_p2_f16: _Float16(float, float, unsigned int, unsigned int, bool, unsigned int) +// attr_chan and attr must be compile-time constants +// CHECK-LABEL: define dso_local void @_Z18test_interp_p2_f16PDF16_ffj( +// CHECK-SAME: ptr noundef [[OUT:%.*]], float noundef [[P1:%.*]], float noundef [[J:%.*]], i32 noundef [[M0:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[P1_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[J_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[M0_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[P1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P1_ADDR]] to ptr +// CHECK-NEXT: [[J_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[J_ADDR]] to ptr +// CHECK-NEXT: [[M0_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[M0_ADDR]] to ptr +// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store float [[P1]], ptr [[P1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[J]], ptr [[J_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[M0]], ptr [[M0_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[P1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[J_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[M0_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = call contract half @llvm.amdgcn.interp.p2.f16(float [[TMP0]], float [[TMP1]], i32 2, i32 3, i1 false, i32 [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store half [[TMP3]], ptr [[TMP4]], align 2 +// CHECK-NEXT: ret void +// +__device__ void test_interp_p2_f16(_Float16 *out, float p1, float j, unsigned int m0) { + *out = __builtin_amdgcn_interp_p2_f16(p1, j, 2, 3, false, m0); +} + +// fmed3h: _Float16(_Float16, _Float16, _Float16) - requires gfx9-insts +// CHECK-LABEL: define dso_local void @_Z11test_fmed3hPDF16_DF16_DF16_DF16_( +// CHECK-SAME: ptr noundef [[OUT:%.*]], half noundef [[A:%.*]], half noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca half, align 2, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca half, align 2, addrspace(5) +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca half, align 2, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] 
to ptr +// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store half [[A]], ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: store half [[B]], ptr [[B_ADDR_ASCAST]], align 2 +// CHECK-NEXT: store half [[C]], ptr [[C_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[B_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP2:%.*]] = load half, ptr [[C_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP3:%.*]] = call contract half @llvm.amdgcn.fmed3.f16(half [[TMP0]], half [[TMP1]], half [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store half [[TMP3]], ptr [[TMP4]], align 2 +// CHECK-NEXT: ret void +// +__device__ void test_fmed3h(_Float16 *out, _Float16 a, _Float16 b, _Float16 c) { + *out = __builtin_amdgcn_fmed3h(a, b, c); +} diff --git a/clang/test/CodeGenHIP/builtins-amdgcn-gfx12-f16-w32.hip b/clang/test/CodeGenHIP/builtins-amdgcn-gfx12-f16-w32.hip new file mode 100644 index 0000000000000..a688869be9f38 --- /dev/null +++ b/clang/test/CodeGenHIP/builtins-amdgcn-gfx12-f16-w32.hip @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 -target-feature +wavefrontsize32 -emit-llvm -fcuda-is-device -o - %s | FileCheck %s + +#define __device__ __attribute__((device)) + +typedef _Float16 v8h __attribute__((ext_vector_type(8))); +typedef _Float16 v16h __attribute__((ext_vector_type(16))); +typedef float v8f __attribute__((ext_vector_type(8))); + +// global_load_tr_b128_v8f16: _ExtVector<8, _Float16>(_ExtVector<8, _Float16> address_space<1> *) +// Requires gfx12-insts,wavefrontsize32 +// CHECK-LABEL: define dso_local void @_Z30test_global_load_tr_b128_v8f16PDv8_DF16_PU3AS1S_( +// CHECK-SAME: ptr noundef [[OUT:%.*]], ptr addrspace(1) noundef 
[[INPTR:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[INPTR_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[INPTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[INPTR_ADDR]] to ptr +// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[INPTR]], ptr [[INPTR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[INPTR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = call contract <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16(ptr addrspace(1) [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x half> [[TMP1]], ptr [[TMP2]], align 16 +// CHECK-NEXT: ret void +// +__device__ void test_global_load_tr_b128_v8f16(v8h *out, v8h __attribute__((address_space(1))) *inptr) { + *out = __builtin_amdgcn_global_load_tr_b128_v8f16(inptr); +} + +// swmmac_f32_16x16x32_f16_w32: _ExtVector<8, float>(_ExtVector<8, _Float16>, _ExtVector<16, _Float16>, _ExtVector<8, float>, int) +// Requires wmma-128b-insts,wavefrontsize32 +// CHECK-LABEL: define dso_local void @_Z32test_swmmac_f32_16x16x32_f16_w32PDv8_fDv8_DF16_Dv16_DF16_S_i( +// CHECK-SAME: ptr noundef [[OUT:%.*]], <8 x half> noundef [[A:%.*]], <16 x half> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x half>, align 16, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <16 x half>, align 32, addrspace(5) +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca <8 x float>, align 32, addrspace(5) +// CHECK-NEXT: [[INDEX_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: 
[[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// CHECK-NEXT: [[INDEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[INDEX_ADDR]] to ptr +// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store <16 x half> [[B]], ptr [[B_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <8 x float> [[C]], ptr [[C_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store i32 [[INDEX]], ptr [[INDEX_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <16 x half>, ptr [[B_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <8 x float>, ptr [[C_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[INDEX_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = call contract <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i32(<8 x half> [[TMP0]], <16 x half> [[TMP1]], <8 x float> [[TMP2]], i32 [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x float> [[TMP4]], ptr [[TMP5]], align 32 +// CHECK-NEXT: ret void +// +__device__ void test_swmmac_f32_16x16x32_f16_w32(v8f *out, v8h a, v16h b, v8f c, int index) { + *out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w32(a, b, c, index); +} + +// swmmac_f16_16x16x32_f16_w32: _ExtVector<8, _Float16>(_ExtVector<8, _Float16>, _ExtVector<16, _Float16>, _ExtVector<8, _Float16>, int) +// Requires wmma-128b-insts,wavefrontsize32 +// CHECK-LABEL: define dso_local void @_Z32test_swmmac_f16_16x16x32_f16_w32PDv8_DF16_S_Dv16_DF16_S_i( +// CHECK-SAME: ptr noundef [[OUT:%.*]], <8 x half> noundef 
[[A:%.*]], <16 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x half>, align 16, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <16 x half>, align 32, addrspace(5) +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca <8 x half>, align 16, addrspace(5) +// CHECK-NEXT: [[INDEX_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// CHECK-NEXT: [[INDEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[INDEX_ADDR]] to ptr +// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store <16 x half> [[B]], ptr [[B_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <8 x half> [[C]], ptr [[C_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store i32 [[INDEX]], ptr [[INDEX_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <16 x half>, ptr [[B_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <8 x half>, ptr [[C_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[INDEX_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = call contract <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i32(<8 x half> [[TMP0]], <16 x half> [[TMP1]], <8 x half> [[TMP2]], i32 [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x half> [[TMP4]], ptr [[TMP5]], align 16 +// CHECK-NEXT: ret void 
+// +__device__ void test_swmmac_f16_16x16x32_f16_w32(v8h *out, v8h a, v16h b, v8h c, int index) { + *out = __builtin_amdgcn_swmmac_f16_16x16x32_f16_w32(a, b, c, index); +} diff --git a/clang/test/CodeGenHIP/builtins-amdgcn-gfx12-f16-w64.hip b/clang/test/CodeGenHIP/builtins-amdgcn-gfx12-f16-w64.hip new file mode 100644 index 0000000000000..a18fdffe9920a --- /dev/null +++ b/clang/test/CodeGenHIP/builtins-amdgcn-gfx12-f16-w64.hip @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 -target-feature +wavefrontsize64 -emit-llvm -fcuda-is-device -o - %s | FileCheck %s + +#define __device__ __attribute__((device)) + +typedef _Float16 v4h __attribute__((ext_vector_type(4))); +typedef _Float16 v8h __attribute__((ext_vector_type(8))); +typedef float v4f __attribute__((ext_vector_type(4))); + +// global_load_tr_b128_v4f16: _ExtVector<4, _Float16>(_ExtVector<4, _Float16> address_space<1> *) +// Requires gfx12-insts,wavefrontsize64 +// CHECK-LABEL: define dso_local void @_Z30test_global_load_tr_b128_v4f16PDv4_DF16_PU3AS1S_( +// CHECK-SAME: ptr noundef [[OUT:%.*]], ptr addrspace(1) noundef [[INPTR:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[INPTR_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[INPTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[INPTR_ADDR]] to ptr +// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[INPTR]], ptr [[INPTR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[INPTR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = call contract <4 x half> 
@llvm.amdgcn.global.load.tr.b128.v4f16(ptr addrspace(1) [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <4 x half> [[TMP1]], ptr [[TMP2]], align 8 +// CHECK-NEXT: ret void +// +__device__ void test_global_load_tr_b128_v4f16(v4h *out, v4h __attribute__((address_space(1))) *inptr) { + *out = __builtin_amdgcn_global_load_tr_b128_v4f16(inptr); +} + +// swmmac_f32_16x16x32_f16_w64: _ExtVector<4, float>(_ExtVector<4, _Float16>, _ExtVector<8, _Float16>, _ExtVector<4, float>, int) +// Requires wmma-128b-insts,wavefrontsize64 +// CHECK-LABEL: define dso_local void @_Z32test_swmmac_f32_16x16x32_f16_w64PDv4_fDv4_DF16_Dv8_DF16_S_i( +// CHECK-SAME: ptr noundef [[OUT:%.*]], <4 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x half>, align 8, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <8 x half>, align 16, addrspace(5) +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[INDEX_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// CHECK-NEXT: [[INDEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[INDEX_ADDR]] to ptr +// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <4 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x half> [[B]], ptr [[B_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store <4 x float> [[C]], ptr [[C_ADDR_ASCAST]], align 16 +// 
CHECK-NEXT: store i32 [[INDEX]], ptr [[INDEX_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[B_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[C_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[INDEX_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = call contract <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i32(<4 x half> [[TMP0]], <8 x half> [[TMP1]], <4 x float> [[TMP2]], i32 [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <4 x float> [[TMP4]], ptr [[TMP5]], align 16 +// CHECK-NEXT: ret void +// +__device__ void test_swmmac_f32_16x16x32_f16_w64(v4f *out, v4h a, v8h b, v4f c, int index) { + *out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w64(a, b, c, index); +} + +// swmmac_f16_16x16x32_f16_w64: _ExtVector<4, _Float16>(_ExtVector<4, _Float16>, _ExtVector<8, _Float16>, _ExtVector<4, _Float16>, int) +// Requires wmma-128b-insts,wavefrontsize64 +// CHECK-LABEL: define dso_local void @_Z32test_swmmac_f16_16x16x32_f16_w64PDv4_DF16_S_Dv8_DF16_S_i( +// CHECK-SAME: ptr noundef [[OUT:%.*]], <4 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x half>, align 8, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <8 x half>, align 16, addrspace(5) +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca <4 x half>, align 8, addrspace(5) +// CHECK-NEXT: [[INDEX_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = 
addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// CHECK-NEXT: [[INDEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[INDEX_ADDR]] to ptr +// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <4 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x half> [[B]], ptr [[B_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store <4 x half> [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[INDEX]], ptr [[INDEX_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[B_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[C_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[INDEX_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = call contract <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i32(<4 x half> [[TMP0]], <8 x half> [[TMP1]], <4 x half> [[TMP2]], i32 [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <4 x half> [[TMP4]], ptr [[TMP5]], align 8 +// CHECK-NEXT: ret void +// +__device__ void test_swmmac_f16_16x16x32_f16_w64(v4h *out, v4h a, v8h b, v4h c, int index) { + *out = __builtin_amdgcn_swmmac_f16_16x16x32_f16_w64(a, b, c, index); +} diff --git a/clang/test/CodeGenHIP/builtins-amdgcn-gfx1250-f16-misc.hip b/clang/test/CodeGenHIP/builtins-amdgcn-gfx1250-f16-misc.hip new file mode 100644 index 0000000000000..c6f34f789d1ba --- /dev/null +++ b/clang/test/CodeGenHIP/builtins-amdgcn-gfx1250-f16-misc.hip @@ -0,0 +1,70 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1250 -emit-llvm -fcuda-is-device -o - %s | FileCheck %s + +#define __device__ 
__attribute__((device)) + +typedef _Float16 v8h __attribute__((ext_vector_type(8))); + +// global_load_tr16_b128_v8f16: _ExtVector<8, _Float16>(_ExtVector<8, _Float16> address_space<1> *) +// Requires gfx1250-insts,wavefrontsize32 +// CHECK-LABEL: define dso_local void @_Z32test_global_load_tr16_b128_v8f16PDv8_DF16_PU3AS1S_( +// CHECK-SAME: ptr noundef [[OUT:%.*]], ptr addrspace(1) noundef [[INPTR:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[INPTR_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[INPTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[INPTR_ADDR]] to ptr +// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[INPTR]], ptr [[INPTR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[INPTR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = call contract <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16(ptr addrspace(1) [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x half> [[TMP1]], ptr [[TMP2]], align 16 +// CHECK-NEXT: ret void +// +__device__ void test_global_load_tr16_b128_v8f16(v8h *out, v8h __attribute__((address_space(1))) *inptr) { + *out = __builtin_amdgcn_global_load_tr16_b128_v8f16(inptr); +} + +// ds_load_tr16_b128_v8f16: _ExtVector<8, _Float16>(_ExtVector<8, _Float16> address_space<3> *) +// Requires gfx1250-insts,wavefrontsize32 +// CHECK-LABEL: define dso_local void @_Z28test_ds_load_tr16_b128_v8f16PDv8_DF16_PU3AS3S_( +// CHECK-SAME: ptr noundef [[OUT:%.*]], ptr addrspace(3) noundef [[INPTR:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[INPTR_ADDR:%.*]] = alloca ptr addrspace(3), 
align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[INPTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[INPTR_ADDR]] to ptr +// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(3) [[INPTR]], ptr [[INPTR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(3), ptr [[INPTR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call contract <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x half> [[TMP1]], ptr [[TMP2]], align 16 +// CHECK-NEXT: ret void +// +__device__ void test_ds_load_tr16_b128_v8f16(v8h *out, v8h __attribute__((address_space(3))) *inptr) { + *out = __builtin_amdgcn_ds_load_tr16_b128_v8f16(inptr); +} + +// tanhh: _Float16(_Float16) +// Requires tanh-insts +// CHECK-LABEL: define dso_local void @_Z10test_tanhhPDF16_DF16_( +// CHECK-SAME: ptr noundef [[OUT:%.*]], half noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca half, align 2, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store half [[A]], ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = call contract half @llvm.amdgcn.tanh.f16(half [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store half [[TMP1]], ptr [[TMP2]], align 2 +// CHECK-NEXT: ret void +// +__device__ void test_tanhh(_Float16 *out, _Float16 a) { + *out = 
__builtin_amdgcn_tanhh(a); +} diff --git a/clang/test/CodeGenHIP/builtins-amdgcn-gfx950-f16.hip b/clang/test/CodeGenHIP/builtins-amdgcn-gfx950-f16.hip new file mode 100644 index 0000000000000..96e22d04ee42f --- /dev/null +++ b/clang/test/CodeGenHIP/builtins-amdgcn-gfx950-f16.hip @@ -0,0 +1,27 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx950 -emit-llvm -fcuda-is-device -o - %s | FileCheck %s + +#define __device__ __attribute__((device)) + +typedef _Float16 v4h __attribute__((ext_vector_type(4))); + +// ds_read_tr16_b64_v4f16: _ExtVector<4, _Float16>(_ExtVector<4, _Float16> address_space<3> *) +// CHECK-LABEL: define dso_local void @_Z27test_ds_read_tr16_b64_v4f16PDv4_DF16_PU3AS3S_( +// CHECK-SAME: ptr noundef [[OUT:%.*]], ptr addrspace(3) noundef [[INPTR:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[INPTR_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[INPTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[INPTR_ADDR]] to ptr +// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(3) [[INPTR]], ptr [[INPTR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(3), ptr [[INPTR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call contract <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <4 x half> [[TMP1]], ptr [[TMP2]], align 8 +// CHECK-NEXT: ret void +// +__device__ void test_ds_read_tr16_b64_v4f16(v4h *out, v4h __attribute__((address_space(3))) *inptr) { + *out = 
__builtin_amdgcn_ds_read_tr16_b64_v4f16(inptr); +} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-load-tr.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-load-tr.cl index 1e3a88a41f90e..467d4fe17f504 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-load-tr.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-load-tr.cl @@ -6,7 +6,7 @@ typedef int v2i __attribute__((ext_vector_type(2))); typedef int v3i __attribute__((ext_vector_type(3))); typedef int v4i __attribute__((ext_vector_type(4))); typedef short v8s __attribute__((ext_vector_type(8))); -typedef half v8h __attribute__((ext_vector_type(8))); +typedef _Float16 v8h __attribute__((ext_vector_type(8))); typedef __bf16 v8y __attribute__((ext_vector_type(8))); // CHECK-GFX1250-LABEL: @test_amdgcn_global_load_tr4_b64_v2i32( diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-read-tr.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-read-tr.cl index af1f434403767..0d9bcbfe335fa 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-read-tr.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-read-tr.cl @@ -4,7 +4,7 @@ typedef int v2i __attribute__((ext_vector_type(2))); typedef int v3i __attribute__((ext_vector_type(3))); typedef short v4s __attribute__((ext_vector_type(4))); -typedef half v4h __attribute__((ext_vector_type(4))); +typedef _Float16 v4h __attribute__((ext_vector_type(4))); typedef __bf16 v4y __attribute__((ext_vector_type(4))); // GFX950-LABEL: define dso_local <2 x i32> @test_amdgcn_ds_read_b64_tr_b4_v2i32( diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl index 8242ae6a98c40..267b634414692 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl @@ -5,10 +5,10 @@ typedef int v2i __attribute__((ext_vector_type(2))); typedef short v8s 
__attribute__((ext_vector_type(8))); -typedef half v8h __attribute__((ext_vector_type(8))); +typedef _Float16 v8h __attribute__((ext_vector_type(8))); typedef __bf16 v8y __attribute__((ext_vector_type(8))); typedef short v4s __attribute__((ext_vector_type(4))); -typedef half v4h __attribute__((ext_vector_type(4))); +typedef _Float16 v4h __attribute__((ext_vector_type(4))); typedef __bf16 v4y __attribute__((ext_vector_type(4))); void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr, global v8y* v8y_inptr, diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl index 6f7a93ef897ac..5533b6cfa7c8f 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl @@ -4,7 +4,7 @@ // REQUIRES: amdgpu-registered-target typedef short v4s __attribute__((ext_vector_type(4))); -typedef half v4h __attribute__((ext_vector_type(4))); +typedef _Float16 v4h __attribute__((ext_vector_type(4))); typedef __bf16 v4y __attribute__((ext_vector_type(4))); void amdgcn_global_load_tr(global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr, global v4y* v4y_inptr) diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl index b7323f1b41c2a..2f380bcf57d47 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl @@ -5,7 +5,7 @@ typedef int v2i __attribute__((ext_vector_type(2))); typedef short v8s __attribute__((ext_vector_type(8))); -typedef half v8h __attribute__((ext_vector_type(8))); +typedef _Float16 v8h __attribute__((ext_vector_type(8))); typedef __bf16 v8y __attribute__((ext_vector_type(8))); void amdgcn_global_load_tr(global 
v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr, global v8y* v8y_inptr) diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl index 186fc4eacfaaf..012844a90512d 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl @@ -4,7 +4,7 @@ typedef int v2i __attribute__((ext_vector_type(2))); typedef short v8s __attribute__((ext_vector_type(8))); -typedef half v8h __attribute__((ext_vector_type(8))); +typedef _Float16 v8h __attribute__((ext_vector_type(8))); typedef __bf16 v8y __attribute__((ext_vector_type(8))); // CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b64_v2i32( diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl index b6627f1c8114d..3d84d04f56eb8 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl @@ -3,7 +3,7 @@ // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 typedef short v4s __attribute__((ext_vector_type(4))); -typedef half v4h __attribute__((ext_vector_type(4))); +typedef _Float16 v4h __attribute__((ext_vector_type(4))); typedef __bf16 v4y __attribute__((ext_vector_type(4))); // CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b64_i32( diff --git a/clang/test/SemaHIP/hip-builtin-lvalue-to-rvalue.hip b/clang/test/SemaHIP/hip-builtin-lvalue-to-rvalue.hip index b3238d7b29d3e..db156af516f46 100644 --- a/clang/test/SemaHIP/hip-builtin-lvalue-to-rvalue.hip +++ b/clang/test/SemaHIP/hip-builtin-lvalue-to-rvalue.hip @@ -7,7 +7,7 @@ #define __device__ __attribute__((device)) -typedef __attribute__((__vector_size__(8 * sizeof(__fp16)))) __fp16 fp16x8_t; +typedef 
__attribute__((__vector_size__(8 * sizeof(_Float16)))) _Float16 fp16x8_t; // CHECK: ImplicitCastExpr {{.*}} <AddressSpaceConversion> // CHECK-NEXT: ImplicitCastExpr {{.*}} <LValueToRValue> _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
