Author: Rana Pratap Reddy Date: 2026-03-10T10:20:40+05:30 New Revision: a17289b76ae31efdd5b6ce0ed8da04b1b9185a33
URL: https://github.com/llvm/llvm-project/commit/a17289b76ae31efdd5b6ce0ed8da04b1b9185a33 DIFF: https://github.com/llvm/llvm-project/commit/a17289b76ae31efdd5b6ce0ed8da04b1b9185a33.diff LOG: [Clang][AMDGPU] Change __fp16 to _Float16 in builtin definitions (#185446) Change the type signature of `SWMMAC, load, cvt` builtins from `__fp16` to `_Float16` in the tablegen builtin definitions. Added: clang/test/CodeGenHIP/builtins-amdgcn-f16-misc.hip clang/test/CodeGenHIP/builtins-amdgcn-gfx12-f16-w32.hip clang/test/CodeGenHIP/builtins-amdgcn-gfx12-f16-w64.hip clang/test/CodeGenHIP/builtins-amdgcn-gfx1250-f16-misc.hip clang/test/CodeGenHIP/builtins-amdgcn-gfx950-f16.hip Modified: clang/include/clang/Basic/BuiltinsAMDGPU.td clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-load-tr.cl clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-read-tr.cl clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl clang/test/SemaHIP/hip-builtin-lvalue-to-rvalue.hip Removed: ################################################################################ diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td index acd0a34a79253..18aebdc38bcfc 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.td +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td @@ -223,7 +223,7 @@ def __builtin_amdgcn_alignbit : AMDGPUBuiltin<"unsigned int(unsigned int, unsign def __builtin_amdgcn_alignbyte : AMDGPUBuiltin<"unsigned int(unsigned int, unsigned int, unsigned int)", [Const]>; def __builtin_amdgcn_ubfe : AMDGPUBuiltin<"unsigned int(unsigned int, unsigned int, unsigned int)", [Const]>; def __builtin_amdgcn_sbfe : AMDGPUBuiltin<"unsigned int(unsigned int, unsigned int, unsigned int)", 
[Const]>; -def __builtin_amdgcn_cvt_pkrtz : AMDGPUBuiltin<"_ExtVector<2, __fp16>(float, float)", [Const]>; +def __builtin_amdgcn_cvt_pkrtz : AMDGPUBuiltin<"_ExtVector<2, _Float16>(float, float)", [Const]>; def __builtin_amdgcn_cvt_pknorm_i16 : AMDGPUBuiltin<"_ExtVector<2, short>(float, float)", [Const], "cvt-pknorm-vop2-insts">; def __builtin_amdgcn_cvt_pknorm_u16 : AMDGPUBuiltin<"_ExtVector<2, unsigned short>(float, float)", [Const], "cvt-pknorm-vop2-insts">; def __builtin_amdgcn_cvt_pk_i16 : AMDGPUBuiltin<"_ExtVector<2, short>(int, int)", [Const]>; @@ -319,7 +319,7 @@ def __builtin_amdgcn_ds_gws_sema_release_all : AMDGPUBuiltin<"void(unsigned int) // Interpolation builtins. //===----------------------------------------------------------------------===// def __builtin_amdgcn_interp_p1_f16 : AMDGPUBuiltin<"float(float, unsigned int, unsigned int, bool, unsigned int)", [Const]>; -def __builtin_amdgcn_interp_p2_f16 : AMDGPUBuiltin<"__fp16(float, float, unsigned int, unsigned int, bool, unsigned int)", [Const]>; +def __builtin_amdgcn_interp_p2_f16 : AMDGPUBuiltin<"_Float16(float, float, unsigned int, unsigned int, bool, unsigned int)", [Const]>; def __builtin_amdgcn_interp_p1 : AMDGPUBuiltin<"float(float, unsigned int, unsigned int, unsigned int)", [Const]>; def __builtin_amdgcn_interp_p2 : AMDGPUBuiltin<"float(float, float, unsigned int, unsigned int, unsigned int)", [Const]>; def __builtin_amdgcn_interp_mov : AMDGPUBuiltin<"float(unsigned int, unsigned int, unsigned int, unsigned int)", [Const]>; @@ -349,7 +349,7 @@ def __builtin_amdgcn_perm : AMDGPUBuiltin<"unsigned int(unsigned int, unsigned i // GFX9+ only builtins. 
//===----------------------------------------------------------------------===// -def __builtin_amdgcn_fmed3h : AMDGPUBuiltin<"__fp16(__fp16, __fp16, __fp16)", [Const], "gfx9-insts">; +def __builtin_amdgcn_fmed3h : AMDGPUBuiltin<"_Float16(_Float16, _Float16, _Float16)", [Const], "gfx9-insts">; def __builtin_amdgcn_global_atomic_fadd_f64 : AMDGPUBuiltin<"double(double address_space<1> *, double)", [], "gfx90a-insts">; def __builtin_amdgcn_global_atomic_fadd_f32 : AMDGPUBuiltin<"float(float address_space<1> *, float)", [], "atomic-fadd-rtn-insts">; @@ -669,7 +669,7 @@ def __builtin_amdgcn_ds_read_tr4_b64_v2i32 : AMDGPUBuiltin<"_ExtVector<2, int>(_ def __builtin_amdgcn_ds_read_tr6_b96_v3i32 : AMDGPUBuiltin<"_ExtVector<3, int>(_ExtVector<3, int> address_space<3> *)", [Const], "gfx950-insts">; def __builtin_amdgcn_ds_read_tr8_b64_v2i32 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int> address_space<3> *)", [Const], "gfx950-insts">; def __builtin_amdgcn_ds_read_tr16_b64_v4i16 : AMDGPUBuiltin<"_ExtVector<4, short>(_ExtVector<4, short> address_space<3> *)", [Const], "gfx950-insts">; -def __builtin_amdgcn_ds_read_tr16_b64_v4f16 : AMDGPUBuiltin<"_ExtVector<4, __fp16>(_ExtVector<4, __fp16> address_space<3> *)", [Const], "gfx950-insts">; +def __builtin_amdgcn_ds_read_tr16_b64_v4f16 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(_ExtVector<4, _Float16> address_space<3> *)", [Const], "gfx950-insts">; def __builtin_amdgcn_ds_read_tr16_b64_v4bf16 : AMDGPUBuiltin<"_ExtVector<4, __bf16>(_ExtVector<4, __bf16> address_space<3> *)", [Const], "gfx950-insts">; def __builtin_amdgcn_ashr_pk_i8_i32 : AMDGPUBuiltin<"unsigned short(unsigned int, unsigned int, unsigned int)", [Const], "ashr-pk-insts">; @@ -699,11 +699,11 @@ def __builtin_amdgcn_s_buffer_prefetch_data : AMDGPUBuiltin<"void(__amdgpu_buffe def __builtin_amdgcn_global_load_tr_b64_v2i32 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int> address_space<1> *)", [Const], "gfx12-insts,wavefrontsize32">; def 
__builtin_amdgcn_global_load_tr_b128_v8i16 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<8, short> address_space<1> *)", [Const], "gfx12-insts,wavefrontsize32">; -def __builtin_amdgcn_global_load_tr_b128_v8f16 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<8, __fp16> address_space<1> *)", [Const], "gfx12-insts,wavefrontsize32">; +def __builtin_amdgcn_global_load_tr_b128_v8f16 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<8, _Float16> address_space<1> *)", [Const], "gfx12-insts,wavefrontsize32">; def __builtin_amdgcn_global_load_tr_b128_v8bf16 : AMDGPUBuiltin<"_ExtVector<8, __bf16>(_ExtVector<8, __bf16> address_space<1> *)", [Const], "gfx12-insts,wavefrontsize32">; def __builtin_amdgcn_global_load_tr_b64_i32 : AMDGPUBuiltin<"int(int address_space<1> *)", [Const], "gfx12-insts,wavefrontsize64">; def __builtin_amdgcn_global_load_tr_b128_v4i16 : AMDGPUBuiltin<"_ExtVector<4, short>(_ExtVector<4, short> address_space<1> *)", [Const], "gfx12-insts,wavefrontsize64">; -def __builtin_amdgcn_global_load_tr_b128_v4f16 : AMDGPUBuiltin<"_ExtVector<4, __fp16>(_ExtVector<4, __fp16> address_space<1> *)", [Const], "gfx12-insts,wavefrontsize64">; +def __builtin_amdgcn_global_load_tr_b128_v4f16 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(_ExtVector<4, _Float16> address_space<1> *)", [Const], "gfx12-insts,wavefrontsize64">; def __builtin_amdgcn_global_load_tr_b128_v4bf16 : AMDGPUBuiltin<"_ExtVector<4, __bf16>(_ExtVector<4, __bf16> address_space<1> *)", [Const], "gfx12-insts,wavefrontsize64">; def __builtin_amdgcn_ds_bpermute_fi_b32 : AMDGPUBuiltin<"int(int, int)", [Const], "gfx12-insts">; @@ -828,9 +828,9 @@ def __builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12 : AMDGPUBuiltin<"_ExtVector let ArgNames = ["a_sign", "a", "b_sign", "b", "c", "clamp"]; } -def __builtin_amdgcn_swmmac_f32_16x16x32_f16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, __fp16>, _ExtVector<16, __fp16>, _ExtVector<8, float>, int)", [Const], "wmma-128b-insts,wavefrontsize32">; +def 
__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, _Float16>, _ExtVector<16, _Float16>, _ExtVector<8, float>, int)", [Const], "wmma-128b-insts,wavefrontsize32">; def __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<8, short>, _ExtVector<16, short>, _ExtVector<8, float>, int)", [Const], "wmma-128b-insts,wavefrontsize32">; -def __builtin_amdgcn_swmmac_f16_16x16x32_f16_w32 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<8, __fp16>, _ExtVector<16, __fp16>, _ExtVector<8, __fp16>, int)", [Const], "wmma-128b-insts,wavefrontsize32">; +def __builtin_amdgcn_swmmac_f16_16x16x32_f16_w32 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<8, _Float16>, _ExtVector<16, _Float16>, _ExtVector<8, _Float16>, int)", [Const], "wmma-128b-insts,wavefrontsize32">; def __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<8, short>, _ExtVector<16, short>, _ExtVector<8, short>, int)", [Const], "wmma-128b-insts,wavefrontsize32">; def __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, _ExtVector<2, int>, _Constant bool, _ExtVector<4, int>, _ExtVector<8, int>, int, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize32">; def __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32 : AMDGPUBuiltin<"_ExtVector<8, int>(_Constant bool, int, _Constant bool, _ExtVector<2, int>, _ExtVector<8, int>, int, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize32">; @@ -840,9 +840,9 @@ def __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32 : AMDGPUBuiltin<"_ExtVector def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", [Const], "wmma-128b-insts,wavefrontsize32">; def __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32 : AMDGPUBuiltin<"_ExtVector<8, float>(_ExtVector<2, int>, _ExtVector<4, int>, _ExtVector<8, float>, int)", 
[Const], "wmma-128b-insts,wavefrontsize32">; -def __builtin_amdgcn_swmmac_f32_16x16x32_f16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, __fp16>, _ExtVector<8, __fp16>, _ExtVector<4, float>, int)", [Const], "wmma-128b-insts,wavefrontsize64">; +def __builtin_amdgcn_swmmac_f32_16x16x32_f16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, _Float16>, _ExtVector<8, _Float16>, _ExtVector<4, float>, int)", [Const], "wmma-128b-insts,wavefrontsize64">; def __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64 : AMDGPUBuiltin<"_ExtVector<4, float>(_ExtVector<4, short>, _ExtVector<8, short>, _ExtVector<4, float>, int)", [Const], "wmma-128b-insts,wavefrontsize64">; -def __builtin_amdgcn_swmmac_f16_16x16x32_f16_w64 : AMDGPUBuiltin<"_ExtVector<4, __fp16>(_ExtVector<4, __fp16>, _ExtVector<8, __fp16>, _ExtVector<4, __fp16>, int)", [Const], "wmma-128b-insts,wavefrontsize64">; +def __builtin_amdgcn_swmmac_f16_16x16x32_f16_w64 : AMDGPUBuiltin<"_ExtVector<4, _Float16>(_ExtVector<4, _Float16>, _ExtVector<8, _Float16>, _ExtVector<4, _Float16>, int)", [Const], "wmma-128b-insts,wavefrontsize64">; def __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64 : AMDGPUBuiltin<"_ExtVector<4, short>(_ExtVector<4, short>, _ExtVector<8, short>, _ExtVector<4, short>, int)", [Const], "wmma-128b-insts,wavefrontsize64">; def __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, _ExtVector<2, int>, _ExtVector<4, int>, int, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize64">; def __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64 : AMDGPUBuiltin<"_ExtVector<4, int>(_Constant bool, int, _Constant bool, int, _ExtVector<4, int>, int, _Constant bool)", [Const], "wmma-128b-insts,wavefrontsize64">; @@ -947,13 +947,13 @@ def __builtin_amdgcn_global_load_tr4_b64_v2i32 : AMDGPUBuiltin<"_ExtVector<2, in def __builtin_amdgcn_global_load_tr8_b64_v2i32 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int> address_space<1> *)", [Const], 
"gfx1250-insts,wavefrontsize32">; def __builtin_amdgcn_global_load_tr6_b96_v3i32 : AMDGPUBuiltin<"_ExtVector<3, int>(_ExtVector<3, int> address_space<1> *)", [Const], "transpose-load-f4f6-insts,wavefrontsize32">; def __builtin_amdgcn_global_load_tr16_b128_v8i16 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<8, short> address_space<1> *)", [Const], "gfx1250-insts,wavefrontsize32">; -def __builtin_amdgcn_global_load_tr16_b128_v8f16 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<8, __fp16> address_space<1> *)", [Const], "gfx1250-insts,wavefrontsize32">; +def __builtin_amdgcn_global_load_tr16_b128_v8f16 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<8, _Float16> address_space<1> *)", [Const], "gfx1250-insts,wavefrontsize32">; def __builtin_amdgcn_global_load_tr16_b128_v8bf16 : AMDGPUBuiltin<"_ExtVector<8, __bf16>(_ExtVector<8, __bf16> address_space<1> *)", [Const], "gfx1250-insts,wavefrontsize32">; def __builtin_amdgcn_ds_load_tr4_b64_v2i32 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int> address_space<3> *)", [Const], "transpose-load-f4f6-insts,wavefrontsize32">; def __builtin_amdgcn_ds_load_tr8_b64_v2i32 : AMDGPUBuiltin<"_ExtVector<2, int>(_ExtVector<2, int> address_space<3> *)", [Const], "gfx1250-insts,wavefrontsize32">; def __builtin_amdgcn_ds_load_tr6_b96_v3i32 : AMDGPUBuiltin<"_ExtVector<3, int>(_ExtVector<3, int> address_space<3> *)", [Const], "transpose-load-f4f6-insts,wavefrontsize32">; def __builtin_amdgcn_ds_load_tr16_b128_v8i16 : AMDGPUBuiltin<"_ExtVector<8, short>(_ExtVector<8, short> address_space<3> *)", [Const], "gfx1250-insts,wavefrontsize32">; -def __builtin_amdgcn_ds_load_tr16_b128_v8f16 : AMDGPUBuiltin<"_ExtVector<8, __fp16>(_ExtVector<8, __fp16> address_space<3> *)", [Const], "gfx1250-insts,wavefrontsize32">; +def __builtin_amdgcn_ds_load_tr16_b128_v8f16 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_ExtVector<8, _Float16> address_space<3> *)", [Const], "gfx1250-insts,wavefrontsize32">; def __builtin_amdgcn_ds_load_tr16_b128_v8bf16 : 
AMDGPUBuiltin<"_ExtVector<8, __bf16>(_ExtVector<8, __bf16> address_space<3> *)", [Const], "gfx1250-insts,wavefrontsize32">; def __builtin_amdgcn_s_setprio_inc_wg : AMDGPUBuiltin<"void(_Constant short)", [], "setprio-inc-wg-inst">; @@ -964,7 +964,7 @@ def __builtin_amdgcn_s_wait_asynccnt : AMDGPUBuiltin<"void(_Constant unsigned sh def __builtin_amdgcn_s_wait_tensorcnt : AMDGPUBuiltin<"void(_Constant unsigned short)", [], "gfx1250-insts">; def __builtin_amdgcn_tanhf : AMDGPUBuiltin<"float(float)", [Const], "tanh-insts">; -def __builtin_amdgcn_tanhh : AMDGPUBuiltin<"__fp16(__fp16)", [Const], "tanh-insts">; +def __builtin_amdgcn_tanhh : AMDGPUBuiltin<"_Float16(_Float16)", [Const], "tanh-insts">; def __builtin_amdgcn_tanh_bf16 : AMDGPUBuiltin<"__bf16(__bf16)", [Const], "bf16-trans-insts">; def __builtin_amdgcn_rcp_bf16 : AMDGPUBuiltin<"__bf16(__bf16)", [Const], "bf16-trans-insts">; def __builtin_amdgcn_sqrt_bf16 : AMDGPUBuiltin<"__bf16(__bf16)", [Const], "bf16-trans-insts">; diff --git a/clang/test/CodeGenHIP/builtins-amdgcn-f16-misc.hip b/clang/test/CodeGenHIP/builtins-amdgcn-f16-misc.hip new file mode 100644 index 0000000000000..fc3bf9a87e282 --- /dev/null +++ b/clang/test/CodeGenHIP/builtins-amdgcn-f16-misc.hip @@ -0,0 +1,88 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx900 -emit-llvm -fcuda-is-device -o - %s | FileCheck %s + +#define __device__ __attribute__((device)) + +typedef _Float16 v2h __attribute__((ext_vector_type(2))); + +// cvt_pkrtz: _ExtVector<2, _Float16>(float, float) +// CHECK-LABEL: define dso_local void @_Z14test_cvt_pkrtzPDv2_DF16_ff( +// CHECK-SAME: ptr noundef [[OUT:%.*]], float noundef [[A:%.*]], float noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca float, align 
4, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store float [[A]], ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[B]], ptr [[B_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[B_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = call contract <2 x half> @llvm.amdgcn.cvt.pkrtz(float [[TMP0]], float [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x half> [[TMP2]], ptr [[TMP3]], align 4 +// CHECK-NEXT: ret void +// +__device__ void test_cvt_pkrtz(v2h *out, float a, float b) { + *out = __builtin_amdgcn_cvt_pkrtz(a, b); +} + +// interp_p2_f16: _Float16(float, float, unsigned int, unsigned int, bool, unsigned int) +// attr_chan and attr must be compile-time constants +// CHECK-LABEL: define dso_local void @_Z18test_interp_p2_f16PDF16_ffj( +// CHECK-SAME: ptr noundef [[OUT:%.*]], float noundef [[P1:%.*]], float noundef [[J:%.*]], i32 noundef [[M0:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[P1_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[J_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[M0_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[P1_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P1_ADDR]] to ptr +// CHECK-NEXT: [[J_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[J_ADDR]] to ptr +// CHECK-NEXT: [[M0_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[M0_ADDR]] to ptr +// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store float [[P1]], ptr [[P1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store float [[J]], ptr [[J_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[M0]], ptr [[M0_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[P1_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[J_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[M0_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = call contract half @llvm.amdgcn.interp.p2.f16(float [[TMP0]], float [[TMP1]], i32 2, i32 3, i1 false, i32 [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store half [[TMP3]], ptr [[TMP4]], align 2 +// CHECK-NEXT: ret void +// +__device__ void test_interp_p2_f16(_Float16 *out, float p1, float j, unsigned int m0) { + *out = __builtin_amdgcn_interp_p2_f16(p1, j, 2, 3, false, m0); +} + +// fmed3h: _Float16(_Float16, _Float16, _Float16) - requires gfx9-insts +// CHECK-LABEL: define dso_local void @_Z11test_fmed3hPDF16_DF16_DF16_DF16_( +// CHECK-SAME: ptr noundef [[OUT:%.*]], half noundef [[A:%.*]], half noundef [[B:%.*]], half noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca half, align 2, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca half, align 2, addrspace(5) +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca half, align 2, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] 
to ptr +// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store half [[A]], ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: store half [[B]], ptr [[B_ADDR_ASCAST]], align 2 +// CHECK-NEXT: store half [[C]], ptr [[C_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr [[B_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP2:%.*]] = load half, ptr [[C_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP3:%.*]] = call contract half @llvm.amdgcn.fmed3.f16(half [[TMP0]], half [[TMP1]], half [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store half [[TMP3]], ptr [[TMP4]], align 2 +// CHECK-NEXT: ret void +// +__device__ void test_fmed3h(_Float16 *out, _Float16 a, _Float16 b, _Float16 c) { + *out = __builtin_amdgcn_fmed3h(a, b, c); +} diff --git a/clang/test/CodeGenHIP/builtins-amdgcn-gfx12-f16-w32.hip b/clang/test/CodeGenHIP/builtins-amdgcn-gfx12-f16-w32.hip new file mode 100644 index 0000000000000..a688869be9f38 --- /dev/null +++ b/clang/test/CodeGenHIP/builtins-amdgcn-gfx12-f16-w32.hip @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 -target-feature +wavefrontsize32 -emit-llvm -fcuda-is-device -o - %s | FileCheck %s + +#define __device__ __attribute__((device)) + +typedef _Float16 v8h __attribute__((ext_vector_type(8))); +typedef _Float16 v16h __attribute__((ext_vector_type(16))); +typedef float v8f __attribute__((ext_vector_type(8))); + +// global_load_tr_b128_v8f16: _ExtVector<8, _Float16>(_ExtVector<8, _Float16> address_space<1> *) +// Requires gfx12-insts,wavefrontsize32 +// CHECK-LABEL: define dso_local void @_Z30test_global_load_tr_b128_v8f16PDv8_DF16_PU3AS1S_( +// CHECK-SAME: ptr noundef [[OUT:%.*]], ptr addrspace(1) noundef 
[[INPTR:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[INPTR_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[INPTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[INPTR_ADDR]] to ptr +// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[INPTR]], ptr [[INPTR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[INPTR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = call contract <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16(ptr addrspace(1) [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x half> [[TMP1]], ptr [[TMP2]], align 16 +// CHECK-NEXT: ret void +// +__device__ void test_global_load_tr_b128_v8f16(v8h *out, v8h __attribute__((address_space(1))) *inptr) { + *out = __builtin_amdgcn_global_load_tr_b128_v8f16(inptr); +} + +// swmmac_f32_16x16x32_f16_w32: _ExtVector<8, float>(_ExtVector<8, _Float16>, _ExtVector<16, _Float16>, _ExtVector<8, float>, int) +// Requires wmma-128b-insts,wavefrontsize32 +// CHECK-LABEL: define dso_local void @_Z32test_swmmac_f32_16x16x32_f16_w32PDv8_fDv8_DF16_Dv16_DF16_S_i( +// CHECK-SAME: ptr noundef [[OUT:%.*]], <8 x half> noundef [[A:%.*]], <16 x half> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x half>, align 16, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <16 x half>, align 32, addrspace(5) +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca <8 x float>, align 32, addrspace(5) +// CHECK-NEXT: [[INDEX_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: 
[[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// CHECK-NEXT: [[INDEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[INDEX_ADDR]] to ptr +// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store <16 x half> [[B]], ptr [[B_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <8 x float> [[C]], ptr [[C_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store i32 [[INDEX]], ptr [[INDEX_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <16 x half>, ptr [[B_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <8 x float>, ptr [[C_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[INDEX_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = call contract <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i32(<8 x half> [[TMP0]], <16 x half> [[TMP1]], <8 x float> [[TMP2]], i32 [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x float> [[TMP4]], ptr [[TMP5]], align 32 +// CHECK-NEXT: ret void +// +__device__ void test_swmmac_f32_16x16x32_f16_w32(v8f *out, v8h a, v16h b, v8f c, int index) { + *out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w32(a, b, c, index); +} + +// swmmac_f16_16x16x32_f16_w32: _ExtVector<8, _Float16>(_ExtVector<8, _Float16>, _ExtVector<16, _Float16>, _ExtVector<8, _Float16>, int) +// Requires wmma-128b-insts,wavefrontsize32 +// CHECK-LABEL: define dso_local void @_Z32test_swmmac_f16_16x16x32_f16_w32PDv8_DF16_S_Dv16_DF16_S_i( +// CHECK-SAME: ptr noundef [[OUT:%.*]], <8 x half> noundef 
[[A:%.*]], <16 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x half>, align 16, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <16 x half>, align 32, addrspace(5) +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca <8 x half>, align 16, addrspace(5) +// CHECK-NEXT: [[INDEX_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// CHECK-NEXT: [[INDEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[INDEX_ADDR]] to ptr +// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store <16 x half> [[B]], ptr [[B_ADDR_ASCAST]], align 32 +// CHECK-NEXT: store <8 x half> [[C]], ptr [[C_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store i32 [[INDEX]], ptr [[INDEX_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <16 x half>, ptr [[B_ADDR_ASCAST]], align 32 +// CHECK-NEXT: [[TMP2:%.*]] = load <8 x half>, ptr [[C_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[INDEX_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = call contract <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i32(<8 x half> [[TMP0]], <16 x half> [[TMP1]], <8 x half> [[TMP2]], i32 [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x half> [[TMP4]], ptr [[TMP5]], align 16 +// CHECK-NEXT: ret void 
+// +__device__ void test_swmmac_f16_16x16x32_f16_w32(v8h *out, v8h a, v16h b, v8h c, int index) { + *out = __builtin_amdgcn_swmmac_f16_16x16x32_f16_w32(a, b, c, index); +} diff --git a/clang/test/CodeGenHIP/builtins-amdgcn-gfx12-f16-w64.hip b/clang/test/CodeGenHIP/builtins-amdgcn-gfx12-f16-w64.hip new file mode 100644 index 0000000000000..a18fdffe9920a --- /dev/null +++ b/clang/test/CodeGenHIP/builtins-amdgcn-gfx12-f16-w64.hip @@ -0,0 +1,96 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 -target-feature +wavefrontsize64 -emit-llvm -fcuda-is-device -o - %s | FileCheck %s + +#define __device__ __attribute__((device)) + +typedef _Float16 v4h __attribute__((ext_vector_type(4))); +typedef _Float16 v8h __attribute__((ext_vector_type(8))); +typedef float v4f __attribute__((ext_vector_type(4))); + +// global_load_tr_b128_v4f16: _ExtVector<4, _Float16>(_ExtVector<4, _Float16> address_space<1> *) +// Requires gfx12-insts,wavefrontsize64 +// CHECK-LABEL: define dso_local void @_Z30test_global_load_tr_b128_v4f16PDv4_DF16_PU3AS1S_( +// CHECK-SAME: ptr noundef [[OUT:%.*]], ptr addrspace(1) noundef [[INPTR:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[INPTR_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[INPTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[INPTR_ADDR]] to ptr +// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[INPTR]], ptr [[INPTR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[INPTR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = call contract <4 x half> 
@llvm.amdgcn.global.load.tr.b128.v4f16(ptr addrspace(1) [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <4 x half> [[TMP1]], ptr [[TMP2]], align 8 +// CHECK-NEXT: ret void +// +__device__ void test_global_load_tr_b128_v4f16(v4h *out, v4h __attribute__((address_space(1))) *inptr) { + *out = __builtin_amdgcn_global_load_tr_b128_v4f16(inptr); +} + +// swmmac_f32_16x16x32_f16_w64: _ExtVector<4, float>(_ExtVector<4, _Float16>, _ExtVector<8, _Float16>, _ExtVector<4, float>, int) +// Requires wmma-128b-insts,wavefrontsize64 +// CHECK-LABEL: define dso_local void @_Z32test_swmmac_f32_16x16x32_f16_w64PDv4_fDv4_DF16_Dv8_DF16_S_i( +// CHECK-SAME: ptr noundef [[OUT:%.*]], <4 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x half>, align 8, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <8 x half>, align 16, addrspace(5) +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5) +// CHECK-NEXT: [[INDEX_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// CHECK-NEXT: [[INDEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[INDEX_ADDR]] to ptr +// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <4 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x half> [[B]], ptr [[B_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store <4 x float> [[C]], ptr [[C_ADDR_ASCAST]], align 16 +// 
CHECK-NEXT: store i32 [[INDEX]], ptr [[INDEX_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[B_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[C_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[INDEX_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = call contract <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i32(<4 x half> [[TMP0]], <8 x half> [[TMP1]], <4 x float> [[TMP2]], i32 [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <4 x float> [[TMP4]], ptr [[TMP5]], align 16 +// CHECK-NEXT: ret void +// +__device__ void test_swmmac_f32_16x16x32_f16_w64(v4f *out, v4h a, v8h b, v4f c, int index) { + *out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w64(a, b, c, index); +} + +// swmmac_f16_16x16x32_f16_w64: _ExtVector<4, _Float16>(_ExtVector<4, _Float16>, _ExtVector<8, _Float16>, _ExtVector<4, _Float16>, int) +// Requires wmma-128b-insts,wavefrontsize64 +// CHECK-LABEL: define dso_local void @_Z32test_swmmac_f16_16x16x32_f16_w64PDv4_DF16_S_Dv8_DF16_S_i( +// CHECK-SAME: ptr noundef [[OUT:%.*]], <4 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]], i32 noundef [[INDEX:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x half>, align 8, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <8 x half>, align 16, addrspace(5) +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca <4 x half>, align 8, addrspace(5) +// CHECK-NEXT: [[INDEX_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = 
addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// CHECK-NEXT: [[INDEX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[INDEX_ADDR]] to ptr +// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <4 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x half> [[B]], ptr [[B_ADDR_ASCAST]], align 16 +// CHECK-NEXT: store <4 x half> [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[INDEX]], ptr [[INDEX_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, ptr [[B_ADDR_ASCAST]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = load <4 x half>, ptr [[C_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[INDEX_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = call contract <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i32(<4 x half> [[TMP0]], <8 x half> [[TMP1]], <4 x half> [[TMP2]], i32 [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <4 x half> [[TMP4]], ptr [[TMP5]], align 8 +// CHECK-NEXT: ret void +// +__device__ void test_swmmac_f16_16x16x32_f16_w64(v4h *out, v4h a, v8h b, v4h c, int index) { + *out = __builtin_amdgcn_swmmac_f16_16x16x32_f16_w64(a, b, c, index); +} diff --git a/clang/test/CodeGenHIP/builtins-amdgcn-gfx1250-f16-misc.hip b/clang/test/CodeGenHIP/builtins-amdgcn-gfx1250-f16-misc.hip new file mode 100644 index 0000000000000..c6f34f789d1ba --- /dev/null +++ b/clang/test/CodeGenHIP/builtins-amdgcn-gfx1250-f16-misc.hip @@ -0,0 +1,70 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1250 -emit-llvm -fcuda-is-device -o - %s | FileCheck %s + +#define __device__ 
__attribute__((device)) + +typedef _Float16 v8h __attribute__((ext_vector_type(8))); + +// global_load_tr16_b128_v8f16: _ExtVector<8, _Float16>(_ExtVector<8, _Float16> address_space<1> *) +// Requires gfx1250-insts,wavefrontsize32 +// CHECK-LABEL: define dso_local void @_Z32test_global_load_tr16_b128_v8f16PDv8_DF16_PU3AS1S_( +// CHECK-SAME: ptr noundef [[OUT:%.*]], ptr addrspace(1) noundef [[INPTR:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[INPTR_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[INPTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[INPTR_ADDR]] to ptr +// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[INPTR]], ptr [[INPTR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[INPTR_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = call contract <8 x half> @llvm.amdgcn.global.load.tr.b128.v8f16(ptr addrspace(1) [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x half> [[TMP1]], ptr [[TMP2]], align 16 +// CHECK-NEXT: ret void +// +__device__ void test_global_load_tr16_b128_v8f16(v8h *out, v8h __attribute__((address_space(1))) *inptr) { + *out = __builtin_amdgcn_global_load_tr16_b128_v8f16(inptr); +} + +// ds_load_tr16_b128_v8f16: _ExtVector<8, _Float16>(_ExtVector<8, _Float16> address_space<3> *) +// Requires gfx1250-insts,wavefrontsize32 +// CHECK-LABEL: define dso_local void @_Z28test_ds_load_tr16_b128_v8f16PDv8_DF16_PU3AS3S_( +// CHECK-SAME: ptr noundef [[OUT:%.*]], ptr addrspace(3) noundef [[INPTR:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[INPTR_ADDR:%.*]] = alloca ptr addrspace(3), 
align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[INPTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[INPTR_ADDR]] to ptr +// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(3) [[INPTR]], ptr [[INPTR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(3), ptr [[INPTR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call contract <8 x half> @llvm.amdgcn.ds.load.tr16.b128.v8f16(ptr addrspace(3) [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <8 x half> [[TMP1]], ptr [[TMP2]], align 16 +// CHECK-NEXT: ret void +// +__device__ void test_ds_load_tr16_b128_v8f16(v8h *out, v8h __attribute__((address_space(3))) *inptr) { + *out = __builtin_amdgcn_ds_load_tr16_b128_v8f16(inptr); +} + +// tanhh: _Float16(_Float16) +// Requires tanh-insts +// CHECK-LABEL: define dso_local void @_Z10test_tanhhPDF16_DF16_( +// CHECK-SAME: ptr noundef [[OUT:%.*]], half noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca half, align 2, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store half [[A]], ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load half, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = call contract half @llvm.amdgcn.tanh.f16(half [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store half [[TMP1]], ptr [[TMP2]], align 2 +// CHECK-NEXT: ret void +// +__device__ void test_tanhh(_Float16 *out, _Float16 a) { + *out = 
__builtin_amdgcn_tanhh(a); +} diff --git a/clang/test/CodeGenHIP/builtins-amdgcn-gfx950-f16.hip b/clang/test/CodeGenHIP/builtins-amdgcn-gfx950-f16.hip new file mode 100644 index 0000000000000..96e22d04ee42f --- /dev/null +++ b/clang/test/CodeGenHIP/builtins-amdgcn-gfx950-f16.hip @@ -0,0 +1,27 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx950 -emit-llvm -fcuda-is-device -o - %s | FileCheck %s + +#define __device__ __attribute__((device)) + +typedef _Float16 v4h __attribute__((ext_vector_type(4))); + +// ds_read_tr16_b64_v4f16: _ExtVector<4, _Float16>(_ExtVector<4, _Float16> address_space<3> *) +// CHECK-LABEL: define dso_local void @_Z27test_ds_read_tr16_b64_v4f16PDv4_DF16_PU3AS3S_( +// CHECK-SAME: ptr noundef [[OUT:%.*]], ptr addrspace(3) noundef [[INPTR:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[INPTR_ADDR:%.*]] = alloca ptr addrspace(3), align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[INPTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[INPTR_ADDR]] to ptr +// CHECK-NEXT: store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(3) [[INPTR]], ptr [[INPTR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(3), ptr [[INPTR_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call contract <4 x half> @llvm.amdgcn.ds.read.tr16.b64.v4f16(ptr addrspace(3) [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <4 x half> [[TMP1]], ptr [[TMP2]], align 8 +// CHECK-NEXT: ret void +// +__device__ void test_ds_read_tr16_b64_v4f16(v4h *out, v4h __attribute__((address_space(3))) *inptr) { + *out = 
__builtin_amdgcn_ds_read_tr16_b64_v4f16(inptr); +} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-load-tr.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-load-tr.cl index 1e3a88a41f90e..467d4fe17f504 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-load-tr.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-load-tr.cl @@ -6,7 +6,7 @@ typedef int v2i __attribute__((ext_vector_type(2))); typedef int v3i __attribute__((ext_vector_type(3))); typedef int v4i __attribute__((ext_vector_type(4))); typedef short v8s __attribute__((ext_vector_type(8))); -typedef half v8h __attribute__((ext_vector_type(8))); +typedef _Float16 v8h __attribute__((ext_vector_type(8))); typedef __bf16 v8y __attribute__((ext_vector_type(8))); // CHECK-GFX1250-LABEL: @test_amdgcn_global_load_tr4_b64_v2i32( diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-read-tr.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-read-tr.cl index af1f434403767..0d9bcbfe335fa 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-read-tr.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-read-tr.cl @@ -4,7 +4,7 @@ typedef int v2i __attribute__((ext_vector_type(2))); typedef int v3i __attribute__((ext_vector_type(3))); typedef short v4s __attribute__((ext_vector_type(4))); -typedef half v4h __attribute__((ext_vector_type(4))); +typedef _Float16 v4h __attribute__((ext_vector_type(4))); typedef __bf16 v4y __attribute__((ext_vector_type(4))); // GFX950-LABEL: define dso_local <2 x i32> @test_amdgcn_ds_read_b64_tr_b4_v2i32( diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl index 8242ae6a98c40..267b634414692 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl @@ -5,10 +5,10 @@ typedef int v2i __attribute__((ext_vector_type(2))); typedef short v8s 
__attribute__((ext_vector_type(8))); -typedef half v8h __attribute__((ext_vector_type(8))); +typedef _Float16 v8h __attribute__((ext_vector_type(8))); typedef __bf16 v8y __attribute__((ext_vector_type(8))); typedef short v4s __attribute__((ext_vector_type(4))); -typedef half v4h __attribute__((ext_vector_type(4))); +typedef _Float16 v4h __attribute__((ext_vector_type(4))); typedef __bf16 v4y __attribute__((ext_vector_type(4))); void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr, global v8y* v8y_inptr, diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl index 6f7a93ef897ac..5533b6cfa7c8f 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl @@ -4,7 +4,7 @@ // REQUIRES: amdgpu-registered-target typedef short v4s __attribute__((ext_vector_type(4))); -typedef half v4h __attribute__((ext_vector_type(4))); +typedef _Float16 v4h __attribute__((ext_vector_type(4))); typedef __bf16 v4y __attribute__((ext_vector_type(4))); void amdgcn_global_load_tr(global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr, global v4y* v4y_inptr) diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl index b7323f1b41c2a..2f380bcf57d47 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl @@ -5,7 +5,7 @@ typedef int v2i __attribute__((ext_vector_type(2))); typedef short v8s __attribute__((ext_vector_type(8))); -typedef half v8h __attribute__((ext_vector_type(8))); +typedef _Float16 v8h __attribute__((ext_vector_type(8))); typedef __bf16 v8y __attribute__((ext_vector_type(8))); void amdgcn_global_load_tr(global 
v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr, global v8y* v8y_inptr) diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl index 186fc4eacfaaf..012844a90512d 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl @@ -4,7 +4,7 @@ typedef int v2i __attribute__((ext_vector_type(2))); typedef short v8s __attribute__((ext_vector_type(8))); -typedef half v8h __attribute__((ext_vector_type(8))); +typedef _Float16 v8h __attribute__((ext_vector_type(8))); typedef __bf16 v8y __attribute__((ext_vector_type(8))); // CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b64_v2i32( diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl index b6627f1c8114d..3d84d04f56eb8 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl @@ -3,7 +3,7 @@ // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 typedef short v4s __attribute__((ext_vector_type(4))); -typedef half v4h __attribute__((ext_vector_type(4))); +typedef _Float16 v4h __attribute__((ext_vector_type(4))); typedef __bf16 v4y __attribute__((ext_vector_type(4))); // CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b64_i32( diff --git a/clang/test/SemaHIP/hip-builtin-lvalue-to-rvalue.hip b/clang/test/SemaHIP/hip-builtin-lvalue-to-rvalue.hip index b3238d7b29d3e..db156af516f46 100644 --- a/clang/test/SemaHIP/hip-builtin-lvalue-to-rvalue.hip +++ b/clang/test/SemaHIP/hip-builtin-lvalue-to-rvalue.hip @@ -7,7 +7,7 @@ #define __device__ __attribute__((device)) -typedef __attribute__((__vector_size__(8 * sizeof(__fp16)))) __fp16 fp16x8_t; +typedef 
__attribute__((__vector_size__(8 * sizeof(_Float16)))) _Float16 fp16x8_t; // CHECK: ImplicitCastExpr {{.*}} <AddressSpaceConversion> // CHECK-NEXT: ImplicitCastExpr {{.*}} <LValueToRValue> _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
