https://github.com/ranapratap55 created 
https://github.com/llvm/llvm-project/pull/175039

For raytrace and wmma builtins, using 'x' in the def to take _Float16 for 
HIP/C++ and half for OpenCL.

>From 07bcd3136eed97d75d2d1849c40102293cdf6ec7 Mon Sep 17 00:00:00 2001
From: ranapratap55 <[email protected]>
Date: Thu, 8 Jan 2026 14:11:23 +0530
Subject: [PATCH] [AMDGPU] Modifies raytracing and wmma builtin def to take
 _Float16 for HIP/C++

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  | 24 ++---
 .../builtins-amdgcn-gfx12-wmma-w32.hip        | 62 ++++++++++++
 .../builtins-amdgcn-gfx12-wmma-w64.hip        | 62 ++++++++++++
 .../CodeGenHIP/builtins-amdgcn-raytracing.hip | 96 +++++++++++++++++++
 .../CodeGenHIP/builtins-amdgcn-wmma-w32.hip   | 89 +++++++++++++++++
 .../CodeGenHIP/builtins-amdgcn-wmma-w64.hip   | 90 +++++++++++++++++
 6 files changed, 411 insertions(+), 12 deletions(-)
 create mode 100644 clang/test/CodeGenHIP/builtins-amdgcn-gfx12-wmma-w32.hip
 create mode 100644 clang/test/CodeGenHIP/builtins-amdgcn-gfx12-wmma-w64.hip
 create mode 100644 clang/test/CodeGenHIP/builtins-amdgcn-raytracing.hip
 create mode 100644 clang/test/CodeGenHIP/builtins-amdgcn-wmma-w32.hip
 create mode 100644 clang/test/CodeGenHIP/builtins-amdgcn-wmma-w64.hip

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index bb823704c84c8..f189e34aac707 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -325,9 +325,9 @@ TARGET_BUILTIN(__builtin_amdgcn_s_ttracedata_imm, "vIs", 
"n", "gfx10-insts")
 // Postfix h indicates the 4/5-th arguments are half4.
 
//===----------------------------------------------------------------------===//
 TARGET_BUILTIN(__builtin_amdgcn_image_bvh_intersect_ray, 
"V4UiUifV4fV4fV4fV4Ui", "nc", "gfx10-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_bvh_intersect_ray_h, 
"V4UiUifV4fV4hV4hV4Ui", "nc", "gfx10-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_bvh_intersect_ray_h, 
"V4UiUifV4fV4xV4xV4Ui", "nc", "gfx10-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_bvh_intersect_ray_l, 
"V4UiWUifV4fV4fV4fV4Ui", "nc", "gfx10-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_bvh_intersect_ray_lh, 
"V4UiWUifV4fV4hV4hV4Ui", "nc", "gfx10-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_bvh_intersect_ray_lh, 
"V4UiWUifV4fV4xV4xV4Ui", "nc", "gfx10-insts")
 
 
 
//===----------------------------------------------------------------------===//
@@ -343,20 +343,20 @@ 
TARGET_BUILTIN(__builtin_amdgcn_s_wait_event_export_ready, "v", "n", "gfx11-inst
 // Postfix w32 indicates the builtin requires wavefront size of 32.
 // Postfix w64 indicates the builtin requires wavefront size of 64.
 
//===----------------------------------------------------------------------===//
-TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w32, "V8fV16hV16hV8f", 
"nc", "gfx11-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w32, "V8fV16xV16xV8f", 
"nc", "gfx11-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32, "V8fV16sV16sV8f", 
"nc", "gfx11-insts,wavefrontsize32")
-TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w32, 
"V16hV16hV16hV16hIb", "nc", "gfx11-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w32, 
"V16xV16xV16xV16xIb", "nc", "gfx11-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32, 
"V16sV16sV16sV16sIb", "nc", "gfx11-insts,wavefrontsize32")
-TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32, 
"V16hV16hV16hV16hIb", "nc", "gfx11-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32, 
"V16xV16xV16xV16xIb", "nc", "gfx11-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32, 
"V16sV16sV16sV16sIb", "nc", "gfx11-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32, 
"V8iIbV4iIbV4iV8iIb", "nc", "gfx11-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32, 
"V8iIbV2iIbV2iV8iIb", "nc", "gfx11-insts,wavefrontsize32")
 
-TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w64, "V4fV16hV16hV4f", 
"nc", "gfx11-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w64, "V4fV16xV16xV4f", 
"nc", "gfx11-insts,wavefrontsize64")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64, "V4fV16sV16sV4f", 
"nc", "gfx11-insts,wavefrontsize64")
-TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w64, "V8hV16hV16hV8hIb", 
"nc", "gfx11-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w64, "V8xV16xV16xV8xIb", 
"nc", "gfx11-insts,wavefrontsize64")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64, 
"V8sV16sV16sV8sIb", "nc", "gfx11-insts,wavefrontsize64")
-TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64, 
"V8hV16hV16hV8hIb", "nc", "gfx11-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64, 
"V8xV16xV16xV8xIb", "nc", "gfx11-insts,wavefrontsize64")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64, 
"V8sV16sV16sV8sIb", "nc", "gfx11-insts,wavefrontsize64")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64, 
"V4iIbV4iIbV4iV4iIb", "nc", "gfx11-insts,wavefrontsize64")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64, 
"V4iIbV2iIbV2iV4iIb", "nc", "gfx11-insts,wavefrontsize64")
@@ -590,9 +590,9 @@ 
TARGET_BUILTIN(__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn, "V2WUiUiUiV8UiIi",
 // Therefore, we add an "_gfx12" suffix to distinguish them from the existing
 // builtins.
 
//===----------------------------------------------------------------------===//
-TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12, 
"V8fV8hV8hV8f", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12, 
"V8fV8xV8xV8f", "nc", "gfx12-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12, 
"V8fV8sV8sV8f", "nc", "gfx12-insts,wavefrontsize32")
-TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12, 
"V8hV8hV8hV8h", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12, 
"V8xV8xV8xV8x", "nc", "gfx12-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12, 
"V8sV8sV8sV8s", "nc", "gfx12-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12, 
"V8iIbV2iIbV2iV8iIb", "nc", "gfx12-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12, 
"V8iIbiIbiV8iIb", "nc", "gfx12-insts,wavefrontsize32")
@@ -604,9 +604,9 @@ 
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12, "V8fV2iV2iV
 TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12, 
"V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12, 
"V8iIbV2iIbV2iV8iIb", "nc", "gfx12-insts,wavefrontsize32")
 
-TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12, 
"V4fV4hV4hV4f", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12, 
"V4fV4xV4xV4f", "nc", "gfx12-insts,wavefrontsize64")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12, 
"V4fV4sV4sV4f", "nc", "gfx12-insts,wavefrontsize64")
-TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12, 
"V4hV4hV4hV4h", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12, 
"V4xV4xV4xV4x", "nc", "gfx12-insts,wavefrontsize64")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12, 
"V4sV4sV4sV4s", "nc", "gfx12-insts,wavefrontsize64")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12, 
"V4iIbiIbiV4iIb", "nc", "gfx12-insts,wavefrontsize64")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12, 
"V4iIbiIbiV4iIb", "nc", "gfx12-insts,wavefrontsize64")
diff --git a/clang/test/CodeGenHIP/builtins-amdgcn-gfx12-wmma-w32.hip 
b/clang/test/CodeGenHIP/builtins-amdgcn-gfx12-wmma-w32.hip
new file mode 100644
index 0000000000000..6e4ec6bf8c107
--- /dev/null
+++ b/clang/test/CodeGenHIP/builtins-amdgcn-gfx12-wmma-w32.hip
@@ -0,0 +1,62 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --version 6
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 -emit-llvm 
-fcuda-is-device -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
+
+#define __device__ __attribute__((device))
+
+typedef float    v8f   __attribute__((ext_vector_type(8)));
+typedef _Float16 v8h   __attribute__((ext_vector_type(8)));
+
+// CHECK-GFX1200-LABEL: define dso_local void 
@_Z47test_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12_hipPDv8_fDv8_DF16_S1_S_(
+// CHECK-GFX1200-SAME: ptr noundef [[OUT:%.*]], <8 x half> noundef [[A:%.*]], 
<8 x half> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-GFX1200-NEXT:  [[ENTRY:.*:]]
+// CHECK-GFX1200-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-GFX1200-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16, 
addrspace(5)
+// CHECK-GFX1200-NEXT:    [[B_ADDR:%.*]] = alloca <8 x half>, align 16, 
addrspace(5)
+// CHECK-GFX1200-NEXT:    [[C_ADDR:%.*]] = alloca <8 x float>, align 32, 
addrspace(5)
+// CHECK-GFX1200-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-GFX1200-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[A_ADDR]] to ptr
+// CHECK-GFX1200-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[B_ADDR]] to ptr
+// CHECK-GFX1200-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[C_ADDR]] to ptr
+// CHECK-GFX1200-NEXT:    store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-GFX1200-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 
16
+// CHECK-GFX1200-NEXT:    store <8 x half> [[B]], ptr [[B_ADDR_ASCAST]], align 
16
+// CHECK-GFX1200-NEXT:    store <8 x float> [[C]], ptr [[C_ADDR_ASCAST]], 
align 32
+// CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr 
[[A_ADDR_ASCAST]], align 16
+// CHECK-GFX1200-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr 
[[B_ADDR_ASCAST]], align 16
+// CHECK-GFX1200-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr 
[[C_ADDR_ASCAST]], align 32
+// CHECK-GFX1200-NEXT:    [[TMP3:%.*]] = call contract <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> [[TMP0]], <8 x half> 
[[TMP1]], <8 x float> [[TMP2]])
+// CHECK-GFX1200-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], 
align 8
+// CHECK-GFX1200-NEXT:    store <8 x float> [[TMP3]], ptr [[TMP4]], align 32
+// CHECK-GFX1200-NEXT:    ret void
+//
+__device__ void test_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12_hip(v8f* out, v8h 
a, v8h b, v8f c) {
+  *out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a, b, c);
+}
+
+// CHECK-GFX1200-LABEL: define dso_local void 
@_Z47test_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12_hipPDv8_DF16_S_S_S_(
+// CHECK-GFX1200-SAME: ptr noundef [[OUT:%.*]], <8 x half> noundef [[A:%.*]], 
<8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-GFX1200-NEXT:  [[ENTRY:.*:]]
+// CHECK-GFX1200-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-GFX1200-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16, 
addrspace(5)
+// CHECK-GFX1200-NEXT:    [[B_ADDR:%.*]] = alloca <8 x half>, align 16, 
addrspace(5)
+// CHECK-GFX1200-NEXT:    [[C_ADDR:%.*]] = alloca <8 x half>, align 16, 
addrspace(5)
+// CHECK-GFX1200-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-GFX1200-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[A_ADDR]] to ptr
+// CHECK-GFX1200-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[B_ADDR]] to ptr
+// CHECK-GFX1200-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[C_ADDR]] to ptr
+// CHECK-GFX1200-NEXT:    store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-GFX1200-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 
16
+// CHECK-GFX1200-NEXT:    store <8 x half> [[B]], ptr [[B_ADDR_ASCAST]], align 
16
+// CHECK-GFX1200-NEXT:    store <8 x half> [[C]], ptr [[C_ADDR_ASCAST]], align 
16
+// CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr 
[[A_ADDR_ASCAST]], align 16
+// CHECK-GFX1200-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr 
[[B_ADDR_ASCAST]], align 16
+// CHECK-GFX1200-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr 
[[C_ADDR_ASCAST]], align 16
+// CHECK-GFX1200-NEXT:    [[TMP3:%.*]] = call contract <8 x half> 
@llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> [[TMP0]], <8 x half> 
[[TMP1]], <8 x half> [[TMP2]], i1 false)
+// CHECK-GFX1200-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], 
align 8
+// CHECK-GFX1200-NEXT:    store <8 x half> [[TMP3]], ptr [[TMP4]], align 16
+// CHECK-GFX1200-NEXT:    ret void
+//
+__device__ void test_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12_hip(v8h* out, v8h 
a, v8h b, v8h c) {
+  *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12(a, b, c);
+}
diff --git a/clang/test/CodeGenHIP/builtins-amdgcn-gfx12-wmma-w64.hip 
b/clang/test/CodeGenHIP/builtins-amdgcn-gfx12-wmma-w64.hip
new file mode 100644
index 0000000000000..21bae28f85e8a
--- /dev/null
+++ b/clang/test/CodeGenHIP/builtins-amdgcn-gfx12-wmma-w64.hip
@@ -0,0 +1,62 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --version 6
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 
-target-feature +wavefrontsize64 -emit-llvm -fcuda-is-device -o - %s | 
FileCheck %s --check-prefix=CHECK-GFX1200
+
+#define __device__ __attribute__((device))
+
+typedef float    v4f   __attribute__((ext_vector_type(4)));
+typedef _Float16 v4h   __attribute__((ext_vector_type(4)));
+
+// CHECK-GFX1200-LABEL: define dso_local void 
@_Z47test_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12_hipPDv4_fDv4_DF16_S1_S_(
+// CHECK-GFX1200-SAME: ptr noundef [[OUT:%.*]], <4 x half> noundef [[A:%.*]], 
<4 x half> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-GFX1200-NEXT:  [[ENTRY:.*:]]
+// CHECK-GFX1200-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-GFX1200-NEXT:    [[A_ADDR:%.*]] = alloca <4 x half>, align 8, 
addrspace(5)
+// CHECK-GFX1200-NEXT:    [[B_ADDR:%.*]] = alloca <4 x half>, align 8, 
addrspace(5)
+// CHECK-GFX1200-NEXT:    [[C_ADDR:%.*]] = alloca <4 x float>, align 16, 
addrspace(5)
+// CHECK-GFX1200-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-GFX1200-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[A_ADDR]] to ptr
+// CHECK-GFX1200-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[B_ADDR]] to ptr
+// CHECK-GFX1200-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[C_ADDR]] to ptr
+// CHECK-GFX1200-NEXT:    store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-GFX1200-NEXT:    store <4 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 
8
+// CHECK-GFX1200-NEXT:    store <4 x half> [[B]], ptr [[B_ADDR_ASCAST]], align 
8
+// CHECK-GFX1200-NEXT:    store <4 x float> [[C]], ptr [[C_ADDR_ASCAST]], 
align 16
+// CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = load <4 x half>, ptr 
[[A_ADDR_ASCAST]], align 8
+// CHECK-GFX1200-NEXT:    [[TMP1:%.*]] = load <4 x half>, ptr 
[[B_ADDR_ASCAST]], align 8
+// CHECK-GFX1200-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr 
[[C_ADDR_ASCAST]], align 16
+// CHECK-GFX1200-NEXT:    [[TMP3:%.*]] = call contract <4 x float> 
@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> [[TMP0]], <4 x half> 
[[TMP1]], <4 x float> [[TMP2]])
+// CHECK-GFX1200-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], 
align 8
+// CHECK-GFX1200-NEXT:    store <4 x float> [[TMP3]], ptr [[TMP4]], align 16
+// CHECK-GFX1200-NEXT:    ret void
+//
+__device__ void test_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12_hip(v4f* out, v4h 
a, v4h b, v4f c) {
+  *out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12(a, b, c);
+}
+
+// CHECK-GFX1200-LABEL: define dso_local void 
@_Z47test_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12_hipPDv4_DF16_S_S_S_(
+// CHECK-GFX1200-SAME: ptr noundef [[OUT:%.*]], <4 x half> noundef [[A:%.*]], 
<4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-GFX1200-NEXT:  [[ENTRY:.*:]]
+// CHECK-GFX1200-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-GFX1200-NEXT:    [[A_ADDR:%.*]] = alloca <4 x half>, align 8, 
addrspace(5)
+// CHECK-GFX1200-NEXT:    [[B_ADDR:%.*]] = alloca <4 x half>, align 8, 
addrspace(5)
+// CHECK-GFX1200-NEXT:    [[C_ADDR:%.*]] = alloca <4 x half>, align 8, 
addrspace(5)
+// CHECK-GFX1200-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-GFX1200-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[A_ADDR]] to ptr
+// CHECK-GFX1200-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[B_ADDR]] to ptr
+// CHECK-GFX1200-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[C_ADDR]] to ptr
+// CHECK-GFX1200-NEXT:    store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-GFX1200-NEXT:    store <4 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 
8
+// CHECK-GFX1200-NEXT:    store <4 x half> [[B]], ptr [[B_ADDR_ASCAST]], align 
8
+// CHECK-GFX1200-NEXT:    store <4 x half> [[C]], ptr [[C_ADDR_ASCAST]], align 
8
+// CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = load <4 x half>, ptr 
[[A_ADDR_ASCAST]], align 8
+// CHECK-GFX1200-NEXT:    [[TMP1:%.*]] = load <4 x half>, ptr 
[[B_ADDR_ASCAST]], align 8
+// CHECK-GFX1200-NEXT:    [[TMP2:%.*]] = load <4 x half>, ptr 
[[C_ADDR_ASCAST]], align 8
+// CHECK-GFX1200-NEXT:    [[TMP3:%.*]] = call contract <4 x half> 
@llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> [[TMP0]], <4 x half> 
[[TMP1]], <4 x half> [[TMP2]], i1 false)
+// CHECK-GFX1200-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], 
align 8
+// CHECK-GFX1200-NEXT:    store <4 x half> [[TMP3]], ptr [[TMP4]], align 8
+// CHECK-GFX1200-NEXT:    ret void
+//
+__device__ void test_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12_hip(v4h* out, v4h 
a, v4h b, v4h c) {
+  *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12(a, b, c);
+}
diff --git a/clang/test/CodeGenHIP/builtins-amdgcn-raytracing.hip 
b/clang/test/CodeGenHIP/builtins-amdgcn-raytracing.hip
new file mode 100644
index 0000000000000..1f3c65201da30
--- /dev/null
+++ b/clang/test/CodeGenHIP/builtins-amdgcn-raytracing.hip
@@ -0,0 +1,96 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --version 6
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1030 -emit-llvm 
-fcuda-is-device -o - %s | FileCheck %s --check-prefix=CHECK-GFX1030
+
+#define __device__ __attribute__((device))
+
+typedef unsigned int  v4ui  __attribute__((ext_vector_type(4)));
+typedef float         v4f   __attribute__((ext_vector_type(4)));
+typedef _Float16      v4h   __attribute__((ext_vector_type(4)));
+typedef unsigned long ulong;
+
+// CHECK-GFX1030-LABEL: define dso_local void 
@_Z34test_image_bvh_intersect_ray_h_hipPDv4_jjfDv4_fDv4_DF16_S2_S_(
+// CHECK-GFX1030-SAME: ptr noundef [[OUT:%.*]], i32 noundef [[NODE:%.*]], 
float noundef [[TMAX:%.*]], <4 x float> noundef [[ORIGIN:%.*]], <4 x half> 
noundef [[DIR:%.*]], <4 x half> noundef [[INV_DIR:%.*]], <4 x i32> noundef 
[[EXT:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-GFX1030-NEXT:  [[ENTRY:.*:]]
+// CHECK-GFX1030-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-GFX1030-NEXT:    [[NODE_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-GFX1030-NEXT:    [[TMAX_ADDR:%.*]] = alloca float, align 4, 
addrspace(5)
+// CHECK-GFX1030-NEXT:    [[ORIGIN_ADDR:%.*]] = alloca <4 x float>, align 16, 
addrspace(5)
+// CHECK-GFX1030-NEXT:    [[DIR_ADDR:%.*]] = alloca <4 x half>, align 8, 
addrspace(5)
+// CHECK-GFX1030-NEXT:    [[INV_DIR_ADDR:%.*]] = alloca <4 x half>, align 8, 
addrspace(5)
+// CHECK-GFX1030-NEXT:    [[EXT_ADDR:%.*]] = alloca <4 x i32>, align 16, 
addrspace(5)
+// CHECK-GFX1030-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-GFX1030-NEXT:    [[NODE_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[NODE_ADDR]] to ptr
+// CHECK-GFX1030-NEXT:    [[TMAX_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[TMAX_ADDR]] to ptr
+// CHECK-GFX1030-NEXT:    [[ORIGIN_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[ORIGIN_ADDR]] to ptr
+// CHECK-GFX1030-NEXT:    [[DIR_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[DIR_ADDR]] to ptr
+// CHECK-GFX1030-NEXT:    [[INV_DIR_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[INV_DIR_ADDR]] to ptr
+// CHECK-GFX1030-NEXT:    [[EXT_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[EXT_ADDR]] to ptr
+// CHECK-GFX1030-NEXT:    store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-GFX1030-NEXT:    store i32 [[NODE]], ptr [[NODE_ADDR_ASCAST]], align 4
+// CHECK-GFX1030-NEXT:    store float [[TMAX]], ptr [[TMAX_ADDR_ASCAST]], 
align 4
+// CHECK-GFX1030-NEXT:    store <4 x float> [[ORIGIN]], ptr 
[[ORIGIN_ADDR_ASCAST]], align 16
+// CHECK-GFX1030-NEXT:    store <4 x half> [[DIR]], ptr [[DIR_ADDR_ASCAST]], 
align 8
+// CHECK-GFX1030-NEXT:    store <4 x half> [[INV_DIR]], ptr 
[[INV_DIR_ADDR_ASCAST]], align 8
+// CHECK-GFX1030-NEXT:    store <4 x i32> [[EXT]], ptr [[EXT_ADDR_ASCAST]], 
align 16
+// CHECK-GFX1030-NEXT:    [[TMP0:%.*]] = load i32, ptr [[NODE_ADDR_ASCAST]], 
align 4
+// CHECK-GFX1030-NEXT:    [[TMP1:%.*]] = load float, ptr [[TMAX_ADDR_ASCAST]], 
align 4
+// CHECK-GFX1030-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr 
[[ORIGIN_ADDR_ASCAST]], align 16
+// CHECK-GFX1030-NEXT:    [[TMP3:%.*]] = load <4 x half>, ptr 
[[DIR_ADDR_ASCAST]], align 8
+// CHECK-GFX1030-NEXT:    [[TMP4:%.*]] = load <4 x half>, ptr 
[[INV_DIR_ADDR_ASCAST]], align 8
+// CHECK-GFX1030-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr 
[[EXT_ADDR_ASCAST]], align 16
+// CHECK-GFX1030-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP2]], 
<4 x float> [[TMP2]], <3 x i32> <i32 0, i32 1, i32 2>
+// CHECK-GFX1030-NEXT:    [[TMP7:%.*]] = shufflevector <4 x half> [[TMP3]], <4 
x half> [[TMP3]], <3 x i32> <i32 0, i32 1, i32 2>
+// CHECK-GFX1030-NEXT:    [[TMP8:%.*]] = shufflevector <4 x half> [[TMP4]], <4 
x half> [[TMP4]], <3 x i32> <i32 0, i32 1, i32 2>
+// CHECK-GFX1030-NEXT:    [[TMP9:%.*]] = call <4 x i32> 
@llvm.amdgcn.image.bvh.intersect.ray.i32.v3f16(i32 [[TMP0]], float [[TMP1]], <3 
x float> [[TMP6]], <3 x half> [[TMP7]], <3 x half> [[TMP8]], <4 x i32> [[TMP5]])
+// CHECK-GFX1030-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], 
align 8
+// CHECK-GFX1030-NEXT:    store <4 x i32> [[TMP9]], ptr [[TMP10]], align 16
+// CHECK-GFX1030-NEXT:    ret void
+//
+__device__ void test_image_bvh_intersect_ray_h_hip(v4ui* out, unsigned int 
node, float tmax, v4f origin, v4h dir, v4h inv_dir, v4ui ext) {
+  *out = __builtin_amdgcn_image_bvh_intersect_ray_h(node, tmax, origin, dir, 
inv_dir, ext);
+}
+
+
+// CHECK-GFX1030-LABEL: define dso_local void 
@_Z35test_image_bvh_intersect_ray_lh_hipPDv4_jmfDv4_fDv4_DF16_S2_S_(
+// CHECK-GFX1030-SAME: ptr noundef [[OUT:%.*]], i64 noundef [[NODE:%.*]], 
float noundef [[TMAX:%.*]], <4 x float> noundef [[ORIGIN:%.*]], <4 x half> 
noundef [[DIR:%.*]], <4 x half> noundef [[INV_DIR:%.*]], <4 x i32> noundef 
[[EXT:%.*]]) #[[ATTR0]] {
+// CHECK-GFX1030-NEXT:  [[ENTRY:.*:]]
+// CHECK-GFX1030-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-GFX1030-NEXT:    [[NODE_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
+// CHECK-GFX1030-NEXT:    [[TMAX_ADDR:%.*]] = alloca float, align 4, 
addrspace(5)
+// CHECK-GFX1030-NEXT:    [[ORIGIN_ADDR:%.*]] = alloca <4 x float>, align 16, 
addrspace(5)
+// CHECK-GFX1030-NEXT:    [[DIR_ADDR:%.*]] = alloca <4 x half>, align 8, 
addrspace(5)
+// CHECK-GFX1030-NEXT:    [[INV_DIR_ADDR:%.*]] = alloca <4 x half>, align 8, 
addrspace(5)
+// CHECK-GFX1030-NEXT:    [[EXT_ADDR:%.*]] = alloca <4 x i32>, align 16, 
addrspace(5)
+// CHECK-GFX1030-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-GFX1030-NEXT:    [[NODE_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[NODE_ADDR]] to ptr
+// CHECK-GFX1030-NEXT:    [[TMAX_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[TMAX_ADDR]] to ptr
+// CHECK-GFX1030-NEXT:    [[ORIGIN_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[ORIGIN_ADDR]] to ptr
+// CHECK-GFX1030-NEXT:    [[DIR_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[DIR_ADDR]] to ptr
+// CHECK-GFX1030-NEXT:    [[INV_DIR_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[INV_DIR_ADDR]] to ptr
+// CHECK-GFX1030-NEXT:    [[EXT_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[EXT_ADDR]] to ptr
+// CHECK-GFX1030-NEXT:    store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-GFX1030-NEXT:    store i64 [[NODE]], ptr [[NODE_ADDR_ASCAST]], align 8
+// CHECK-GFX1030-NEXT:    store float [[TMAX]], ptr [[TMAX_ADDR_ASCAST]], 
align 4
+// CHECK-GFX1030-NEXT:    store <4 x float> [[ORIGIN]], ptr 
[[ORIGIN_ADDR_ASCAST]], align 16
+// CHECK-GFX1030-NEXT:    store <4 x half> [[DIR]], ptr [[DIR_ADDR_ASCAST]], 
align 8
+// CHECK-GFX1030-NEXT:    store <4 x half> [[INV_DIR]], ptr 
[[INV_DIR_ADDR_ASCAST]], align 8
+// CHECK-GFX1030-NEXT:    store <4 x i32> [[EXT]], ptr [[EXT_ADDR_ASCAST]], 
align 16
+// CHECK-GFX1030-NEXT:    [[TMP0:%.*]] = load i64, ptr [[NODE_ADDR_ASCAST]], 
align 8
+// CHECK-GFX1030-NEXT:    [[TMP1:%.*]] = load float, ptr [[TMAX_ADDR_ASCAST]], 
align 4
+// CHECK-GFX1030-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr 
[[ORIGIN_ADDR_ASCAST]], align 16
+// CHECK-GFX1030-NEXT:    [[TMP3:%.*]] = load <4 x half>, ptr 
[[DIR_ADDR_ASCAST]], align 8
+// CHECK-GFX1030-NEXT:    [[TMP4:%.*]] = load <4 x half>, ptr 
[[INV_DIR_ADDR_ASCAST]], align 8
+// CHECK-GFX1030-NEXT:    [[TMP5:%.*]] = load <4 x i32>, ptr 
[[EXT_ADDR_ASCAST]], align 16
+// CHECK-GFX1030-NEXT:    [[TMP6:%.*]] = shufflevector <4 x float> [[TMP2]], 
<4 x float> [[TMP2]], <3 x i32> <i32 0, i32 1, i32 2>
+// CHECK-GFX1030-NEXT:    [[TMP7:%.*]] = shufflevector <4 x half> [[TMP3]], <4 
x half> [[TMP3]], <3 x i32> <i32 0, i32 1, i32 2>
+// CHECK-GFX1030-NEXT:    [[TMP8:%.*]] = shufflevector <4 x half> [[TMP4]], <4 
x half> [[TMP4]], <3 x i32> <i32 0, i32 1, i32 2>
+// CHECK-GFX1030-NEXT:    [[TMP9:%.*]] = call <4 x i32> 
@llvm.amdgcn.image.bvh.intersect.ray.i64.v3f16(i64 [[TMP0]], float [[TMP1]], <3 
x float> [[TMP6]], <3 x half> [[TMP7]], <3 x half> [[TMP8]], <4 x i32> [[TMP5]])
+// CHECK-GFX1030-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], 
align 8
+// CHECK-GFX1030-NEXT:    store <4 x i32> [[TMP9]], ptr [[TMP10]], align 16
+// CHECK-GFX1030-NEXT:    ret void
+//
+__device__ void test_image_bvh_intersect_ray_lh_hip(v4ui* out, ulong node, 
float tmax, v4f origin, v4h dir, v4h inv_dir, v4ui ext) {
+  *out = __builtin_amdgcn_image_bvh_intersect_ray_lh(node, tmax, origin, dir, 
inv_dir, ext);
+}
+
diff --git a/clang/test/CodeGenHIP/builtins-amdgcn-wmma-w32.hip 
b/clang/test/CodeGenHIP/builtins-amdgcn-wmma-w32.hip
new file mode 100644
index 0000000000000..12c3cf14ee395
--- /dev/null
+++ b/clang/test/CodeGenHIP/builtins-amdgcn-wmma-w32.hip
@@ -0,0 +1,89 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --version 6
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1100 -emit-llvm 
-fcuda-is-device -o - %s | FileCheck %s --check-prefix=CHECK-GFX1100
+
+#define __device__ __attribute__((device))
+
+typedef float    v8f   __attribute__((ext_vector_type(8)));
+typedef _Float16 v16h  __attribute__((ext_vector_type(16)));
+
+// CHECK-GFX1100-LABEL: define dso_local void 
@_Z41test_amdgcn_wmma_f32_16x16x16_f16_w32_hipPDv8_fDv16_DF16_S1_S_(
+// CHECK-GFX1100-SAME: ptr noundef [[OUT:%.*]], <16 x half> noundef [[A:%.*]], 
<16 x half> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) #[[ATTR0:[0-9]+]] 
{
+// CHECK-GFX1100-NEXT:  [[ENTRY:.*:]]
+// CHECK-GFX1100-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-GFX1100-NEXT:    [[A_ADDR:%.*]] = alloca <16 x half>, align 32, 
addrspace(5)
+// CHECK-GFX1100-NEXT:    [[B_ADDR:%.*]] = alloca <16 x half>, align 32, 
addrspace(5)
+// CHECK-GFX1100-NEXT:    [[C_ADDR:%.*]] = alloca <8 x float>, align 32, 
addrspace(5)
+// CHECK-GFX1100-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-GFX1100-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[A_ADDR]] to ptr
+// CHECK-GFX1100-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[B_ADDR]] to ptr
+// CHECK-GFX1100-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[C_ADDR]] to ptr
+// CHECK-GFX1100-NEXT:    store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-GFX1100-NEXT:    store <16 x half> [[A]], ptr [[A_ADDR_ASCAST]], 
align 32
+// CHECK-GFX1100-NEXT:    store <16 x half> [[B]], ptr [[B_ADDR_ASCAST]], 
align 32
+// CHECK-GFX1100-NEXT:    store <8 x float> [[C]], ptr [[C_ADDR_ASCAST]], 
align 32
+// CHECK-GFX1100-NEXT:    [[TMP0:%.*]] = load <16 x half>, ptr 
[[A_ADDR_ASCAST]], align 32
+// CHECK-GFX1100-NEXT:    [[TMP1:%.*]] = load <16 x half>, ptr 
[[B_ADDR_ASCAST]], align 32
+// CHECK-GFX1100-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr 
[[C_ADDR_ASCAST]], align 32
+// CHECK-GFX1100-NEXT:    [[TMP3:%.*]] = call contract <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> [[TMP0]], <16 x 
half> [[TMP1]], <8 x float> [[TMP2]])
+// CHECK-GFX1100-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], 
align 8
+// CHECK-GFX1100-NEXT:    store <8 x float> [[TMP3]], ptr [[TMP4]], align 32
+// CHECK-GFX1100-NEXT:    ret void
+//
+__device__ void test_amdgcn_wmma_f32_16x16x16_f16_w32_hip(v8f* out, v16h a, 
v16h b, v8f c) {
+  *out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(a, b, c);
+}
+
+// CHECK-GFX1100-LABEL: define dso_local void 
@_Z41test_amdgcn_wmma_f16_16x16x16_f16_w32_hipPDv16_DF16_S_S_S_(
+// CHECK-GFX1100-SAME: ptr noundef [[OUT:%.*]], <16 x half> noundef [[A:%.*]], 
<16 x half> noundef [[B:%.*]], <16 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-GFX1100-NEXT:  [[ENTRY:.*:]]
+// CHECK-GFX1100-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-GFX1100-NEXT:    [[A_ADDR:%.*]] = alloca <16 x half>, align 32, 
addrspace(5)
+// CHECK-GFX1100-NEXT:    [[B_ADDR:%.*]] = alloca <16 x half>, align 32, 
addrspace(5)
+// CHECK-GFX1100-NEXT:    [[C_ADDR:%.*]] = alloca <16 x half>, align 32, 
addrspace(5)
+// CHECK-GFX1100-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-GFX1100-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[A_ADDR]] to ptr
+// CHECK-GFX1100-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[B_ADDR]] to ptr
+// CHECK-GFX1100-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[C_ADDR]] to ptr
+// CHECK-GFX1100-NEXT:    store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-GFX1100-NEXT:    store <16 x half> [[A]], ptr [[A_ADDR_ASCAST]], 
align 32
+// CHECK-GFX1100-NEXT:    store <16 x half> [[B]], ptr [[B_ADDR_ASCAST]], 
align 32
+// CHECK-GFX1100-NEXT:    store <16 x half> [[C]], ptr [[C_ADDR_ASCAST]], 
align 32
+// CHECK-GFX1100-NEXT:    [[TMP0:%.*]] = load <16 x half>, ptr 
[[A_ADDR_ASCAST]], align 32
+// CHECK-GFX1100-NEXT:    [[TMP1:%.*]] = load <16 x half>, ptr 
[[B_ADDR_ASCAST]], align 32
+// CHECK-GFX1100-NEXT:    [[TMP2:%.*]] = load <16 x half>, ptr 
[[C_ADDR_ASCAST]], align 32
+// CHECK-GFX1100-NEXT:    [[TMP3:%.*]] = call contract <16 x half> 
@llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[TMP0]], <16 x 
half> [[TMP1]], <16 x half> [[TMP2]], i1 true)
+// CHECK-GFX1100-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], 
align 8
+// CHECK-GFX1100-NEXT:    store <16 x half> [[TMP3]], ptr [[TMP4]], align 32
+// CHECK-GFX1100-NEXT:    ret void
+//
+__device__ void test_amdgcn_wmma_f16_16x16x16_f16_w32_hip(v16h* out, v16h a, 
v16h b, v16h c) {
+  *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32(a, b, c, true);
+}
+
+// CHECK-GFX1100-LABEL: define dso_local void 
@_Z46test_amdgcn_wmma_f16_16x16x16_f16_tied_w32_hipPDv16_DF16_S_S_S_(
+// CHECK-GFX1100-SAME: ptr noundef [[OUT:%.*]], <16 x half> noundef [[A:%.*]], 
<16 x half> noundef [[B:%.*]], <16 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-GFX1100-NEXT:  [[ENTRY:.*:]]
+// CHECK-GFX1100-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-GFX1100-NEXT:    [[A_ADDR:%.*]] = alloca <16 x half>, align 32, 
addrspace(5)
+// CHECK-GFX1100-NEXT:    [[B_ADDR:%.*]] = alloca <16 x half>, align 32, 
addrspace(5)
+// CHECK-GFX1100-NEXT:    [[C_ADDR:%.*]] = alloca <16 x half>, align 32, 
addrspace(5)
+// CHECK-GFX1100-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-GFX1100-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[A_ADDR]] to ptr
+// CHECK-GFX1100-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[B_ADDR]] to ptr
+// CHECK-GFX1100-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[C_ADDR]] to ptr
+// CHECK-GFX1100-NEXT:    store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-GFX1100-NEXT:    store <16 x half> [[A]], ptr [[A_ADDR_ASCAST]], 
align 32
+// CHECK-GFX1100-NEXT:    store <16 x half> [[B]], ptr [[B_ADDR_ASCAST]], 
align 32
+// CHECK-GFX1100-NEXT:    store <16 x half> [[C]], ptr [[C_ADDR_ASCAST]], 
align 32
+// CHECK-GFX1100-NEXT:    [[TMP0:%.*]] = load <16 x half>, ptr 
[[A_ADDR_ASCAST]], align 32
+// CHECK-GFX1100-NEXT:    [[TMP1:%.*]] = load <16 x half>, ptr 
[[B_ADDR_ASCAST]], align 32
+// CHECK-GFX1100-NEXT:    [[TMP2:%.*]] = load <16 x half>, ptr 
[[C_ADDR_ASCAST]], align 32
+// CHECK-GFX1100-NEXT:    [[TMP3:%.*]] = call contract <16 x half> 
@llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v16f16.v16f16(<16 x half> [[TMP0]], <16 
x half> [[TMP1]], <16 x half> [[TMP2]], i1 true)
+// CHECK-GFX1100-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], 
align 8
+// CHECK-GFX1100-NEXT:    store <16 x half> [[TMP3]], ptr [[TMP4]], align 32
+// CHECK-GFX1100-NEXT:    ret void
+//
+__device__ void test_amdgcn_wmma_f16_16x16x16_f16_tied_w32_hip(v16h* out, v16h 
a, v16h b, v16h c) {
+  *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32(a, b, c, true);
+}
diff --git a/clang/test/CodeGenHIP/builtins-amdgcn-wmma-w64.hip 
b/clang/test/CodeGenHIP/builtins-amdgcn-wmma-w64.hip
new file mode 100644
index 0000000000000..d4dbe6e975ef0
--- /dev/null
+++ b/clang/test/CodeGenHIP/builtins-amdgcn-wmma-w64.hip
@@ -0,0 +1,90 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --version 6
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1100 
-target-feature +wavefrontsize64 -emit-llvm -fcuda-is-device -o - %s | 
FileCheck %s --check-prefix=CHECK-GFX1100
+
+#define __device__ __attribute__((device))
+
+typedef float    v4f   __attribute__((ext_vector_type(4)));
+typedef _Float16 v16h  __attribute__((ext_vector_type(16)));
+typedef _Float16 v8h   __attribute__((ext_vector_type(8)));
+
+// CHECK-GFX1100-LABEL: define dso_local void 
@_Z41test_amdgcn_wmma_f32_16x16x16_f16_w64_hipPDv4_fDv16_DF16_S1_S_(
+// CHECK-GFX1100-SAME: ptr noundef [[OUT:%.*]], <16 x half> noundef [[A:%.*]], 
<16 x half> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) #[[ATTR0:[0-9]+]] 
{
+// CHECK-GFX1100-NEXT:  [[ENTRY:.*:]]
+// CHECK-GFX1100-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-GFX1100-NEXT:    [[A_ADDR:%.*]] = alloca <16 x half>, align 32, 
addrspace(5)
+// CHECK-GFX1100-NEXT:    [[B_ADDR:%.*]] = alloca <16 x half>, align 32, 
addrspace(5)
+// CHECK-GFX1100-NEXT:    [[C_ADDR:%.*]] = alloca <4 x float>, align 16, 
addrspace(5)
+// CHECK-GFX1100-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-GFX1100-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[A_ADDR]] to ptr
+// CHECK-GFX1100-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[B_ADDR]] to ptr
+// CHECK-GFX1100-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[C_ADDR]] to ptr
+// CHECK-GFX1100-NEXT:    store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-GFX1100-NEXT:    store <16 x half> [[A]], ptr [[A_ADDR_ASCAST]], 
align 32
+// CHECK-GFX1100-NEXT:    store <16 x half> [[B]], ptr [[B_ADDR_ASCAST]], 
align 32
+// CHECK-GFX1100-NEXT:    store <4 x float> [[C]], ptr [[C_ADDR_ASCAST]], 
align 16
+// CHECK-GFX1100-NEXT:    [[TMP0:%.*]] = load <16 x half>, ptr 
[[A_ADDR_ASCAST]], align 32
+// CHECK-GFX1100-NEXT:    [[TMP1:%.*]] = load <16 x half>, ptr 
[[B_ADDR_ASCAST]], align 32
+// CHECK-GFX1100-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr 
[[C_ADDR_ASCAST]], align 16
+// CHECK-GFX1100-NEXT:    [[TMP3:%.*]] = call contract <4 x float> 
@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half> [[TMP0]], <16 x 
half> [[TMP1]], <4 x float> [[TMP2]])
+// CHECK-GFX1100-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], 
align 8
+// CHECK-GFX1100-NEXT:    store <4 x float> [[TMP3]], ptr [[TMP4]], align 16
+// CHECK-GFX1100-NEXT:    ret void
+//
+__device__ void test_amdgcn_wmma_f32_16x16x16_f16_w64_hip(v4f* out, v16h a, 
v16h b, v4f c) {
+  *out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64(a, b, c);
+}
+
+// CHECK-GFX1100-LABEL: define dso_local void 
@_Z41test_amdgcn_wmma_f16_16x16x16_f16_w64_hipPDv8_DF16_Dv16_DF16_S1_S_(
+// CHECK-GFX1100-SAME: ptr noundef [[OUT:%.*]], <16 x half> noundef [[A:%.*]], 
<16 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-GFX1100-NEXT:  [[ENTRY:.*:]]
+// CHECK-GFX1100-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-GFX1100-NEXT:    [[A_ADDR:%.*]] = alloca <16 x half>, align 32, 
addrspace(5)
+// CHECK-GFX1100-NEXT:    [[B_ADDR:%.*]] = alloca <16 x half>, align 32, 
addrspace(5)
+// CHECK-GFX1100-NEXT:    [[C_ADDR:%.*]] = alloca <8 x half>, align 16, 
addrspace(5)
+// CHECK-GFX1100-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-GFX1100-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[A_ADDR]] to ptr
+// CHECK-GFX1100-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[B_ADDR]] to ptr
+// CHECK-GFX1100-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[C_ADDR]] to ptr
+// CHECK-GFX1100-NEXT:    store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-GFX1100-NEXT:    store <16 x half> [[A]], ptr [[A_ADDR_ASCAST]], 
align 32
+// CHECK-GFX1100-NEXT:    store <16 x half> [[B]], ptr [[B_ADDR_ASCAST]], 
align 32
+// CHECK-GFX1100-NEXT:    store <8 x half> [[C]], ptr [[C_ADDR_ASCAST]], align 
16
+// CHECK-GFX1100-NEXT:    [[TMP0:%.*]] = load <16 x half>, ptr 
[[A_ADDR_ASCAST]], align 32
+// CHECK-GFX1100-NEXT:    [[TMP1:%.*]] = load <16 x half>, ptr 
[[B_ADDR_ASCAST]], align 32
+// CHECK-GFX1100-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr 
[[C_ADDR_ASCAST]], align 16
+// CHECK-GFX1100-NEXT:    [[TMP3:%.*]] = call contract <8 x half> 
@llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> [[TMP0]], <16 x 
half> [[TMP1]], <8 x half> [[TMP2]], i1 true)
+// CHECK-GFX1100-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], 
align 8
+// CHECK-GFX1100-NEXT:    store <8 x half> [[TMP3]], ptr [[TMP4]], align 16
+// CHECK-GFX1100-NEXT:    ret void
+//
+__device__ void test_amdgcn_wmma_f16_16x16x16_f16_w64_hip(v8h* out, v16h a, 
v16h b, v8h c) {
+  *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w64(a, b, c, true);
+}
+
+// CHECK-GFX1100-LABEL: define dso_local void 
@_Z46test_amdgcn_wmma_f16_16x16x16_f16_tied_w64_hipPDv8_DF16_Dv16_DF16_S1_S_(
+// CHECK-GFX1100-SAME: ptr noundef [[OUT:%.*]], <16 x half> noundef [[A:%.*]], 
<16 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-GFX1100-NEXT:  [[ENTRY:.*:]]
+// CHECK-GFX1100-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-GFX1100-NEXT:    [[A_ADDR:%.*]] = alloca <16 x half>, align 32, 
addrspace(5)
+// CHECK-GFX1100-NEXT:    [[B_ADDR:%.*]] = alloca <16 x half>, align 32, 
addrspace(5)
+// CHECK-GFX1100-NEXT:    [[C_ADDR:%.*]] = alloca <8 x half>, align 16, 
addrspace(5)
+// CHECK-GFX1100-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-GFX1100-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[A_ADDR]] to ptr
+// CHECK-GFX1100-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[B_ADDR]] to ptr
+// CHECK-GFX1100-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[C_ADDR]] to ptr
+// CHECK-GFX1100-NEXT:    store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-GFX1100-NEXT:    store <16 x half> [[A]], ptr [[A_ADDR_ASCAST]], 
align 32
+// CHECK-GFX1100-NEXT:    store <16 x half> [[B]], ptr [[B_ADDR_ASCAST]], 
align 32
+// CHECK-GFX1100-NEXT:    store <8 x half> [[C]], ptr [[C_ADDR_ASCAST]], align 
16
+// CHECK-GFX1100-NEXT:    [[TMP0:%.*]] = load <16 x half>, ptr 
[[A_ADDR_ASCAST]], align 32
+// CHECK-GFX1100-NEXT:    [[TMP1:%.*]] = load <16 x half>, ptr 
[[B_ADDR_ASCAST]], align 32
+// CHECK-GFX1100-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr 
[[C_ADDR_ASCAST]], align 16
+// CHECK-GFX1100-NEXT:    [[TMP3:%.*]] = call contract <8 x half> 
@llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v8f16.v16f16(<16 x half> [[TMP0]], <16 
x half> [[TMP1]], <8 x half> [[TMP2]], i1 true)
+// CHECK-GFX1100-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], 
align 8
+// CHECK-GFX1100-NEXT:    store <8 x half> [[TMP3]], ptr [[TMP4]], align 16
+// CHECK-GFX1100-NEXT:    ret void
+//
+__device__ void test_amdgcn_wmma_f16_16x16x16_f16_tied_w64_hip(v8h* out, v16h 
a, v16h b, v8h c) {
+  *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64(a, b, c, true);
+}

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to