[clang] [llvm] [AMDGPU] Fix operand types for `V_DOT2_F32_BF16` (PR #82044)

2024-02-20 Thread Shilei Tian via cfe-commits

https://github.com/shiltian closed 
https://github.com/llvm/llvm-project/pull/82044
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Fix operand types for `V_DOT2_F32_BF16` (PR #82044)

2024-02-20 Thread Stanislav Mekhanoshin via cfe-commits

https://github.com/rampitec approved this pull request.


https://github.com/llvm/llvm-project/pull/82044
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Fix operand types for `V_DOT2_F32_BF16` (PR #82044)

2024-02-19 Thread Shilei Tian via cfe-commits

https://github.com/shiltian updated 
https://github.com/llvm/llvm-project/pull/82044

>From a769826f1ff424dab5377fff249bfdd1465633bb Mon Sep 17 00:00:00 2001
From: Shilei Tian 
Date: Mon, 19 Feb 2024 23:06:14 -0500
Subject: [PATCH] [AMDGPU] Fix operand types for `V_DOT2_F32_BF16`

---
 .../builtins-amdgcn-dl-insts-gfx11.cl |  4 +-
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |  4 +-
 llvm/lib/Target/AMDGPU/SIInstrInfo.td |  2 +-
 llvm/lib/Target/AMDGPU/VOP3PInstructions.td   |  2 +-
 .../AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll  | 14 ++---
 llvm/test/MC/AMDGPU/bf16_imm.s| 63 +++
 llvm/test/MC/Disassembler/AMDGPU/bf16_imm.txt | 63 +++
 7 files changed, 139 insertions(+), 13 deletions(-)

diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl
index 7688dfa55a78e3..1ada16610d0b3a 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl
@@ -15,8 +15,8 @@ typedef unsigned short __attribute__((ext_vector_type(2))) 
ushort2;
 // CHECK-NEXT: [[s2:%[0-9]+]] = bitcast <2 x i16> %v2ssB to <2 x bfloat>
 // CHECK-NEXT: [[s3:%[0-9]+]] = bitcast i16 %sC to bfloat
 // CHECK-NEXT: [[d:%[0-9]+]] = tail call bfloat 
@llvm.amdgcn.fdot2.bf16.bf16(<2 x bfloat> [[s1]], <2 x bfloat> [[s2]], bfloat 
[[s3]])
-// CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> 
%v2ssB, float %fC, i1 false)
-// CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> 
%v2ssB, float %fC, i1 true)
+// CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> [[s1]], <2 x 
bfloat> [[s2]], float %fC, i1 false)
+// CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> [[s1]], <2 x 
bfloat> [[s2]], float %fC, i1 true)
 // CHECK: call i32 @llvm.amdgcn.udot4(i32 %uiA, i32 %uiB, i32 %uiC, i1 false)
 // CHECK: call i32 @llvm.amdgcn.udot4(i32 %uiA, i32 %uiB, i32 %uiC, i1 true)
 // CHECK: call i32 @llvm.amdgcn.sudot4(i1 true, i32 %A, i1 false, i32 %B, i32 
%C, i1 false)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 6795fb7aa0edb8..0f29653f1f5bec 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2835,8 +2835,8 @@ def int_amdgcn_fdot2_f32_bf16 :
   DefaultAttrsIntrinsic<
 [llvm_float_ty], // %r
 [
-  llvm_v2i16_ty, // %a
-  llvm_v2i16_ty, // %b
+  llvm_v2bf16_ty, // %a
+  llvm_v2bf16_ty, // %b
   llvm_float_ty, // %c
   llvm_i1_ty // %clamp
 ],
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td 
b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 140c99ff304143..cd14c12a8a80c6 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2494,7 +2494,7 @@ def VOP_V2I16_I32_I32 : VOPProfile <[v2i16, i32, i32, 
untyped]>;
 
 def VOP_F16_V2F16_V2F16_F16 : VOPProfile <[f16, v2f16, v2f16, f16]>;
 def VOP_BF16_V2BF16_V2BF16_BF16: VOPProfile <[bf16, v2bf16, v2bf16, bf16]>;
-def VOP_F32_V2I16_V2I16_F32 : VOPProfile <[f32, v2i16, v2i16, f32]>;
+def VOP_F32_V2BF16_V2BF16_F32 : VOPProfile <[f32, v2bf16, v2bf16, f32]>;
 
 def VOP_F32_V2F16_V2F16_V2F16 : VOPProfile <[f32, v2f16, v2f16, v2f16]>;
 
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td 
b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 886858b5ab1ad3..74f451b6d4f7fe 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -396,7 +396,7 @@ defm V_DOT8_I32_I4  : VOP3PInst<"v_dot8_i32_i4",
 } // End OtherPredicates = [HasDot1Insts]
 
 def DOT2_BF16_Profile
-  : VOP3P_Profile {
+  : VOP3P_Profile {
   let HasSrc1Mods = 1;
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
index 367ff57bae2fd6..e51b1d2da2e414 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | 
FileCheck %s --check-prefixes=GFX11
 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < 
%s | FileCheck %s --check-prefixes=GFX11
 
-declare float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %a, <2 x i16> %b, float 
%c, i1 %clamp)
+declare float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, 
float %c, i1 %clamp)
 
 define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_clamp(
 ; GFX11-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_clamp:
@@ -25,10 +25,10 @@ define amdgpu_kernel void 
@test_llvm_amdgcn_fdot2_f32_bf16_clamp(
 ptr addrspace(1) %b,
 ptr addrspace(1) %c) {
 entry:
-  %a.val = load <2 x i16>, ptr addrspace(1) %a
-  %b.val = load <2 x i16>, ptr addrspace(1) %b
+  %a.val = load <2 x bfloat>, ptr addrspace(1) %a
+  %b.val = load <2 x bfloat>, ptr addrspace(1) %b
   %

[clang] [llvm] [AMDGPU] Fix operand types for `V_DOT2_F32_BF16` (PR #82044)

2024-02-19 Thread Shilei Tian via cfe-commits

https://github.com/shiltian updated 
https://github.com/llvm/llvm-project/pull/82044

>From c78fe7c5e3de222539d6ac324fedf55e0d01d321 Mon Sep 17 00:00:00 2001
From: Shilei Tian 
Date: Mon, 19 Feb 2024 22:52:26 -0500
Subject: [PATCH] [AMDGPU] Fix operand types for `V_DOT2_F32_BF16`

---
 .../builtins-amdgcn-dl-insts-gfx11.cl  |  4 ++--
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td   |  4 ++--
 llvm/lib/Target/AMDGPU/SIInstrInfo.td  |  2 +-
 llvm/lib/Target/AMDGPU/VOP3PInstructions.td|  2 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll   | 14 +++---
 llvm/test/MC/AMDGPU/bf16_imm.s |  9 +
 llvm/test/MC/Disassembler/AMDGPU/bf16_imm.txt  |  6 ++
 7 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl
index 7688dfa55a78e3..1ada16610d0b3a 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl
@@ -15,8 +15,8 @@ typedef unsigned short __attribute__((ext_vector_type(2))) 
ushort2;
 // CHECK-NEXT: [[s2:%[0-9]+]] = bitcast <2 x i16> %v2ssB to <2 x bfloat>
 // CHECK-NEXT: [[s3:%[0-9]+]] = bitcast i16 %sC to bfloat
 // CHECK-NEXT: [[d:%[0-9]+]] = tail call bfloat 
@llvm.amdgcn.fdot2.bf16.bf16(<2 x bfloat> [[s1]], <2 x bfloat> [[s2]], bfloat 
[[s3]])
-// CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> 
%v2ssB, float %fC, i1 false)
-// CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> 
%v2ssB, float %fC, i1 true)
+// CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> [[s1]], <2 x 
bfloat> [[s2]], float %fC, i1 false)
+// CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> [[s1]], <2 x 
bfloat> [[s2]], float %fC, i1 true)
 // CHECK: call i32 @llvm.amdgcn.udot4(i32 %uiA, i32 %uiB, i32 %uiC, i1 false)
 // CHECK: call i32 @llvm.amdgcn.udot4(i32 %uiA, i32 %uiB, i32 %uiC, i1 true)
 // CHECK: call i32 @llvm.amdgcn.sudot4(i1 true, i32 %A, i1 false, i32 %B, i32 
%C, i1 false)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 6795fb7aa0edb8..0f29653f1f5bec 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2835,8 +2835,8 @@ def int_amdgcn_fdot2_f32_bf16 :
   DefaultAttrsIntrinsic<
 [llvm_float_ty], // %r
 [
-  llvm_v2i16_ty, // %a
-  llvm_v2i16_ty, // %b
+  llvm_v2bf16_ty, // %a
+  llvm_v2bf16_ty, // %b
   llvm_float_ty, // %c
   llvm_i1_ty // %clamp
 ],
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td 
b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 140c99ff304143..cd14c12a8a80c6 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2494,7 +2494,7 @@ def VOP_V2I16_I32_I32 : VOPProfile <[v2i16, i32, i32, 
untyped]>;
 
 def VOP_F16_V2F16_V2F16_F16 : VOPProfile <[f16, v2f16, v2f16, f16]>;
 def VOP_BF16_V2BF16_V2BF16_BF16: VOPProfile <[bf16, v2bf16, v2bf16, bf16]>;
-def VOP_F32_V2I16_V2I16_F32 : VOPProfile <[f32, v2i16, v2i16, f32]>;
+def VOP_F32_V2BF16_V2BF16_F32 : VOPProfile <[f32, v2bf16, v2bf16, f32]>;
 
 def VOP_F32_V2F16_V2F16_V2F16 : VOPProfile <[f32, v2f16, v2f16, v2f16]>;
 
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td 
b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 886858b5ab1ad3..74f451b6d4f7fe 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -396,7 +396,7 @@ defm V_DOT8_I32_I4  : VOP3PInst<"v_dot8_i32_i4",
 } // End OtherPredicates = [HasDot1Insts]
 
 def DOT2_BF16_Profile
-  : VOP3P_Profile {
+  : VOP3P_Profile {
   let HasSrc1Mods = 1;
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
index 367ff57bae2fd6..e51b1d2da2e414 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | 
FileCheck %s --check-prefixes=GFX11
 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < 
%s | FileCheck %s --check-prefixes=GFX11
 
-declare float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %a, <2 x i16> %b, float 
%c, i1 %clamp)
+declare float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, 
float %c, i1 %clamp)
 
 define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_clamp(
 ; GFX11-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_clamp:
@@ -25,10 +25,10 @@ define amdgpu_kernel void 
@test_llvm_amdgcn_fdot2_f32_bf16_clamp(
 ptr addrspace(1) %b,
 ptr addrspace(1) %c) {
 entry:
-  %a.val = load <2 x i16>, ptr addrspace(1) %a
-  %b.val = load <2 x i16>, ptr addrspace(1) %b
+  %a.val = load <2 x bfloat>, ptr addrspace(1) %a
+  %b.val = load <2 x bfloat>, 

[clang] [llvm] [AMDGPU] Fix operand types for `V_DOT2_F32_BF16` (PR #82044)

2024-02-16 Thread Shilei Tian via cfe-commits

https://github.com/shiltian updated 
https://github.com/llvm/llvm-project/pull/82044

>From b964fee8219e655d3c7df34cd01e5650ae357f6b Mon Sep 17 00:00:00 2001
From: Shilei Tian 
Date: Fri, 16 Feb 2024 17:49:23 -0500
Subject: [PATCH] [AMDGPU] Fix operand types for `V_DOT2_F32_BF16`

---
 .../builtins-amdgcn-dl-insts-gfx11.cl  |  4 ++--
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td   |  4 ++--
 llvm/lib/Target/AMDGPU/SIInstrInfo.td  |  2 +-
 llvm/lib/Target/AMDGPU/VOP3PInstructions.td|  2 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll   | 14 +++---
 llvm/test/MC/AMDGPU/bf16_imm.s |  9 +
 llvm/test/MC/Disassembler/AMDGPU/bf16_imm.txt  | 11 ++-
 7 files changed, 32 insertions(+), 14 deletions(-)

diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl
index 7688dfa55a78e3..1ada16610d0b3a 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-gfx11.cl
@@ -15,8 +15,8 @@ typedef unsigned short __attribute__((ext_vector_type(2))) 
ushort2;
 // CHECK-NEXT: [[s2:%[0-9]+]] = bitcast <2 x i16> %v2ssB to <2 x bfloat>
 // CHECK-NEXT: [[s3:%[0-9]+]] = bitcast i16 %sC to bfloat
 // CHECK-NEXT: [[d:%[0-9]+]] = tail call bfloat 
@llvm.amdgcn.fdot2.bf16.bf16(<2 x bfloat> [[s1]], <2 x bfloat> [[s2]], bfloat 
[[s3]])
-// CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> 
%v2ssB, float %fC, i1 false)
-// CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %v2ssA, <2 x i16> 
%v2ssB, float %fC, i1 true)
+// CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> [[s1]], <2 x 
bfloat> [[s2]], float %fC, i1 false)
+// CHECK: call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> [[s1]], <2 x 
bfloat> [[s2]], float %fC, i1 true)
 // CHECK: call i32 @llvm.amdgcn.udot4(i32 %uiA, i32 %uiB, i32 %uiC, i1 false)
 // CHECK: call i32 @llvm.amdgcn.udot4(i32 %uiA, i32 %uiB, i32 %uiC, i1 true)
 // CHECK: call i32 @llvm.amdgcn.sudot4(i1 true, i32 %A, i1 false, i32 %B, i32 
%C, i1 false)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 6795fb7aa0edb8..0f29653f1f5bec 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2835,8 +2835,8 @@ def int_amdgcn_fdot2_f32_bf16 :
   DefaultAttrsIntrinsic<
 [llvm_float_ty], // %r
 [
-  llvm_v2i16_ty, // %a
-  llvm_v2i16_ty, // %b
+  llvm_v2bf16_ty, // %a
+  llvm_v2bf16_ty, // %b
   llvm_float_ty, // %c
   llvm_i1_ty // %clamp
 ],
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td 
b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 140c99ff304143..cd14c12a8a80c6 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2494,7 +2494,7 @@ def VOP_V2I16_I32_I32 : VOPProfile <[v2i16, i32, i32, 
untyped]>;
 
 def VOP_F16_V2F16_V2F16_F16 : VOPProfile <[f16, v2f16, v2f16, f16]>;
 def VOP_BF16_V2BF16_V2BF16_BF16: VOPProfile <[bf16, v2bf16, v2bf16, bf16]>;
-def VOP_F32_V2I16_V2I16_F32 : VOPProfile <[f32, v2i16, v2i16, f32]>;
+def VOP_F32_V2BF16_V2BF16_F32 : VOPProfile <[f32, v2bf16, v2bf16, f32]>;
 
 def VOP_F32_V2F16_V2F16_V2F16 : VOPProfile <[f32, v2f16, v2f16, v2f16]>;
 
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td 
b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 886858b5ab1ad3..74f451b6d4f7fe 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -396,7 +396,7 @@ defm V_DOT8_I32_I4  : VOP3PInst<"v_dot8_i32_i4",
 } // End OtherPredicates = [HasDot1Insts]
 
 def DOT2_BF16_Profile
-  : VOP3P_Profile {
+  : VOP3P_Profile {
   let HasSrc1Mods = 1;
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
index 367ff57bae2fd6..e51b1d2da2e414 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | 
FileCheck %s --check-prefixes=GFX11
 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < 
%s | FileCheck %s --check-prefixes=GFX11
 
-declare float @llvm.amdgcn.fdot2.f32.bf16(<2 x i16> %a, <2 x i16> %b, float 
%c, i1 %clamp)
+declare float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, 
float %c, i1 %clamp)
 
 define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_clamp(
 ; GFX11-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_clamp:
@@ -25,10 +25,10 @@ define amdgpu_kernel void 
@test_llvm_amdgcn_fdot2_f32_bf16_clamp(
 ptr addrspace(1) %b,
 ptr addrspace(1) %c) {
 entry:
-  %a.val = load <2 x i16>, ptr addrspace(1) %a
-  %b.val = load <2 x i16>, ptr addrspace(1) %b
+  %a.val = load <2 x bfloat>, ptr addrspace(1) %a
+  %b.val = load <2 x bflo