[clang] [OpenCL] Fix an infinite loop in builidng AddrSpaceQualType (PR #92612)

2024-05-20 Thread Changpeng Fang via cfe-commits

https://github.com/changpeng closed 
https://github.com/llvm/llvm-project/pull/92612
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [OpenCL] Fix an infinite loop in builidng AddrSpaceQualType (PR #92612)

2024-05-18 Thread Changpeng Fang via cfe-commits


@@ -0,0 +1,25 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --version 4
+//RUN: %clang_cc1 %s -emit-llvm -O1 -o - | FileCheck %s

changpeng wrote:

add "triple spir", same as other tests in the same directory

https://github.com/llvm/llvm-project/pull/92612
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [OpenCL] Fix an infinite loop in builidng AddrSpaceQualType (PR #92612)

2024-05-18 Thread Changpeng Fang via cfe-commits

https://github.com/changpeng updated 
https://github.com/llvm/llvm-project/pull/92612

>From 2468a85a47499d90a99610846c632332eb7307b8 Mon Sep 17 00:00:00 2001
From: Changpeng Fang 
Date: Fri, 17 May 2024 15:13:07 -0700
Subject: [PATCH 1/3] [OpenCL] Fix an infinite loop in builidng
 AddrSpaceQualType

 In building AddrSpaceQualType 
(https://github.com/llvm/llvm-project/pull/90048),
there is a bug in removeAddrSpaceQualType() for arrays. Arrays are weird because
qualifiers on the element type also count as qualifiers on the type, so
getSingleStepDesugaredType() can't remove the sugar on arrays. This results
in an infinite loop in removeAddrSpaceQualType. To fix the issue,
we use ASTContext::getUnqualifiedArrayType, which strips the qualifier off
the element type, then reconstruct the array type.
---
 clang/lib/CodeGen/CGExprAgg.cpp   |  3 ++-
 .../array-type-infinite-loop.clcpp| 25 +++
 2 files changed, 27 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/CodeGenOpenCLCXX/array-type-infinite-loop.clcpp

diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp
index 6172eb9cdc1bb..53ce133e8cbc6 100644
--- a/clang/lib/CodeGen/CGExprAgg.cpp
+++ b/clang/lib/CodeGen/CGExprAgg.cpp
@@ -537,8 +537,9 @@ void AggExprEmitter::EmitArrayInit(Address DestPtr, 
llvm::ArrayType *AType,
   elementType.isTriviallyCopyableType(CGF.getContext())) {
 CodeGen::CodeGenModule  = CGF.CGM;
 ConstantEmitter Emitter(CGF);
+Qualifiers Quals;
 QualType GVArrayQTy = CGM.getContext().getAddrSpaceQualType(
-CGM.getContext().removeAddrSpaceQualType(ArrayQTy),
+CGM.getContext().getUnqualifiedArrayType(ArrayQTy, Quals),
 CGM.GetGlobalConstantAddressSpace());
 LangAS AS = GVArrayQTy.getAddressSpace();
 if (llvm::Constant *C =
diff --git a/clang/test/CodeGenOpenCLCXX/array-type-infinite-loop.clcpp 
b/clang/test/CodeGenOpenCLCXX/array-type-infinite-loop.clcpp
new file mode 100644
index 0..5a5b104e892f7
--- /dev/null
+++ b/clang/test/CodeGenOpenCLCXX/array-type-infinite-loop.clcpp
@@ -0,0 +1,25 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --version 4
+//RUN: %clang_cc1 %s -emit-llvm -O1 -o - | FileCheck %s
+
+// CHECK-LABEL: define dso_local spir_kernel void @test(
+// CHECK-SAME: ptr nocapture noundef readonly align 8 [[IN:%.*]], ptr 
nocapture noundef writeonly align 8 [[OUT:%.*]]) local_unnamed_addr 
#[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] 
!kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] 
!kernel_arg_base_type [[META5]] !kernel_arg_type_qual [[META6:![0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[IN]], 
i64 8
+// CHECK-NEXT:[[TMP0:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8, !tbaa 
[[TBAA7:![0-9]+]]
+// CHECK-NEXT:store i64 [[TMP0]], ptr [[OUT]], align 8, !tbaa [[TBAA7]]
+// CHECK-NEXT:ret void
+//
+__kernel void test(__global long *In, __global long *Out) {
+   long m[4] = {  In[0], In[1], 0, 0 };
+   *Out = m[1];
+}
+//.
+// CHECK: [[META3]] = !{i32 1, i32 1}
+// CHECK: [[META4]] = !{!"none", !"none"}
+// CHECK: [[META5]] = !{!"long*", !"long*"}
+// CHECK: [[META6]] = !{!"", !""}
+// CHECK: [[TBAA7]] = !{[[META8:![0-9]+]], [[META8]], i64 0}
+// CHECK: [[META8]] = !{!"long", [[META9:![0-9]+]], i64 0}
+// CHECK: [[META9]] = !{!"omnipotent char", [[META10:![0-9]+]], i64 0}
+// CHECK: [[META10]] = !{!"Simple C++ TBAA"}
+//.

>From 17ac766cdcbf22af685b89b9a054a22afb42f46e Mon Sep 17 00:00:00 2001
From: Changpeng Fang 
Date: Fri, 17 May 2024 18:20:06 -0700
Subject: [PATCH 2/3] [OpenCL] Fix an infinite loop in builidng
 AddrSpaceQualType

  Fix ASTContext::removeAddrSpaceQualType()
---
 clang/include/clang/AST/ASTContext.h | 2 +-
 clang/lib/AST/ASTContext.cpp | 9 -
 clang/lib/CodeGen/CGExprAgg.cpp  | 3 +--
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/clang/include/clang/AST/ASTContext.h 
b/clang/include/clang/AST/ASTContext.h
index e03b112194786..2ce2b810d3636 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -2611,7 +2611,7 @@ class ASTContext : public RefCountedBase {
   ///
   /// \returns if this is an array type, the completely unqualified array type
   /// that corresponds to it. Otherwise, returns T.getUnqualifiedType().
-  QualType getUnqualifiedArrayType(QualType T, Qualifiers );
+  QualType getUnqualifiedArrayType(QualType T, Qualifiers ) const;
 
   /// Determine whether the given types are equivalent after
   /// cvr-qualifiers have been removed.
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index 8fc2bb8c401c2..388233c554d46 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -3054,6 +3054,13 @@ QualType ASTContext::removeAddrSpaceQualType(QualType T) 
const {
   if (!T.hasAddressSpace())
 

[clang] [OpenCL] Fix an infinite loop in builidng AddrSpaceQualType (PR #92612)

2024-05-18 Thread Changpeng Fang via cfe-commits

https://github.com/changpeng edited 
https://github.com/llvm/llvm-project/pull/92612
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [OpenCL] Fix an infinite loop in builidng AddrSpaceQualType (PR #92612)

2024-05-18 Thread Changpeng Fang via cfe-commits


@@ -3054,6 +3054,13 @@ QualType ASTContext::removeAddrSpaceQualType(QualType T) 
const {
   if (!T.hasAddressSpace())
 return T;
 
+  // For arrays, strip the qualifier off the element type, then reconstruct the
+  // array type
+  if (T.getTypePtr()->isArrayType()) {
+Qualifiers Qualfs;
+return getUnqualifiedArrayType(T, Qualfs);

changpeng wrote:

Thanks.  Can I do as the following?
Note that I am passing QualifierCollector to getUnqualifiedArrayType, which has 
Qualifiers as the second argument.
 Also,  TypeNode = T.getTypePtr(); after I is unqualified.

 QualifierCollector Quals;
  const Type *TypeNode;
  if (T.getTypePtr()->isArrayType()) {
T = getUnqualifiedArrayType(T, Quals);
TypeNode = T.getTypePtr();
  } else {
  while (T.hasAddressSpace()) {
  

https://github.com/llvm/llvm-project/pull/92612
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [OpenCL] Fix an infinite loop in builidng AddrSpaceQualType (PR #92612)

2024-05-17 Thread Changpeng Fang via cfe-commits


@@ -537,8 +537,9 @@ void AggExprEmitter::EmitArrayInit(Address DestPtr, 
llvm::ArrayType *AType,
   elementType.isTriviallyCopyableType(CGF.getContext())) {
 CodeGen::CodeGenModule  = CGF.CGM;
 ConstantEmitter Emitter(CGF);
+Qualifiers Quals;
 QualType GVArrayQTy = CGM.getContext().getAddrSpaceQualType(
-CGM.getContext().removeAddrSpaceQualType(ArrayQTy),
+CGM.getContext().getUnqualifiedArrayType(ArrayQTy, Quals),

changpeng wrote:

> Something like that, yes.

Thanks. Updated. Should be closer!

https://github.com/llvm/llvm-project/pull/92612
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [OpenCL] Fix an infinite loop in builidng AddrSpaceQualType (PR #92612)

2024-05-17 Thread Changpeng Fang via cfe-commits

https://github.com/changpeng updated 
https://github.com/llvm/llvm-project/pull/92612

>From 2468a85a47499d90a99610846c632332eb7307b8 Mon Sep 17 00:00:00 2001
From: Changpeng Fang 
Date: Fri, 17 May 2024 15:13:07 -0700
Subject: [PATCH 1/2] [OpenCL] Fix an infinite loop in builidng
 AddrSpaceQualType

 In building AddrSpaceQualType 
(https://github.com/llvm/llvm-project/pull/90048),
there is a bug in removeAddrSpaceQualType() for arrays. Arrays are weird because
qualifiers on the element type also count as qualifiers on the type, so
getSingleStepDesugaredType() can't remove the sugar on arrays. This results
in an infinite loop in removeAddrSpaceQualType. To fix the issue,
we use ASTContext::getUnqualifiedArrayType, which strips the qualifier off
the element type, then reconstruct the array type.
---
 clang/lib/CodeGen/CGExprAgg.cpp   |  3 ++-
 .../array-type-infinite-loop.clcpp| 25 +++
 2 files changed, 27 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/CodeGenOpenCLCXX/array-type-infinite-loop.clcpp

diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp
index 6172eb9cdc1bb..53ce133e8cbc6 100644
--- a/clang/lib/CodeGen/CGExprAgg.cpp
+++ b/clang/lib/CodeGen/CGExprAgg.cpp
@@ -537,8 +537,9 @@ void AggExprEmitter::EmitArrayInit(Address DestPtr, 
llvm::ArrayType *AType,
   elementType.isTriviallyCopyableType(CGF.getContext())) {
 CodeGen::CodeGenModule  = CGF.CGM;
 ConstantEmitter Emitter(CGF);
+Qualifiers Quals;
 QualType GVArrayQTy = CGM.getContext().getAddrSpaceQualType(
-CGM.getContext().removeAddrSpaceQualType(ArrayQTy),
+CGM.getContext().getUnqualifiedArrayType(ArrayQTy, Quals),
 CGM.GetGlobalConstantAddressSpace());
 LangAS AS = GVArrayQTy.getAddressSpace();
 if (llvm::Constant *C =
diff --git a/clang/test/CodeGenOpenCLCXX/array-type-infinite-loop.clcpp 
b/clang/test/CodeGenOpenCLCXX/array-type-infinite-loop.clcpp
new file mode 100644
index 0..5a5b104e892f7
--- /dev/null
+++ b/clang/test/CodeGenOpenCLCXX/array-type-infinite-loop.clcpp
@@ -0,0 +1,25 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --version 4
+//RUN: %clang_cc1 %s -emit-llvm -O1 -o - | FileCheck %s
+
+// CHECK-LABEL: define dso_local spir_kernel void @test(
+// CHECK-SAME: ptr nocapture noundef readonly align 8 [[IN:%.*]], ptr 
nocapture noundef writeonly align 8 [[OUT:%.*]]) local_unnamed_addr 
#[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] 
!kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] 
!kernel_arg_base_type [[META5]] !kernel_arg_type_qual [[META6:![0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[IN]], 
i64 8
+// CHECK-NEXT:[[TMP0:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8, !tbaa 
[[TBAA7:![0-9]+]]
+// CHECK-NEXT:store i64 [[TMP0]], ptr [[OUT]], align 8, !tbaa [[TBAA7]]
+// CHECK-NEXT:ret void
+//
+__kernel void test(__global long *In, __global long *Out) {
+   long m[4] = {  In[0], In[1], 0, 0 };
+   *Out = m[1];
+}
+//.
+// CHECK: [[META3]] = !{i32 1, i32 1}
+// CHECK: [[META4]] = !{!"none", !"none"}
+// CHECK: [[META5]] = !{!"long*", !"long*"}
+// CHECK: [[META6]] = !{!"", !""}
+// CHECK: [[TBAA7]] = !{[[META8:![0-9]+]], [[META8]], i64 0}
+// CHECK: [[META8]] = !{!"long", [[META9:![0-9]+]], i64 0}
+// CHECK: [[META9]] = !{!"omnipotent char", [[META10:![0-9]+]], i64 0}
+// CHECK: [[META10]] = !{!"Simple C++ TBAA"}
+//.

>From 17ac766cdcbf22af685b89b9a054a22afb42f46e Mon Sep 17 00:00:00 2001
From: Changpeng Fang 
Date: Fri, 17 May 2024 18:20:06 -0700
Subject: [PATCH 2/2] [OpenCL] Fix an infinite loop in builidng
 AddrSpaceQualType

  Fix ASTContext::removeAddrSpaceQualType()
---
 clang/include/clang/AST/ASTContext.h | 2 +-
 clang/lib/AST/ASTContext.cpp | 9 -
 clang/lib/CodeGen/CGExprAgg.cpp  | 3 +--
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/clang/include/clang/AST/ASTContext.h 
b/clang/include/clang/AST/ASTContext.h
index e03b112194786..2ce2b810d3636 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -2611,7 +2611,7 @@ class ASTContext : public RefCountedBase {
   ///
   /// \returns if this is an array type, the completely unqualified array type
   /// that corresponds to it. Otherwise, returns T.getUnqualifiedType().
-  QualType getUnqualifiedArrayType(QualType T, Qualifiers );
+  QualType getUnqualifiedArrayType(QualType T, Qualifiers ) const;
 
   /// Determine whether the given types are equivalent after
   /// cvr-qualifiers have been removed.
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index 8fc2bb8c401c2..388233c554d46 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -3054,6 +3054,13 @@ QualType ASTContext::removeAddrSpaceQualType(QualType T) 
const {
   if (!T.hasAddressSpace())
 

[clang] [OpenCL] Fix an infinite loop in builidng AddrSpaceQualType (PR #92612)

2024-05-17 Thread Changpeng Fang via cfe-commits


@@ -537,8 +537,9 @@ void AggExprEmitter::EmitArrayInit(Address DestPtr, 
llvm::ArrayType *AType,
   elementType.isTriviallyCopyableType(CGF.getContext())) {
 CodeGen::CodeGenModule  = CGF.CGM;
 ConstantEmitter Emitter(CGF);
+Qualifiers Quals;
 QualType GVArrayQTy = CGM.getContext().getAddrSpaceQualType(
-CGM.getContext().removeAddrSpaceQualType(ArrayQTy),
+CGM.getContext().getUnqualifiedArrayType(ArrayQTy, Quals),

changpeng wrote:

Do you mean we should actually fix removeAddrSpaceQualType? Somewhere inside 
removeAddrSpaceQualType, we
should use getUnqualifiedArrayType if it is an arrayType, and 
getSingleStepDesugaredType othereise?
I have to admit that I have no experience in this field, so I am relying on you 
and @svenvh to move on for a reasonable fix. Thanks.

https://github.com/llvm/llvm-project/pull/92612
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [OpenCL] Put constant initializer globals into constant addrspace (PR #90048)

2024-05-17 Thread Changpeng Fang via cfe-commits


@@ -535,20 +535,23 @@ void AggExprEmitter::EmitArrayInit(Address DestPtr, 
llvm::ArrayType *AType,
   elementType.isTriviallyCopyableType(CGF.getContext())) {
 CodeGen::CodeGenModule  = CGF.CGM;
 ConstantEmitter Emitter(CGF);
-LangAS AS = ArrayQTy.getAddressSpace();
+QualType GVArrayQTy = CGM.getContext().getAddrSpaceQualType(
+CGM.getContext().removeAddrSpaceQualType(ArrayQTy),

changpeng wrote:

> I think it's a bug in removeAddrSpaceQualType(): it needs to special-case 
> arrays. Arrays are weird because qualifiers on the element type also count as 
> qualifiers on the type, so getSingleStepDesugaredType() can't remove the 
> sugar on arrays. So it needs to strip the qualifier off the element type, 
> then reconstruct the array type. Maybe it can use 
> ASTContext::getUnqualifiedArrayType.

Thanks for the suggestion. I drafted a fix:
https://github.com/llvm/llvm-project/pull/92612

https://github.com/llvm/llvm-project/pull/90048
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [OpenCL] Fix an infinite loop in builidng AddrSpaceQualType (PR #92612)

2024-05-17 Thread Changpeng Fang via cfe-commits

https://github.com/changpeng created 
https://github.com/llvm/llvm-project/pull/92612

 In building AddrSpaceQualType 
(https://github.com/llvm/llvm-project/pull/90048), there is a bug in 
removeAddrSpaceQualType() for arrays. Arrays are weird because qualifiers on 
the element type also count as qualifiers on the type, so 
getSingleStepDesugaredType() can't remove the sugar on arrays. This results in 
an infinite loop in removeAddrSpaceQualType. To fix the issue, we use 
ASTContext::getUnqualifiedArrayType instead, which strips the qualifier off the 
element type, then reconstruct the array type.

>From 2468a85a47499d90a99610846c632332eb7307b8 Mon Sep 17 00:00:00 2001
From: Changpeng Fang 
Date: Fri, 17 May 2024 15:13:07 -0700
Subject: [PATCH] [OpenCL] Fix an infinite loop in builidng AddrSpaceQualType

 In building AddrSpaceQualType 
(https://github.com/llvm/llvm-project/pull/90048),
there is a bug in removeAddrSpaceQualType() for arrays. Arrays are weird because
qualifiers on the element type also count as qualifiers on the type, so
getSingleStepDesugaredType() can't remove the sugar on arrays. This results
in an infinite loop in removeAddrSpaceQualType. To fix the issue,
we use ASTContext::getUnqualifiedArrayType, which strips the qualifier off
the element type, then reconstruct the array type.
---
 clang/lib/CodeGen/CGExprAgg.cpp   |  3 ++-
 .../array-type-infinite-loop.clcpp| 25 +++
 2 files changed, 27 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/CodeGenOpenCLCXX/array-type-infinite-loop.clcpp

diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp
index 6172eb9cdc1bb..53ce133e8cbc6 100644
--- a/clang/lib/CodeGen/CGExprAgg.cpp
+++ b/clang/lib/CodeGen/CGExprAgg.cpp
@@ -537,8 +537,9 @@ void AggExprEmitter::EmitArrayInit(Address DestPtr, 
llvm::ArrayType *AType,
   elementType.isTriviallyCopyableType(CGF.getContext())) {
 CodeGen::CodeGenModule  = CGF.CGM;
 ConstantEmitter Emitter(CGF);
+Qualifiers Quals;
 QualType GVArrayQTy = CGM.getContext().getAddrSpaceQualType(
-CGM.getContext().removeAddrSpaceQualType(ArrayQTy),
+CGM.getContext().getUnqualifiedArrayType(ArrayQTy, Quals),
 CGM.GetGlobalConstantAddressSpace());
 LangAS AS = GVArrayQTy.getAddressSpace();
 if (llvm::Constant *C =
diff --git a/clang/test/CodeGenOpenCLCXX/array-type-infinite-loop.clcpp 
b/clang/test/CodeGenOpenCLCXX/array-type-infinite-loop.clcpp
new file mode 100644
index 0..5a5b104e892f7
--- /dev/null
+++ b/clang/test/CodeGenOpenCLCXX/array-type-infinite-loop.clcpp
@@ -0,0 +1,25 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --version 4
+//RUN: %clang_cc1 %s -emit-llvm -O1 -o - | FileCheck %s
+
+// CHECK-LABEL: define dso_local spir_kernel void @test(
+// CHECK-SAME: ptr nocapture noundef readonly align 8 [[IN:%.*]], ptr 
nocapture noundef writeonly align 8 [[OUT:%.*]]) local_unnamed_addr 
#[[ATTR0:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] 
!kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] 
!kernel_arg_base_type [[META5]] !kernel_arg_type_qual [[META6:![0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:[[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[IN]], 
i64 8
+// CHECK-NEXT:[[TMP0:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8, !tbaa 
[[TBAA7:![0-9]+]]
+// CHECK-NEXT:store i64 [[TMP0]], ptr [[OUT]], align 8, !tbaa [[TBAA7]]
+// CHECK-NEXT:ret void
+//
+__kernel void test(__global long *In, __global long *Out) {
+   long m[4] = {  In[0], In[1], 0, 0 };
+   *Out = m[1];
+}
+//.
+// CHECK: [[META3]] = !{i32 1, i32 1}
+// CHECK: [[META4]] = !{!"none", !"none"}
+// CHECK: [[META5]] = !{!"long*", !"long*"}
+// CHECK: [[META6]] = !{!"", !""}
+// CHECK: [[TBAA7]] = !{[[META8:![0-9]+]], [[META8]], i64 0}
+// CHECK: [[META8]] = !{!"long", [[META9:![0-9]+]], i64 0}
+// CHECK: [[META9]] = !{!"omnipotent char", [[META10:![0-9]+]], i64 0}
+// CHECK: [[META10]] = !{!"Simple C++ TBAA"}
+//.

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [OpenCL] Put constant initializer globals into constant addrspace (PR #90048)

2024-05-17 Thread Changpeng Fang via cfe-commits


@@ -535,20 +535,23 @@ void AggExprEmitter::EmitArrayInit(Address DestPtr, 
llvm::ArrayType *AType,
   elementType.isTriviallyCopyableType(CGF.getContext())) {
 CodeGen::CodeGenModule  = CGF.CGM;
 ConstantEmitter Emitter(CGF);
-LangAS AS = ArrayQTy.getAddressSpace();
+QualType GVArrayQTy = CGM.getContext().getAddrSpaceQualType(
+CGM.getContext().removeAddrSpaceQualType(ArrayQTy),

changpeng wrote:

Reduced further:

clang -c -Xclang -emit-llvm -O0 test.clcpp

__kernel void test(__global ulong *In, __global ulong *Out) {
   ulong m[4] = {  In[0], In[1], 0, 0 };

   *Out = m[1];  
}

https://github.com/llvm/llvm-project/pull/90048
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [OpenCL] Put constant initializer globals into constant addrspace (PR #90048)

2024-05-17 Thread Changpeng Fang via cfe-commits

changpeng wrote:

[test.cl.txt](https://github.com/llvm/llvm-project/files/15355457/test.cl.txt)


https://github.com/llvm/llvm-project/pull/90048
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [OpenCL] Put constant initializer globals into constant addrspace (PR #90048)

2024-05-17 Thread Changpeng Fang via cfe-commits


@@ -535,20 +535,23 @@ void AggExprEmitter::EmitArrayInit(Address DestPtr, 
llvm::ArrayType *AType,
   elementType.isTriviallyCopyableType(CGF.getContext())) {
 CodeGen::CodeGenModule  = CGF.CGM;
 ConstantEmitter Emitter(CGF);
-LangAS AS = ArrayQTy.getAddressSpace();
+QualType GVArrayQTy = CGM.getContext().getAddrSpaceQualType(
+CGM.getContext().removeAddrSpaceQualType(ArrayQTy),

changpeng wrote:

> @changpeng would you be able to provide an input source that demonstrates the 
> issue?

Hi, @svenvh : I attached test.cl.txt here which is the dumped opencl source 
file. Unfortunately I do not know exactly how to reproduce the infinite loop 
offline with this source. I extracted out the following simplified kernel which 
can reproduce the hang with

clang -c -Xclang -emit-llvm -O0 test.clcpp

__kernel void nonceGrind(__global ulong *headerIn, __global ulong *nonceOut) {
   ulong m[16] = {headerIn[0], headerIn[1], 
 
   headerIn[2], headerIn[3],
 
   0, headerIn[5],  
   
   headerIn[6], headerIn[7],
 
   headerIn[8], headerIn[9], 0, 0, 0, 0, 0, 0 };


   *nonceOut = m[4];  
}

However, I am afraid it may not fully represent the original issue. This is 
because after I break out the loop  in 
ASTContext::removeAddrSpaceQualType, I am seeing the following assert:

clang: /home/chfang/llvm-project/clang/include/clang/AST/Type.h:677: void 
clang::Qualifiers::addConsistentQualifiers(Qualifiers): Assertion 
`getAddressSpace() == qs.getAddressSpace() || !hasAddressSpace() || 
!qs.hasAddressSpace()' failed.

Hopefully the information is useful, and you are able to help. Thanks.

https://github.com/llvm/llvm-project/pull/90048
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [OpenCL] Put constant initializer globals into constant addrspace (PR #90048)

2024-05-17 Thread Changpeng Fang via cfe-commits


@@ -535,20 +535,23 @@ void AggExprEmitter::EmitArrayInit(Address DestPtr, 
llvm::ArrayType *AType,
   elementType.isTriviallyCopyableType(CGF.getContext())) {
 CodeGen::CodeGenModule  = CGF.CGM;
 ConstantEmitter Emitter(CGF);
-LangAS AS = ArrayQTy.getAddressSpace();
+QualType GVArrayQTy = CGM.getContext().getAddrSpaceQualType(
+CGM.getContext().removeAddrSpaceQualType(ArrayQTy),

changpeng wrote:

> I think it's a bug in removeAddrSpaceQualType(): it needs to special-case 
> arrays. Arrays are weird because qualifiers on the element type also count as 
> qualifiers on the type, so getSingleStepDesugaredType() can't remove the 
> sugar on arrays. So it needs to strip the qualifier off the element type, 
> then reconstruct the array type. Maybe it can use ASTC
getSingleStepDesugaredType
Yes, the issue is in removeAddrSpaceQualType(ArrayQTy),  And 
getSingleStepDesugaredType can not remove "Sugar".

https://github.com/llvm/llvm-project/pull/90048
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [OpenCL] Put constant initializer globals into constant addrspace (PR #90048)

2024-05-16 Thread Changpeng Fang via cfe-commits


@@ -535,20 +535,23 @@ void AggExprEmitter::EmitArrayInit(Address DestPtr, 
llvm::ArrayType *AType,
   elementType.isTriviallyCopyableType(CGF.getContext())) {
 CodeGen::CodeGenModule  = CGF.CGM;
 ConstantEmitter Emitter(CGF);
-LangAS AS = ArrayQTy.getAddressSpace();
+QualType GVArrayQTy = CGM.getContext().getAddrSpaceQualType(
+CGM.getContext().removeAddrSpaceQualType(ArrayQTy),

changpeng wrote:

We saw a regression caused by this PR. It is a soft hang in 
CGM.getContext().removeAddrSpaceQualType.
Specifically it is in the following while loop:
while (T.hasAddressSpace()) {
TypeNode = Quals.strip(T);

// If the type no longer has an address space after stripping qualifiers,
// jump out.
if (!QualType(TypeNode, 0).hasAddressSpace())
  break;

// There might be sugar in the way. Strip it and try again.
T = T.getSingleStepDesugaredType(*this);
  }
We found that "T == T.getSingleStepDesugaredType(*this);" and this it could not 
proceed.

I am not sure whether we should break out this loop when "T == 
T.getSingleStepDesugaredType(*this)"
or something else is wrong that we should never see such case.

Here is the dump of T:
ConstantArrayType 0x65b40640 '__private ulong[16]' 16
`-QualType 0x65b403f8 '__private ulong' __private
  `-ElaboratedType 0x65b3ff40 'ulong' sugar imported
`-TypedefType 0x65b3fef0 'ulong' sugar imported
  |-Typedef 0x65b3fe80 'ulong'
  `-BuiltinType 0x6583f430 'unsigned long'

https://github.com/llvm/llvm-project/pull/90048
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] AMDGPU: Simplify EmitAMDGPUBuiltinExpr for load transposes, NFC (PR #86707)

2024-03-26 Thread Changpeng Fang via cfe-commits

https://github.com/changpeng closed 
https://github.com/llvm/llvm-project/pull/86707
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] AMDGPU: Simplify EmitAMDGPUBuiltinExpr for load transposes, NFC (PR #86707)

2024-03-26 Thread Changpeng Fang via cfe-commits

https://github.com/changpeng updated 
https://github.com/llvm/llvm-project/pull/86707

>From 485dff66813104ad73d8eada7cd7d43edf9d093d Mon Sep 17 00:00:00 2001
From: Changpeng Fang 
Date: Tue, 26 Mar 2024 11:06:48 -0700
Subject: [PATCH 1/3] AMDGPU: Simplify EmitAMDGPUBuiltinExpr for load
 transposes, NFC

  We should not manually get the types of the loading data.
Instead, we can get the types from the intrinsics directly.
---
 clang/lib/CodeGen/CGBuiltin.cpp | 12 ++--
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 46a815155e7b87..d05ea9f626ba86 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18544,31 +18544,23 @@ Value 
*CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16: {
 
 Intrinsic::ID IID;
-llvm::Type *ArgTy;
 switch (BuiltinID) {
 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
-  ArgTy = llvm::Type::getInt32Ty(getLLVMContext());
   IID = Intrinsic::amdgcn_global_load_tr_b64;
   break;
 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
-  ArgTy = llvm::FixedVectorType::get(
-  llvm::Type::getInt32Ty(getLLVMContext()), 2);
   IID = Intrinsic::amdgcn_global_load_tr_b64;
   break;
 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
-  ArgTy = llvm::FixedVectorType::get(
-  llvm::Type::getInt16Ty(getLLVMContext()), 4);
   IID = Intrinsic::amdgcn_global_load_tr_b128;
   break;
 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
-  ArgTy = llvm::FixedVectorType::get(
-  llvm::Type::getInt16Ty(getLLVMContext()), 8);
   IID = Intrinsic::amdgcn_global_load_tr_b128;
   break;
 }
-
+llvm::Type *LoadTy = ConvertType(E->getType());
 llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
-llvm::Function *F = CGM.getIntrinsic(IID, {ArgTy});
+llvm::Function *F = CGM.getIntrinsic(IID, {});
 return Builder.CreateCall(F, {Addr});
   }
   case AMDGPU::BI__builtin_amdgcn_get_fpenv: {

>From 2375ed4269c4feedf5767383838bb6c5d4cfd80c Mon Sep 17 00:00:00 2001
From: Changpeng Fang 
Date: Tue, 26 Mar 2024 11:17:40 -0700
Subject: [PATCH 2/3] AMDGPU: Simplify EmitAMDGPUBuiltinExpr for load
 transposes, NFC

---
 clang/lib/CodeGen/CGBuiltin.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index d05ea9f626ba86..d25d79d085a8eb 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18560,7 +18560,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 }
 llvm::Type *LoadTy = ConvertType(E->getType());
 llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
-llvm::Function *F = CGM.getIntrinsic(IID, {});
+llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy});
 return Builder.CreateCall(F, {Addr});
   }
   case AMDGPU::BI__builtin_amdgcn_get_fpenv: {

>From 6aeb3debd25b5fe2ca3191d51f1d0589a9ce24da Mon Sep 17 00:00:00 2001
From: Changpeng Fang 
Date: Tue, 26 Mar 2024 14:12:50 -0700
Subject: [PATCH 3/3] AMDGPU: Simplify EmitAMDGPUBuiltinExpr for load
 transposes, NFC

---
 clang/lib/CodeGen/CGBuiltin.cpp | 4 
 1 file changed, 4 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index d25d79d085a8eb..3cfdb261a0eac0 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18546,14 +18546,10 @@ Value 
*CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
 Intrinsic::ID IID;
 switch (BuiltinID) {
 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
-  IID = Intrinsic::amdgcn_global_load_tr_b64;
-  break;
 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
   IID = Intrinsic::amdgcn_global_load_tr_b64;
   break;
 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
-  IID = Intrinsic::amdgcn_global_load_tr_b128;
-  break;
 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
   IID = Intrinsic::amdgcn_global_load_tr_b128;
   break;

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] AMDGPU: Simplify EmitAMDGPUBuiltinExpr for load transposes, NFC (PR #86707)

2024-03-26 Thread Changpeng Fang via cfe-commits

https://github.com/changpeng updated 
https://github.com/llvm/llvm-project/pull/86707

>From 485dff66813104ad73d8eada7cd7d43edf9d093d Mon Sep 17 00:00:00 2001
From: Changpeng Fang 
Date: Tue, 26 Mar 2024 11:06:48 -0700
Subject: [PATCH 1/2] AMDGPU: Simplify EmitAMDGPUBuiltinExpr for load
 transposes, NFC

  We should not manually get the types of the loading data.
Instead, we can get the types from the intrinsics directly.
---
 clang/lib/CodeGen/CGBuiltin.cpp | 12 ++--
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 46a815155e7b87..d05ea9f626ba86 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18544,31 +18544,23 @@ Value 
*CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16: {
 
 Intrinsic::ID IID;
-llvm::Type *ArgTy;
 switch (BuiltinID) {
 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
-  ArgTy = llvm::Type::getInt32Ty(getLLVMContext());
   IID = Intrinsic::amdgcn_global_load_tr_b64;
   break;
 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
-  ArgTy = llvm::FixedVectorType::get(
-  llvm::Type::getInt32Ty(getLLVMContext()), 2);
   IID = Intrinsic::amdgcn_global_load_tr_b64;
   break;
 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
-  ArgTy = llvm::FixedVectorType::get(
-  llvm::Type::getInt16Ty(getLLVMContext()), 4);
   IID = Intrinsic::amdgcn_global_load_tr_b128;
   break;
 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
-  ArgTy = llvm::FixedVectorType::get(
-  llvm::Type::getInt16Ty(getLLVMContext()), 8);
   IID = Intrinsic::amdgcn_global_load_tr_b128;
   break;
 }
-
+llvm::Type *LoadTy = ConvertType(E->getType());
 llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
-llvm::Function *F = CGM.getIntrinsic(IID, {ArgTy});
+llvm::Function *F = CGM.getIntrinsic(IID, {});
 return Builder.CreateCall(F, {Addr});
   }
   case AMDGPU::BI__builtin_amdgcn_get_fpenv: {

>From 2375ed4269c4feedf5767383838bb6c5d4cfd80c Mon Sep 17 00:00:00 2001
From: Changpeng Fang 
Date: Tue, 26 Mar 2024 11:17:40 -0700
Subject: [PATCH 2/2] AMDGPU: Simplify EmitAMDGPUBuiltinExpr for load
 transposes, NFC

---
 clang/lib/CodeGen/CGBuiltin.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index d05ea9f626ba86..d25d79d085a8eb 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18560,7 +18560,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 }
 llvm::Type *LoadTy = ConvertType(E->getType());
 llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
-llvm::Function *F = CGM.getIntrinsic(IID, {});
+llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy});
 return Builder.CreateCall(F, {Addr});
   }
   case AMDGPU::BI__builtin_amdgcn_get_fpenv: {

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] AMDGPU: Simplify EmitAMDGPUBuiltinExpr for load transposes, NFC (PR #86707)

2024-03-26 Thread Changpeng Fang via cfe-commits

https://github.com/changpeng created 
https://github.com/llvm/llvm-project/pull/86707

  We should not manually get the types of the loading data.
Instead, we can get the types from the intrinsics directly.

>From 485dff66813104ad73d8eada7cd7d43edf9d093d Mon Sep 17 00:00:00 2001
From: Changpeng Fang 
Date: Tue, 26 Mar 2024 11:06:48 -0700
Subject: [PATCH] AMDGPU: Simplify EmitAMDGPUBuiltinExpr for load transposes,
 NFC

  We should not manually get the types of the loading data.
Instead, we can get the types from the intrinsics directly.
---
 clang/lib/CodeGen/CGBuiltin.cpp | 12 ++--
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 46a815155e7b87..d05ea9f626ba86 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18544,31 +18544,23 @@ Value 
*CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16: {
 
 Intrinsic::ID IID;
-llvm::Type *ArgTy;
 switch (BuiltinID) {
 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
-  ArgTy = llvm::Type::getInt32Ty(getLLVMContext());
   IID = Intrinsic::amdgcn_global_load_tr_b64;
   break;
 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
-  ArgTy = llvm::FixedVectorType::get(
-  llvm::Type::getInt32Ty(getLLVMContext()), 2);
   IID = Intrinsic::amdgcn_global_load_tr_b64;
   break;
 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
-  ArgTy = llvm::FixedVectorType::get(
-  llvm::Type::getInt16Ty(getLLVMContext()), 4);
   IID = Intrinsic::amdgcn_global_load_tr_b128;
   break;
 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
-  ArgTy = llvm::FixedVectorType::get(
-  llvm::Type::getInt16Ty(getLLVMContext()), 8);
   IID = Intrinsic::amdgcn_global_load_tr_b128;
   break;
 }
-
+llvm::Type *LoadTy = ConvertType(E->getType());
 llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
-llvm::Function *F = CGM.getIntrinsic(IID, {ArgTy});
+llvm::Function *F = CGM.getIntrinsic(IID, {});
 return Builder.CreateCall(F, {Addr});
   }
   case AMDGPU::BI__builtin_amdgcn_get_fpenv: {

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] AMDGPU: Rename intrinsics and remove f16/bf16 versions for load transpose (PR #86313)

2024-03-25 Thread Changpeng Fang via cfe-commits

https://github.com/changpeng edited 
https://github.com/llvm/llvm-project/pull/86313
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] AMDGPU: Rename intrinsics and remove f16/bf16 versions for load transpose (PR #86313)

2024-03-25 Thread Changpeng Fang via cfe-commits


@@ -18533,51 +18533,35 @@ Value 
*CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   }
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
-  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
-  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
-  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
-  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16: {
 

changpeng wrote:

Maybe. People may have good reason to place it there, and this patch should not 
touch it.

https://github.com/llvm/llvm-project/pull/86313
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] AMDGPU: Rename intrinsics and remove f16/bf16 versions for load transpose (PR #86313)

2024-03-25 Thread Changpeng Fang via cfe-commits

changpeng wrote:

ping

https://github.com/llvm/llvm-project/pull/86313
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] AMDGPU: Rename intrinsics and remove f16/bf16 versions for load transpose (PR #86313)

2024-03-22 Thread Changpeng Fang via cfe-commits

changpeng wrote:

> There is no issue in changing the names in principle. Curious, what is the 
> rationale to use more demangled names?

more user friendly.

https://github.com/llvm/llvm-project/pull/86313
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] AMDGPU: Rename intrinsics and remove f16/bf16 versions for load transpose (PR #86313)

2024-03-22 Thread Changpeng Fang via cfe-commits

changpeng wrote:

> > global_load_re_b64
> 
> Type global_load_re_b64.

Changed! Thanks.

https://github.com/llvm/llvm-project/pull/86313
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] AMDGPU: Rename intrinsics and remove f16/bf16 versions for load transpose (PR #86313)

2024-03-22 Thread Changpeng Fang via cfe-commits

https://github.com/changpeng edited 
https://github.com/llvm/llvm-project/pull/86313
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] AMDGPU: Rename intrinsics and remove f16/bf16 versions for load transpose (PR #86313)

2024-03-22 Thread Changpeng Fang via cfe-commits

https://github.com/changpeng created 
https://github.com/llvm/llvm-project/pull/86313

  Rename the intrinsics to close to the instruction mnemonic names:
Use global_load_re_b64 and global_load_tr_b128 instead of global_load_tr.

  This patch also removes f16/bf16 versions of builtins/intrinsics. To simplify 
the design, we should avoid enumerating all possible types in implementing 
builtins. We can always use bitcast.

>From 79fd7cf6eee74d4485a215e47ddd8349b126f2f4 Mon Sep 17 00:00:00 2001
From: Changpeng Fang 
Date: Fri, 22 Mar 2024 10:06:02 -0700
Subject: [PATCH] AMDGPU: Rename intrinsics and remove f16/bf16 versions for
 load transpose

  Rename the intrinsics to close to the instruction mnemonic names:
Use global_load_re_b64 and global_load_tr_b128 instead of global_load_tr.

  This patch also removes f16/bf16 versions of builtins/intrinsics. To simplify
the design, we should avoid enumerating all possible types in implementing
builtins. We can always use bitcast.
---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |   5 -
 clang/lib/CodeGen/CGBuiltin.cpp   |  28 +---
 ...uiltins-amdgcn-global-load-tr-gfx11-err.cl |  16 +-
 ...ins-amdgcn-global-load-tr-gfx12-w32-err.cl |   6 +-
 ...ins-amdgcn-global-load-tr-gfx12-w64-err.cl |   6 +-
 .../builtins-amdgcn-global-load-tr-w32.cl |  26 +---
 .../builtins-amdgcn-global-load-tr-w64.cl |  26 +---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td  |  15 +-
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   3 +-
 .../Target/AMDGPU/AMDGPUSearchableTables.td   |   3 +-
 llvm/lib/Target/AMDGPU/FLATInstructions.td|  12 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp |   6 +-
 .../UniformityAnalysis/AMDGPU/intrinsics.ll   |  70 ++---
 .../AMDGPU/llvm.amdgcn.global.load.tr-w32.ll  | 146 --
 .../AMDGPU/llvm.amdgcn.global.load.tr-w64.ll  | 146 --
 15 files changed, 104 insertions(+), 410 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 4153b316c22b1d..c660582cc98e66 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -434,13 +434,8 @@ TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, 
"Uii", "n", "gfx12-insts")
 
 TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_v2i32, "V2iV2i*1", "nc", 
"gfx12-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8i16, "V8sV8s*1", "nc", 
"gfx12-insts,wavefrontsize32")
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8f16, "V8hV8h*1", "nc", 
"gfx12-insts,wavefrontsize32")
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8bf16, "V8yV8y*1", "nc", 
"gfx12-insts,wavefrontsize32")
-
 TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_i32, "ii*1", "nc", 
"gfx12-insts,wavefrontsize64")
 TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4i16, "V4sV4s*1", "nc", 
"gfx12-insts,wavefrontsize64")
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4f16, "V4hV4h*1", "nc", 
"gfx12-insts,wavefrontsize64")
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4bf16, "V4yV4y*1", "nc", 
"gfx12-insts,wavefrontsize64")
 
 
//===--===//
 // WMMA builtins.
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 2eaceeba617700..e476234b1379ab 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18533,51 +18533,35 @@ Value 
*CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   }
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
-  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
-  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
-  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
-  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
   case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16: {
 
+Intrinsic::ID IID;
 llvm::Type *ArgTy;
 switch (BuiltinID) {
 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
   ArgTy = llvm::Type::getInt32Ty(getLLVMContext());
+  IID = Intrinsic::amdgcn_global_load_tr_b64;
   break;
 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
   ArgTy = llvm::FixedVectorType::get(
   llvm::Type::getInt32Ty(getLLVMContext()), 2);
-  break;
-case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
-  ArgTy = llvm::FixedVectorType::get(
-  llvm::Type::getBFloatTy(getLLVMContext()), 4);
-  break;
-case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
-  ArgTy = llvm::FixedVectorType::get(
-  llvm::Type::getHalfTy(getLLVMContext()), 4);
+  IID = Intrinsic::amdgcn_global_load_tr_b64;
   break;
 case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
   ArgTy = 

[clang] AMDGPU: Rename and add bf16 support for global_load_tr builtins (PR #86202)

2024-03-22 Thread Changpeng Fang via cfe-commits

changpeng wrote:

I am going to propose to rename intrinsics and remove f16/bf16 versions of 
builtins/intrinsics

https://github.com/llvm/llvm-project/pull/86202
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] AMDGPU: Rename and add bf16 support for global_load_tr builtins (PR #86202)

2024-03-22 Thread Changpeng Fang via cfe-commits

https://github.com/changpeng closed 
https://github.com/llvm/llvm-project/pull/86202
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] AMDGPU: Rename and add bf16 support for global_load_tr builtins (PR #86202)

2024-03-22 Thread Changpeng Fang via cfe-commits

changpeng wrote:

[AMD Official Use Only - General]

I am fine to remove f16/bf16 versions. Enumerating all possible types could be 
very painful. For example we gave up enumerating for B64, and ended up using 
v2i32 only. What do others think removing f16/bf16 versions? Thanks

Get Outlook for iOS

From: Matt Arsenault ***@***.***>
Sent: Friday, March 22, 2024 3:45:53 AM
To: llvm/llvm-project ***@***.***>
Cc: Fang, Changpeng ***@***.***>; Author ***@***.***>
Subject: Re: [llvm/llvm-project] AMDGPU: Rename and add bf16 support for 
global_load_tr builtins (PR #86202)

Caution: This message originated from an External Source. Use proper caution 
when opening attachments, clicking links, or responding.


@arsenm commented on this pull request.



In 
clang/include/clang/Basic/BuiltinsAMDGPU.def:

> -TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v2i32, "V2iV2i*1", "nc", 
> "gfx12-insts,wavefrontsize32")
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v8i16, "V8sV8s*1", "nc", 
"gfx12-insts,wavefrontsize32")
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v8f16, "V8hV8h*1", "nc", 
"gfx12-insts,wavefrontsize32")
-
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_i32, "ii*1", "nc", 
"gfx12-insts,wavefrontsize64")
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v4i16, "V4sV4s*1", "nc", 
"gfx12-insts,wavefrontsize64")
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v4f16, "V4hV4h*1", "nc", 
"gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_v2i32, "V2iV2i*1", "nc", 
"gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8i16, "V8sV8s*1", "nc", 
"gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8f16, "V8hV8h*1", "nc", 
"gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8bf16, "V8yV8y*1", "nc", 
"gfx12-insts,wavefrontsize32")
+
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_i32, "ii*1", "nc", 
"gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4i16, "V4sV4s*1", "nc", 
"gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4f16, "V4hV4h*1", "nc", 
"gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4bf16, "V4yV4y*1", "nc", 
"gfx12-insts,wavefrontsize64")


Do we really need the f16/bf16 versions? You can always bitcast the i16 
versions.

—
Reply to this email directly, view it on 
GitHub,
 or 
unsubscribe.
You are receiving this because you authored the thread.Message ID: ***@***.***>


https://github.com/llvm/llvm-project/pull/86202
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] AMDGPU: Rename and add bf16 support for global_load_tr builtins (PR #86202)

2024-03-21 Thread Changpeng Fang via cfe-commits

changpeng wrote:

> I don't think intrinsics are meant for users. Builtins are the user-facing 
> front. :-)

Then renaing the intrinsics should be relatively at a lower priority. We may do 
it in a separate patch once we have reached an agreement.

https://github.com/llvm/llvm-project/pull/86202
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] AMDGPU: Rename and add bf16 support for global_load_tr builtins (PR #86202)

2024-03-21 Thread Changpeng Fang via cfe-commits

changpeng wrote:

> > > Do you want to rename intrinsics as well? Because now intrinsic names do 
> > > not match builtin names.
> > 
> > 
> > Do we have to match builtins with intrinsics? Renaming intrinsics here 
> > means we will have to duplicate the intrinsics.
> 
> Is that because of the mangling?
Right.  It was originally suggested to use  a single instrinsic "load_lr".  But 
eventually we use global_load_tr to indicate this is in global address space.  
If we want to rename intrinsics here, it should be global_load_tr_b64 and 
global_load_tr_b128. 

We should rename intrinsic if users can use intrinsics directly. I think 
use-friendly is more important.

https://github.com/llvm/llvm-project/pull/86202
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] AMDGPU: Rename and add bf16 support for global_load_tr builtins (PR #86202)

2024-03-21 Thread Changpeng Fang via cfe-commits

changpeng wrote:

> Do you want to rename intrinsics as well? Because now intrinsic names do not 
> match builtin names.

Do we have to match builtins with intrinsics? Renaming intrinsics here means we 
will have to duplicate the intrinsics. 

https://github.com/llvm/llvm-project/pull/86202
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] AMDGPU: Rename and add bf16 support for global_load_tr builtins (PR #86202)

2024-03-21 Thread Changpeng Fang via cfe-commits

https://github.com/changpeng created 
https://github.com/llvm/llvm-project/pull/86202

  Make the name of a clang builtin as close to the mnemonic instruction name as 
possible. The data type suffix may not be enough to tell what instruction the 
builtin is going to produce.
  This patch also add the bf16 support for global_load_tr_b128 builtins.

>From a65bd5bd52db208d9aa9c22cbb834787aff978d4 Mon Sep 17 00:00:00 2001
From: Changpeng Fang 
Date: Thu, 21 Mar 2024 14:24:43 -0700
Subject: [PATCH] AMDGPU: Rename and add bf16 support for global_load_tr
 builtins

  Make the name of a clang builtin as close to the mnemonic
instruction name as possible. The data type suffix may not be
enough to tell what instruction the builtin is going to produce.

  This patch also add the bf16 support for global_load_tr_b128
builtins.
---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  | 16 
 clang/lib/CodeGen/CGBuiltin.cpp   | 34 +++--
 ...uiltins-amdgcn-global-load-tr-gfx11-err.cl | 25 ++--
 ...ins-amdgcn-global-load-tr-gfx12-w32-err.cl | 11 +++---
 ...ins-amdgcn-global-load-tr-gfx12-w64-err.cl | 11 +++---
 .../builtins-amdgcn-global-load-tr-w32.cl | 38 +--
 .../builtins-amdgcn-global-load-tr-w64.cl | 38 +--
 7 files changed, 94 insertions(+), 79 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 61ec8b79bf054d..4153b316c22b1d 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -432,13 +432,15 @@ TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vi", 
"n", "gfx12-insts")
 TARGET_BUILTIN(__builtin_amdgcn_s_barrier_leave, "b", "n", "gfx12-insts")
 TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts")
 
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v2i32, "V2iV2i*1", "nc", 
"gfx12-insts,wavefrontsize32")
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v8i16, "V8sV8s*1", "nc", 
"gfx12-insts,wavefrontsize32")
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v8f16, "V8hV8h*1", "nc", 
"gfx12-insts,wavefrontsize32")
-
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_i32, "ii*1", "nc", 
"gfx12-insts,wavefrontsize64")
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v4i16, "V4sV4s*1", "nc", 
"gfx12-insts,wavefrontsize64")
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v4f16, "V4hV4h*1", "nc", 
"gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_v2i32, "V2iV2i*1", "nc", 
"gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8i16, "V8sV8s*1", "nc", 
"gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8f16, "V8hV8h*1", "nc", 
"gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8bf16, "V8yV8y*1", "nc", 
"gfx12-insts,wavefrontsize32")
+
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_i32, "ii*1", "nc", 
"gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4i16, "V4sV4s*1", "nc", 
"gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4f16, "V4hV4h*1", "nc", 
"gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4bf16, "V4yV4y*1", "nc", 
"gfx12-insts,wavefrontsize64")
 
 
//===--===//
 // WMMA builtins.
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index e14e8908828218..2eaceeba617700 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18531,35 +18531,45 @@ Value 
*CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
 llvm::Function *F = CGM.getIntrinsic(IID, {ArgTy});
 return Builder.CreateCall(F, {Addr, Val, ZeroI32, ZeroI32, ZeroI1});
   }
-  case AMDGPU::BI__builtin_amdgcn_global_load_tr_i32:
-  case AMDGPU::BI__builtin_amdgcn_global_load_tr_v2i32:
-  case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4f16:
-  case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4i16:
-  case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8f16:
-  case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8i16: {
+  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
+  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
+  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
+  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
+  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
+  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
+  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
+  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16: {
 
 llvm::Type *ArgTy;
 switch (BuiltinID) {
-case AMDGPU::BI__builtin_amdgcn_global_load_tr_i32:
+case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
   ArgTy = llvm::Type::getInt32Ty(getLLVMContext());
   break;
-case 

[clang] [llvm] AMDGPU: Define a feature for v_dot4_f32_* instructions (PR #84248)

2024-03-06 Thread Changpeng Fang via cfe-commits

https://github.com/changpeng closed 
https://github.com/llvm/llvm-project/pull/84248
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] AMDGPU: Define a feature for v_dot4_f32_* instructions (PR #84248)

2024-03-06 Thread Changpeng Fang via cfe-commits

https://github.com/changpeng created 
https://github.com/llvm/llvm-project/pull/84248

FeatureDot11Insts (dot11-insts) for:
  v_dot4_f32_fp8_fp8, v_dot4_f32_fp8_bf8,
  v_dot4_f32_bf8_fp8, v_dot4_f32_bf8_bf8

>From 1bfc1e048d10e57c3d07038f52b072163f3b4ff9 Mon Sep 17 00:00:00 2001
From: Changpeng Fang 
Date: Wed, 6 Mar 2024 14:13:46 -0800
Subject: [PATCH] AMDGPU: Define a feature for v_dot4_f32_* instructions

FeatureDot11Insts (dot11-insts) for:
  v_dot4_f32_fp8_fp8, v_dot4_f32_fp8_bf8,
  v_dot4_f32_bf8_fp8, v_dot4_f32_bf8_bf8
---
 clang/include/clang/Basic/BuiltinsAMDGPU.def   |  8 
 clang/test/CodeGenOpenCL/amdgpu-features.cl|  4 ++--
 .../test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl |  8 
 llvm/lib/Target/AMDGPU/AMDGPU.td   | 10 ++
 llvm/lib/Target/AMDGPU/GCNSubtarget.h  |  5 +
 llvm/lib/Target/AMDGPU/VOP3PInstructions.td|  2 ++
 llvm/lib/TargetParser/TargetParser.cpp |  1 +
 7 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def 
b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 6628e8f265fe48..61ec8b79bf054d 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -256,10 +256,10 @@ TARGET_BUILTIN(__builtin_amdgcn_sudot4, "iIbiIbiiIb", 
"nc", "dot8-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sdot8, "SiSiSiSiIb", "nc", "dot1-insts")
 TARGET_BUILTIN(__builtin_amdgcn_udot8, "UiUiUiUiIb", "nc", "dot7-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sudot8, "iIbiIbiiIb", "nc", "dot8-insts")
-TARGET_BUILTIN(__builtin_amdgcn_dot4_f32_fp8_bf8, "fUiUif", "nc", 
"gfx12-insts")
-TARGET_BUILTIN(__builtin_amdgcn_dot4_f32_bf8_fp8, "fUiUif", "nc", 
"gfx12-insts")
-TARGET_BUILTIN(__builtin_amdgcn_dot4_f32_fp8_fp8, "fUiUif", "nc", 
"gfx12-insts")
-TARGET_BUILTIN(__builtin_amdgcn_dot4_f32_bf8_bf8, "fUiUif", "nc", 
"gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_dot4_f32_fp8_bf8, "fUiUif", "nc", 
"dot11-insts")
+TARGET_BUILTIN(__builtin_amdgcn_dot4_f32_bf8_fp8, "fUiUif", "nc", 
"dot11-insts")
+TARGET_BUILTIN(__builtin_amdgcn_dot4_f32_fp8_fp8, "fUiUif", "nc", 
"dot11-insts")
+TARGET_BUILTIN(__builtin_amdgcn_dot4_f32_bf8_bf8, "fUiUif", "nc", 
"dot11-insts")
 
 
//===--===//
 // GFX10+ only builtins.
diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl 
b/clang/test/CodeGenOpenCL/amdgpu-features.cl
index 9c8ca0bb96f612..7387f9a22f0dfc 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -100,8 +100,8 @@
 // GFX1103: 
"target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
 // GFX1150: 
"target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
 // GFX1151: 
"target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
-// GFX1200: 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
-// GFX1201: 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
+// GFX1200: 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
+// GFX1201: 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
 
 // GFX1103-W64: 

[clang] [mlir] [llvm] [libc] [AMDGPU] Rename AMDGPULoadTr intrinsic class. NFC. (PR #79394)

2024-01-24 Thread Changpeng Fang via cfe-commits

https://github.com/changpeng closed 
https://github.com/llvm/llvm-project/pull/79394
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [mlir] [llvm] [libc] [AMDGPU] Rename AMDGPULoadTr intrinsic class. NFC. (PR #79394)

2024-01-24 Thread Changpeng Fang via cfe-commits

https://github.com/changpeng updated 
https://github.com/llvm/llvm-project/pull/79394

>From 0f3af077baeff26d9796db73e7af19b097272fa2 Mon Sep 17 00:00:00 2001
From: Changpeng Fang 
Date: Wed, 24 Jan 2024 16:28:23 -0800
Subject: [PATCH] [AMDGPU] Rename AMDGPULoadTr intrinsic class. NFC.

  This class is not specific to "Tr"(Transpose). It is
just for a normal load.
---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td 
b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 9eb1ac8e27befb..1fbaf569c32133 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2755,7 +2755,7 @@ def int_amdgcn_global_atomic_fmax_num : 
AMDGPUAtomicRtn;
 
 def int_amdgcn_atomic_cond_sub_u32 : AMDGPUAtomicRtn;
 
-class AMDGPULoadTr:
+class AMDGPULoadIntrinsic:
   Intrinsic<
 [llvm_any_ty],
 [ptr_ty],
@@ -2775,7 +2775,7 @@ class AMDGPULoadTr:
 // <4 x half>   @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1))  -> 
global_load_tr_b128
 // <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16(ptr addrspace(1)) -> 
global_load_tr_b128
 
-def int_amdgcn_global_load_tr : AMDGPULoadTr;
+def int_amdgcn_global_load_tr : AMDGPULoadIntrinsic;
 
 
//===--===//
 // Deep learning intrinsics.

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[libcxx] [flang] [mlir] [llvm] [compiler-rt] [clang-tools-extra] [openmp] [libc] [lldb] [lld] [clang] AMDGPU: Add SourceOfDivergence for int_amdgcn_global_load_tr (PR #79218)

2024-01-23 Thread Changpeng Fang via cfe-commits

https://github.com/changpeng closed 
https://github.com/llvm/llvm-project/pull/79218
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[libcxx] [libc] [lldb] [openmp] [clang] [mlir] [clang-tools-extra] [flang] [lld] [compiler-rt] [llvm] AMDGPU: Do not generate non-temporal hint when Load_Tr intrinsic did not specify it (PR #79104)

2024-01-23 Thread Changpeng Fang via cfe-commits

https://github.com/changpeng closed 
https://github.com/llvm/llvm-project/pull/79104
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Add global_load_tr for GFX12 (PR #77772)

2024-01-18 Thread Changpeng Fang via cfe-commits

https://github.com/changpeng approved this pull request.


https://github.com/llvm/llvm-project/pull/2
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[llvm] [clang] [AMDGPU] Add global_load_tr for GFX12 (PR #77772)

2024-01-12 Thread Changpeng Fang via cfe-commits

https://github.com/changpeng deleted 
https://github.com/llvm/llvm-project/pull/2
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Add global_load_tr for GFX12 (PR #77772)

2024-01-12 Thread Changpeng Fang via cfe-commits


@@ -2496,6 +2496,26 @@ def int_amdgcn_flat_atomic_fmax_num   : 
AMDGPUAtomicRtn;
 def int_amdgcn_global_atomic_fmin_num : AMDGPUAtomicRtn;
 def int_amdgcn_global_atomic_fmax_num : AMDGPUAtomicRtn;
 
+class AMDGPUGlobalLoadTr :
+  Intrinsic<
+[data_ty],
+[global_ptr_ty],
+[IntrReadMem, IntrWillReturn, IntrConvergent, NoCapture>, 
IntrNoCallback, IntrNoFree],
+"",
+[SDNPMemOperand]
+  >;
+
+// Wave32
+// <2 x i32>  @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1)) -> 
global_load_tr_b64
+// <8 x i16>  @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1)) -> 
global_load_tr_b128
+// <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1)) -> 
global_load_tr_b128

changpeng wrote:

global_load_tr_b128 transposes to vector of b16. Do we really need to enumerate 
every possible types (i16, f16)? In that case, we may also need to consider 
bf16. 
  

https://github.com/llvm/llvm-project/pull/2
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [AMDGPU] Add global_load_tr for GFX12 (PR #77772)

2024-01-12 Thread Changpeng Fang via cfe-commits


@@ -18178,6 +18178,51 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned 
BuiltinID,
 llvm::Function *F = CGM.getIntrinsic(IID, {ArgTy});
 return Builder.CreateCall(F, {Addr, Val, ZeroI32, ZeroI32, ZeroI1});
   }
+  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
+  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
+  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
+  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
+  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
+  case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16: {
+
+Intrinsic::ID IID;
+llvm::Type *ArgTy;
+switch (BuiltinID) {
+case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
+  ArgTy = llvm::FixedVectorType::get(
+  llvm::Type::getInt32Ty(getLLVMContext()), 2);
+  IID = Intrinsic::amdgcn_global_load_tr_b64;

changpeng wrote:

> Initially I thought it was better to have _b64/_b128 explicit to avoid 
> confusion as the number of bits loaded depends also on wave size. On the 
> second thought, I believe that having just one intrinsic would be cleaner - 
> will make an update.

This doesn't work when we have instructions that transposes to vectors of B8, 
B6 and B4. We could not differentiate when we use (2 x i32) to workaround at 
this moment. 

https://github.com/llvm/llvm-project/pull/2
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[lldb] [llvm] [openmp] [clang-tools-extra] [flang] [mlir] [libcxx] [libc] [clang] GlobalISel: Guard return in llvm::getIConstantSplatVal (PR #71989)

2023-11-14 Thread Changpeng Fang via cfe-commits

https://github.com/changpeng closed 
https://github.com/llvm/llvm-project/pull/71989
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[mlir] [clang-tools-extra] [llvm] [clang] [openmp] [libc] [lldb] [libcxx] [flang] GlobalISel: Guard return in llvm::getIConstantSplatVal (PR #71989)

2023-11-14 Thread Changpeng Fang via cfe-commits

https://github.com/changpeng reopened 
https://github.com/llvm/llvm-project/pull/71989
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[mlir] [clang-tools-extra] [llvm] [clang] [openmp] [libc] [lldb] [libcxx] [flang] GlobalISel: Guard return in llvm::getIConstantSplatVal (PR #71989)

2023-11-14 Thread Changpeng Fang via cfe-commits

https://github.com/changpeng closed 
https://github.com/llvm/llvm-project/pull/71989
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[lldb] [flang] [mlir] [clang-tools-extra] [openmp] [clang] [libcxx] [libc] [llvm] GlobalISel: Guard return in llvm::getIConstantSplatVal (PR #71989)

2023-11-14 Thread Changpeng Fang via cfe-commits

changpeng wrote:

> Typo in subject "**Guard** return ..."?

You are right. Thanks.

https://github.com/llvm/llvm-project/pull/71989
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[lldb] [flang] [mlir] [clang-tools-extra] [openmp] [clang] [libcxx] [libc] [llvm] GlobalISel: Guard return in llvm::getIConstantSplatVal (PR #71989)

2023-11-14 Thread Changpeng Fang via cfe-commits

https://github.com/changpeng edited 
https://github.com/llvm/llvm-project/pull/71989
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[lldb] [mlir] [flang] [llvm] [libc] [libcxx] [openmp] [clang-tools-extra] [clang] GlobalISel: Guide return in llvm::getIConstantSplatVal (PR #71989)

2023-11-10 Thread Changpeng Fang via cfe-commits

changpeng wrote:

> Any tests?
Encountered this issue during a downstream branch testing. No test for trunk 
yet but think the issue should be here.  

https://github.com/llvm/llvm-project/pull/71989
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AMDGPU] make v32i16/v32f16 legal (PR #70484)

2023-10-27 Thread Changpeng Fang via cfe-commits

https://github.com/changpeng closed 
https://github.com/llvm/llvm-project/pull/70484
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [AMDGPU] make v32i16/v32f16 legal (PR #70484)

2023-10-27 Thread Changpeng Fang via cfe-commits

https://github.com/changpeng edited 
https://github.com/llvm/llvm-project/pull/70484
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] d77c620 - [clang][AMDGPU]: Don't use byval for struct arguments in function ABI

2023-08-11 Thread Changpeng Fang via cfe-commits

Author: Changpeng Fang
Date: 2023-08-11T16:37:42-07:00
New Revision: d77c62053c944652846c00a35c921e14b43b1877

URL: 
https://github.com/llvm/llvm-project/commit/d77c62053c944652846c00a35c921e14b43b1877
DIFF: 
https://github.com/llvm/llvm-project/commit/d77c62053c944652846c00a35c921e14b43b1877.diff

LOG: [clang][AMDGPU]: Don't use byval for struct arguments in function ABI

Summary:
  Byval requires allocating additional stack space, and always requires an 
implicit copy to be inserted in codegen,
where it can be difficult to optimize. In this work, we use 
byref/IndirectAliased promotion method instead of
byval with the implicit copy semantics.

Reviewers:
  arsenm

Differential Revision:
  https://reviews.llvm.org/D155986

Added: 
clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl

Modified: 
clang/docs/ReleaseNotes.rst
clang/lib/CodeGen/CGCall.cpp
clang/lib/CodeGen/Targets/AMDGPU.cpp
clang/test/CodeGenCUDA/kernel-args.cu
clang/test/CodeGenCXX/amdgcn-func-arg.cpp
clang/test/CodeGenOpenCL/addr-space-struct-arg.cl
clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl
clang/test/CodeGenOpenCL/byval.cl
llvm/docs/AMDGPUUsage.rst

Removed: 




diff  --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 860bcceeef21ff..cd7beff546c932 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -193,6 +193,10 @@ Target Specific Changes
 
 AMDGPU Support
 ^^
+- Use pass-by-reference (byref) in stead of pass-by-value (byval) for struct
+  arguments in C ABI. Callee is responsible for allocating stack memory and
+  copying the value of the struct if modified. Note that AMDGPU backend still
+  supports byval for struct arguments.
 
 X86 Support
 ^^^

diff  --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 28c3bc7c9f70f6..2b5121a7b23063 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -2156,7 +2156,8 @@ static bool DetermineNoUndef(QualType QTy, CodeGenTypes 
,
  const llvm::DataLayout , const ABIArgInfo ,
  bool CheckCoerce = true) {
   llvm::Type *Ty = Types.ConvertTypeForMem(QTy);
-  if (AI.getKind() == ABIArgInfo::Indirect)
+  if (AI.getKind() == ABIArgInfo::Indirect ||
+  AI.getKind() == ABIArgInfo::IndirectAliased)
 return true;
   if (AI.getKind() == ABIArgInfo::Extend)
 return true;
@@ -5126,12 +5127,15 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo 
,
   auto LV = I->getKnownLValue();
   auto AS = LV.getAddressSpace();
 
-  if (!ArgInfo.getIndirectByVal() ||
+  bool isByValOrRef =
+  ArgInfo.isIndirectAliased() || ArgInfo.getIndirectByVal();
+
+  if (!isByValOrRef ||
   (LV.getAlignment() < getContext().getTypeAlignInChars(I->Ty))) {
 NeedCopy = true;
   }
   if (!getLangOpts().OpenCL) {
-if ((ArgInfo.getIndirectByVal() &&
+if ((isByValOrRef &&
 (AS != LangAS::Default &&
  AS != CGM.getASTAllocaAddressSpace( {
   NeedCopy = true;
@@ -5139,7 +5143,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo 
,
   }
   // For OpenCL even if RV is located in default or alloca address 
space
   // we don't want to perform address space cast for it.
-  else if ((ArgInfo.getIndirectByVal() &&
+  else if ((isByValOrRef &&
 Addr.getType()->getAddressSpace() != IRFuncTy->
   getParamType(FirstIRArg)->getPointerAddressSpace())) {
 NeedCopy = true;

diff  --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp 
b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 6e40c0a6607fae..1e7b036de82efd 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -248,6 +248,12 @@ ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty,
 return ABIArgInfo::getDirect();
   }
 }
+
+// Use pass-by-reference in stead of pass-by-value for struct arguments in
+// function ABI.
+return ABIArgInfo::getIndirectAliased(
+getContext().getTypeAlignInChars(Ty),
+getContext().getTargetAddressSpace(LangAS::opencl_private));
   }
 
   // Otherwise just do the default thing.

diff  --git a/clang/test/CodeGenCUDA/kernel-args.cu 
b/clang/test/CodeGenCUDA/kernel-args.cu
index 5f064694223b55..bcce729f14481c 100644
--- a/clang/test/CodeGenCUDA/kernel-args.cu
+++ b/clang/test/CodeGenCUDA/kernel-args.cu
@@ -9,14 +9,14 @@ struct A {
   float *p;
 };
 
-// AMDGCN: define{{.*}} amdgpu_kernel void @_Z6kernel1A(ptr addrspace(4) 
byref(%struct.A) align 8 %{{.+}})
+// AMDGCN: define{{.*}} amdgpu_kernel void @_Z6kernel1A(ptr addrspace(4) 
noundef byref(%struct.A) align 8 %{{.+}})
 // NVPTX: define{{.*}} void @_Z6kernel1A(ptr noundef 

[clang] 4608686 - [clang][test] Fix LIT test failures for the following commit

2023-08-09 Thread Changpeng Fang via cfe-commits

Author: Changpeng Fang
Date: 2023-08-09T18:23:18-07:00
New Revision: 4608686849bcb6e20de827750862d5345cbd

URL: 
https://github.com/llvm/llvm-project/commit/4608686849bcb6e20de827750862d5345cbd
DIFF: 
https://github.com/llvm/llvm-project/commit/4608686849bcb6e20de827750862d5345cbd.diff

LOG: [clang][test] Fix LIT test failures for the following commit

 commit c1803d5366c794ecade4e4ccd0013690a1976d49 (HEAD -> main, origin/main, 
origin/HEAD)
Author: Changpeng Fang 
Date:   Wed Aug 9 17:49:14 2023 -0700

[FunctionAttrs] Unconditionally perform argument attribute inference in the 
first function-attrs pass

Differential Revision:
  https://reviews.llvm.org/D156397

Added: 


Modified: 
clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl
clang/test/CodeGenOpenCL/kernel-param-alignment.cl

Removed: 




diff  --git a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl 
b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl
index 8c5ce6dfb704c6..1da27e54e6810d 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl
@@ -307,7 +307,7 @@ void 
func_single_struct_element_struct_arg(single_struct_element_struct_arg_t ar
 // CHECK: void @func_
diff erent_size_type_pair_arg(i64 %arg1.coerce0, i32 %arg1.coerce1)
 void func_
diff erent_size_type_pair_arg(
diff erent_size_type_pair arg1) { }
 
-// CHECK: void @func_flexible_array_arg(ptr addrspace(5) nocapture noundef 
byval(%struct.flexible_array) align 4 %arg)
+// CHECK: void @func_flexible_array_arg(ptr addrspace(5) nocapture noundef 
readnone byval(%struct.flexible_array) align 4 %arg)
 void func_flexible_array_arg(flexible_array arg) { }
 
 // CHECK: define{{.*}} float @func_f32_ret()
@@ -448,11 +448,11 @@ flexible_array func_flexible_array_ret()
 // CHECK: define{{.*}} void @func_reg_state_lo(<4 x i32> noundef %arg0, <4 x 
i32> noundef %arg1, <4 x i32> noundef %arg2, i32 noundef %arg3, i32 %s.coerce0, 
float %s.coerce1, i32 %s.coerce2)
 void func_reg_state_lo(int4 arg0, int4 arg1, int4 arg2, int arg3, struct_arg_t 
s) { }
 
-// CHECK: define{{.*}} void @func_reg_state_hi(<4 x i32> noundef %arg0, <4 x 
i32> noundef %arg1, <4 x i32> noundef %arg2, i32 noundef %arg3, i32 noundef 
%arg4, ptr addrspace(5) nocapture noundef byval(%struct.struct_arg) align 4 %s)
+// CHECK: define{{.*}} void @func_reg_state_hi(<4 x i32> noundef %arg0, <4 x 
i32> noundef %arg1, <4 x i32> noundef %arg2, i32 noundef %arg3, i32 noundef 
%arg4, ptr addrspace(5) nocapture noundef readnone byval(%struct.struct_arg) 
align 4 %s)
 void func_reg_state_hi(int4 arg0, int4 arg1, int4 arg2, int arg3, int arg4, 
struct_arg_t s) { }
 
 // XXX - Why don't the inner structs flatten?
-// CHECK: define{{.*}} void @func_reg_state_num_regs_nested_struct(<4 x i32> 
noundef %arg0, i32 noundef %arg1, i32 %arg2.coerce0, %struct.nested 
%arg2.coerce1, i32 %arg3.coerce0, %struct.nested %arg3.coerce1, ptr 
addrspace(5) nocapture noundef byval(%struct.num_regs_nested_struct) align 8 
%arg4)
+// CHECK: define{{.*}} void @func_reg_state_num_regs_nested_struct(<4 x i32> 
noundef %arg0, i32 noundef %arg1, i32 %arg2.coerce0, %struct.nested 
%arg2.coerce1, i32 %arg3.coerce0, %struct.nested %arg3.coerce1, ptr 
addrspace(5) nocapture noundef readnone byval(%struct.num_regs_nested_struct) 
align 8 %arg4)
 void func_reg_state_num_regs_nested_struct(int4 arg0, int arg1, 
num_regs_nested_struct arg2, num_regs_nested_struct arg3, 
num_regs_nested_struct arg4) { }
 
 // CHECK: define{{.*}} void @func_double_nested_struct_arg(<4 x i32> noundef 
%arg0, i32 noundef %arg1, i32 %arg2.coerce0, %struct.double_nested 
%arg2.coerce1, i16 %arg2.coerce2)
@@ -477,7 +477,7 @@ void v3i32_reg_count(int3 arg1, int3 arg2, int3 arg3, int3 
arg4, struct_arg_t ar
 
 // Function signature from blender, nothing should be passed byval. The v3i32
 // should not count as 4 passed registers.
-// CHECK: define{{.*}} void @v3i32_pair_reg_count(ptr addrspace(5) nocapture 
noundef %arg0, <3 x i32> %arg1.coerce0, <3 x i32> %arg1.coerce1, <3 x i32> 
noundef %arg2, <3 x i32> %arg3.coerce0, <3 x i32> %arg3.coerce1, <3 x i32> 
noundef %arg4, float noundef %arg5)
+// CHECK: define{{.*}} void @v3i32_pair_reg_count(ptr addrspace(5) nocapture 
noundef readnone %arg0, <3 x i32> %arg1.coerce0, <3 x i32> %arg1.coerce1, <3 x 
i32> noundef %arg2, <3 x i32> %arg3.coerce0, <3 x i32> %arg3.coerce1, <3 x i32> 
noundef %arg4, float noundef %arg5)
 void v3i32_pair_reg_count(int3_pair *arg0, int3_pair arg1, int3 arg2, 
int3_pair arg3, int3 arg4, float arg5) { }
 
 // Each short4 should fit pack into 2 registers.
@@ -485,7 +485,7 @@ void v3i32_pair_reg_count(int3_pair *arg0, int3_pair arg1, 
int3 arg2, int3_pair
 void v4i16_reg_count(short4 arg0, short4 arg1, short4 arg2, short4 arg3,
  short4 arg4, short4 arg5, struct_4regs arg6) { }
 
-// CHECK: define{{.*}} void 

[clang] dd5895c - AMDGPU: Use the implicit kernargs for code object version 5

2022-03-17 Thread Changpeng Fang via cfe-commits

Author: Changpeng Fang
Date: 2022-03-17T14:12:36-07:00
New Revision: dd5895cc39864393f8ca357bc4e23e8d7b5b9723

URL: 
https://github.com/llvm/llvm-project/commit/dd5895cc39864393f8ca357bc4e23e8d7b5b9723
DIFF: 
https://github.com/llvm/llvm-project/commit/dd5895cc39864393f8ca357bc4e23e8d7b5b9723.diff

LOG: AMDGPU: Use the implicit kernargs for code object version 5

Summary:
  Specifically, for trap handling, for targets that do not support 
getDoorbellID,
we load the queue_ptr from the implicit kernarg, and move queue_ptr to s[0:1].
To get aperture bases when targets do not have aperture registers, we load
private_base or shared_base directly from the implicit kernarg. In clang, we use
implicitarg_ptr + offsets to implement __builtin_amdgcn_workgroup_size_{xyz}.

Reviewers: arsenm, sameerds, yaxunl

Differential Revision: https://reviews.llvm.org/D120265

Added: 

llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll

Modified: 
clang/lib/CodeGen/CGBuiltin.cpp
clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
llvm/lib/Target/AMDGPU/SIDefines.h
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.h
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

Removed: 




diff  --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 4ac7b6e79ff3e..39e88482db94d 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -16258,12 +16258,31 @@ Value *EmitAMDGPUDispatchPtr(CodeGenFunction ,
   return CGF.Builder.CreateAddrSpaceCast(Call, RetTy);
 }
 
+Value *EmitAMDGPUImplicitArgPtr(CodeGenFunction ) {
+  auto *F = CGF.CGM.getIntrinsic(Intrinsic::amdgcn_implicitarg_ptr);
+  auto *Call = CGF.Builder.CreateCall(F);
+  Call->addRetAttr(
+  Attribute::getWithDereferenceableBytes(Call->getContext(), 256));
+  Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(8)));
+  return Call;
+}
+
 // \p Index is 0, 1, and 2 for x, y, and z dimension, respectively.
 Value *EmitAMDGPUWorkGroupSize(CodeGenFunction , unsigned Index) {
-  const unsigned XOffset = 4;
-  auto *DP = EmitAMDGPUDispatchPtr(CGF);
-  // Indexing the HSA kernel_dispatch_packet struct.
-  auto *Offset = llvm::ConstantInt::get(CGF.Int32Ty, XOffset + Index * 2);
+  bool IsCOV_5 = CGF.getTarget().getTargetOpts().CodeObjectVersion ==
+ clang::TargetOptions::COV_5;
+  Constant *Offset;
+  Value *DP;
+  if (IsCOV_5) {
+// Indexing the implicit kernarg segment.
+Offset = llvm::ConstantInt::get(CGF.Int32Ty, 12 + Index * 2);
+DP = EmitAMDGPUImplicitArgPtr(CGF);
+  } else {
+// Indexing the HSA kernel_dispatch_packet struct.
+Offset = llvm::ConstantInt::get(CGF.Int32Ty, 4 + Index * 2);
+DP = EmitAMDGPUDispatchPtr(CGF);
+  }
+
   auto *GEP = CGF.Builder.CreateGEP(CGF.Int8Ty, DP, Offset);
   auto *DstTy =
   CGF.Int16Ty->getPointerTo(GEP->getType()->getPointerAddressSpace());

diff  --git a/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu 
b/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
index 5928320b89f00..4c1c4c883a152 100644
--- a/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu
@@ -1,17 +1,31 @@
 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \
 // RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \
-// RUN: | FileCheck %s
+// RUN: | FileCheck -check-prefix=PRECOV5 %s
+
+
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \
+// RUN: -fcuda-is-device -mcode-object-version=5 -emit-llvm -o - -x hip %s 
\
+// RUN: | FileCheck -check-prefix=COV5 %s
 
 #include "Inputs/cuda.h"
 
-// CHECK-LABEL: test_get_workgroup_size
-// CHECK: call align 4 dereferenceable(64) i8 addrspace(4)* 
@llvm.amdgcn.dispatch.ptr()
-// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i32 4
-// CHECK: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range 
[[$WS_RANGE:![0-9]*]], !invariant.load
-// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i32 6
-// CHECK: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range 
[[$WS_RANGE:![0-9]*]], !invariant.load
-// CHECK: getelementptr i8, i8 addrspace(4)* %{{.*}}, i32 8
-// CHECK: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range 
[[$WS_RANGE:![0-9]*]], !invariant.load
+// PRECOV5-LABEL: test_get_workgroup_size
+// PRECOV5: call align 4 dereferenceable(64) i8 addrspace(4)* 
@llvm.amdgcn.dispatch.ptr()
+// PRECOV5: getelementptr i8, i8 addrspace(4)* %{{.*}}, i32 4
+// PRECOV5: load i16, i16 addrspace(4)* %{{.*}}, align 2, !range 
[[$WS_RANGE:![0-9]*]], !invariant.load
+// PRECOV5: getelementptr i8, i8 

r279165 - AMDGPU: Add clang builtin for ds_swizzle.

2016-08-18 Thread Changpeng Fang via cfe-commits
Author: chfang
Date: Thu Aug 18 17:04:54 2016
New Revision: 279165

URL: http://llvm.org/viewvc/llvm-project?rev=279165=rev
Log:
AMDGPU: Add clang builtin for ds_swizzle.

Summary:
  int __builtin_amdgcn_ds_swizzle (int a, int imm);
while imm is a constant.

Differential Revision:
  http://reviews.llvm.org/D23682

Modified:
cfe/trunk/include/clang/Basic/BuiltinsAMDGPU.def
cfe/trunk/lib/CodeGen/CGBuiltin.cpp
cfe/trunk/test/CodeGenOpenCL/builtins-amdgcn-error.cl
cfe/trunk/test/CodeGenOpenCL/builtins-amdgcn.cl

Modified: cfe/trunk/include/clang/Basic/BuiltinsAMDGPU.def
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/BuiltinsAMDGPU.def?rev=279165=279164=279165=diff
==
--- cfe/trunk/include/clang/Basic/BuiltinsAMDGPU.def (original)
+++ cfe/trunk/include/clang/Basic/BuiltinsAMDGPU.def Thu Aug 18 17:04:54 2016
@@ -76,6 +76,7 @@ BUILTIN(__builtin_amdgcn_sicmp, "LUiiiIi
 BUILTIN(__builtin_amdgcn_sicmpl, "LUiLiLiIi", "nc")
 BUILTIN(__builtin_amdgcn_fcmp, "LUiddIi", "nc")
 BUILTIN(__builtin_amdgcn_fcmpf, "LUiffIi", "nc")
+BUILTIN(__builtin_amdgcn_ds_swizzle, "iiIi", "nc")
 
 
//===--===//
 // VI+ only builtins.

Modified: cfe/trunk/lib/CodeGen/CGBuiltin.cpp
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CGBuiltin.cpp?rev=279165=279164=279165=diff
==
--- cfe/trunk/lib/CodeGen/CGBuiltin.cpp (original)
+++ cfe/trunk/lib/CodeGen/CGBuiltin.cpp Thu Aug 18 17:04:54 2016
@@ -7652,6 +7652,9 @@ Value *CodeGenFunction::EmitAMDGPUBuilti
 llvm::Value *Src3ToBool = Builder.CreateIsNotNull(Src3);
 return Builder.CreateCall(F, {Src0, Src1, Src2, Src3ToBool});
   }
+
+  case AMDGPU::BI__builtin_amdgcn_ds_swizzle:
+return emitBinaryBuiltin(*this, E, Intrinsic::amdgcn_ds_swizzle);
   case AMDGPU::BI__builtin_amdgcn_div_fixup:
   case AMDGPU::BI__builtin_amdgcn_div_fixupf:
 return emitTernaryBuiltin(*this, E, Intrinsic::amdgcn_div_fixup);

Modified: cfe/trunk/test/CodeGenOpenCL/builtins-amdgcn-error.cl
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGenOpenCL/builtins-amdgcn-error.cl?rev=279165=279164=279165=diff
==
--- cfe/trunk/test/CodeGenOpenCL/builtins-amdgcn-error.cl (original)
+++ cfe/trunk/test/CodeGenOpenCL/builtins-amdgcn-error.cl Thu Aug 18 17:04:54 
2016
@@ -48,3 +48,7 @@ void test_fcmp_f64(global ulong* out, do
   *out = __builtin_amdgcn_fcmp(a, b, c); // expected-error {{argument to 
'__builtin_amdgcn_fcmp' must be a constant integer}}
 }
 
+void test_ds_swizzle(global int* out, int a, int b)
+{
+  *out = __builtin_amdgcn_ds_swizzle(a, b); // expected-error {{argument to 
'__builtin_amdgcn_ds_swizzle' must be a constant integer}}
+}

Modified: cfe/trunk/test/CodeGenOpenCL/builtins-amdgcn.cl
URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGenOpenCL/builtins-amdgcn.cl?rev=279165=279164=279165=diff
==
--- cfe/trunk/test/CodeGenOpenCL/builtins-amdgcn.cl (original)
+++ cfe/trunk/test/CodeGenOpenCL/builtins-amdgcn.cl Thu Aug 18 17:04:54 2016
@@ -228,6 +228,13 @@ void test_uicmp_i64(global ulong* out, u
   *out = __builtin_amdgcn_uicmpl(a, b, 30+5);
 }
 
+// CHECK-LABEL: @test_ds_swizzle
+// CHECK: call i32 @llvm.amdgcn.ds.swizzle(i32 %a, i32 32)
+void test_ds_swizzle(global int* out, int a)
+{
+  *out = __builtin_amdgcn_ds_swizzle(a, 32);
+}
+
 // CHECK-LABEL: @test_fcmp_f32
 // CHECK: call i64 @llvm.amdgcn.fcmp.f32(float %a, float %b, i32 5)
 void test_fcmp_f32(global ulong* out, float a, float b)


___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits