[PATCH] D76795: [HIP] Change default --gpu-max-threads-per-block value to 1024

2020-06-03 Thread Yaxun Liu via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
yaxunl marked an inline comment as done.
Closed by commit rG04abbb3a7818: [HIP] Change default 
--gpu-max-threads-per-block value to 1024 (authored by yaxunl).
Herald added a project: clang.

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D76795/new/

https://reviews.llvm.org/D76795

Files:
  clang/include/clang/Basic/LangOptions.def
  clang/lib/CodeGen/TargetInfo.cpp
  clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
  clang/test/CodeGenCUDA/kernel-amdgcn.cu


Index: clang/test/CodeGenCUDA/kernel-amdgcn.cu
===
--- clang/test/CodeGenCUDA/kernel-amdgcn.cu
+++ clang/test/CodeGenCUDA/kernel-amdgcn.cu
@@ -39,4 +39,4 @@
   launch((void*)D.Empty());
   return 0;
 }
-// CHECK: attributes #[[ATTR]] = {{.*}}"amdgpu-flat-work-group-size"="1,256"
+// CHECK: attributes #[[ATTR]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"
Index: clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
===
--- clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
+++ clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
@@ -39,7 +39,7 @@
 // NAMD-NOT: "amdgpu-num-vgpr"
 // NAMD-NOT: "amdgpu-num-sgpr"
 
-// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = 
{{.*}}"amdgpu-flat-work-group-size"="1,256"{{.*}}"uniform-work-group-size"="true"
+// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = 
{{.*}}"amdgpu-flat-work-group-size"="1,1024"{{.*}}"uniform-work-group-size"="true"
 // MAX1024-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = 
{{.*}}"amdgpu-flat-work-group-size"="1,1024"
 // CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = 
{{.*}}"amdgpu-flat-work-group-size"="32,64"
 // CHECK-DAG: attributes [[WAVES_PER_EU_2]] = {{.*}}"amdgpu-waves-per-eu"="2"
Index: clang/lib/CodeGen/TargetInfo.cpp
===
--- clang/lib/CodeGen/TargetInfo.cpp
+++ clang/lib/CodeGen/TargetInfo.cpp
@@ -8815,9 +8815,13 @@
   assert(Max == 0 && "Max must be zero");
   } else if (IsOpenCLKernel || IsHIPKernel) {
 // By default, restrict the maximum size to a value specified by
-// --gpu-max-threads-per-block=n or its default value.
+// --gpu-max-threads-per-block=n or its default value for HIP.
+const unsigned OpenCLDefaultMaxWorkGroupSize = 256;
+const unsigned DefaultMaxWorkGroupSize =
+IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize
+   : M.getLangOpts().GPUMaxThreadsPerBlock;
 std::string AttrVal =
-std::string("1,") + 
llvm::utostr(M.getLangOpts().GPUMaxThreadsPerBlock);
+std::string("1,") + llvm::utostr(DefaultMaxWorkGroupSize);
 F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
   }
 
Index: clang/include/clang/Basic/LangOptions.def
===
--- clang/include/clang/Basic/LangOptions.def
+++ clang/include/clang/Basic/LangOptions.def
@@ -238,7 +238,7 @@
 LANGOPT(CUDADeviceApproxTranscendentals, 1, 0, "using approximate 
transcendental functions")
 LANGOPT(GPURelocatableDeviceCode, 1, 0, "generate relocatable device code")
 LANGOPT(GPUAllowDeviceInit, 1, 0, "allowing device side global init functions 
for HIP")
-LANGOPT(GPUMaxThreadsPerBlock, 32, 256, "default max threads per block for 
kernel launch bounds for HIP")
+LANGOPT(GPUMaxThreadsPerBlock, 32, 1024, "default max threads per block for 
kernel launch bounds for HIP")
 
 LANGOPT(SYCL  , 1, 0, "SYCL")
 LANGOPT(SYCLIsDevice  , 1, 0, "Generate code for SYCL device")


Index: clang/test/CodeGenCUDA/kernel-amdgcn.cu
===
--- clang/test/CodeGenCUDA/kernel-amdgcn.cu
+++ clang/test/CodeGenCUDA/kernel-amdgcn.cu
@@ -39,4 +39,4 @@
   launch((void*)D.Empty());
   return 0;
 }
-// CHECK: attributes #[[ATTR]] = {{.*}}"amdgpu-flat-work-group-size"="1,256"
+// CHECK: attributes #[[ATTR]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"
Index: clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
===
--- clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
+++ clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
@@ -39,7 +39,7 @@
 // NAMD-NOT: "amdgpu-num-vgpr"
 // NAMD-NOT: "amdgpu-num-sgpr"
 
-// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,256"{{.*}}"uniform-work-group-size"="true"
+// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"{{.*}}"uniform-work-group-size"="true"
 // MAX1024-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"
 // CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = {{.*}}"amdgpu-flat-work-group-size"="32,64"
 // CHECK-DAG: attributes [[WAVES_PER_EU_2]] = {{.*}}"amdgpu-waves-per-eu"="2"
Index: 

[PATCH] D76795: [HIP] Change default --gpu-max-threads-per-block value to 1024

2020-03-25 Thread Brian Sumner via Phabricator via cfe-commits
b-sumner added a comment.

Thanks.  This looks fine to me.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D76795/new/

https://reviews.llvm.org/D76795



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D76795: [HIP] Change default --gpu-max-threads-per-block value to 1024

2020-03-25 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl updated this revision to Diff 252661.
yaxunl added a comment.

change variable names


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D76795/new/

https://reviews.llvm.org/D76795

Files:
  clang/include/clang/Basic/LangOptions.def
  clang/lib/CodeGen/TargetInfo.cpp
  clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
  clang/test/CodeGenCUDA/kernel-amdgcn.cu


Index: clang/test/CodeGenCUDA/kernel-amdgcn.cu
===
--- clang/test/CodeGenCUDA/kernel-amdgcn.cu
+++ clang/test/CodeGenCUDA/kernel-amdgcn.cu
@@ -39,4 +39,4 @@
   launch((void*)D.Empty());
   return 0;
 }
-// CHECK: attributes #[[ATTR]] = {{.*}}"amdgpu-flat-work-group-size"="1,256"
+// CHECK: attributes #[[ATTR]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"
Index: clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
===
--- clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
+++ clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
@@ -39,7 +39,7 @@
 // NAMD-NOT: "amdgpu-num-vgpr"
 // NAMD-NOT: "amdgpu-num-sgpr"
 
-// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = 
{{.*}}"amdgpu-flat-work-group-size"="1,256"{{.*}}"uniform-work-group-size"="true"
+// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = 
{{.*}}"amdgpu-flat-work-group-size"="1,1024"{{.*}}"uniform-work-group-size"="true"
 // MAX1024-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = 
{{.*}}"amdgpu-flat-work-group-size"="1,1024"
 // CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = 
{{.*}}"amdgpu-flat-work-group-size"="32,64"
 // CHECK-DAG: attributes [[WAVES_PER_EU_2]] = {{.*}}"amdgpu-waves-per-eu"="2"
Index: clang/lib/CodeGen/TargetInfo.cpp
===
--- clang/lib/CodeGen/TargetInfo.cpp
+++ clang/lib/CodeGen/TargetInfo.cpp
@@ -8119,9 +8119,13 @@
   assert(Max == 0 && "Max must be zero");
   } else if (IsOpenCLKernel || IsHIPKernel) {
 // By default, restrict the maximum size to a value specified by
-// --gpu-max-threads-per-block=n or its default value.
+// --gpu-max-threads-per-block=n or its default value for HIP.
+const unsigned OpenCLDefaultMaxWorkGroupSize = 256;
+const unsigned DefaultMaxWorkGroupSize =
+IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize
+   : M.getLangOpts().GPUMaxThreadsPerBlock;
 std::string AttrVal =
-std::string("1,") + 
llvm::utostr(M.getLangOpts().GPUMaxThreadsPerBlock);
+std::string("1,") + llvm::utostr(DefaultMaxWorkGroupSize);
 F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
   }
 
Index: clang/include/clang/Basic/LangOptions.def
===
--- clang/include/clang/Basic/LangOptions.def
+++ clang/include/clang/Basic/LangOptions.def
@@ -231,7 +231,7 @@
 LANGOPT(CUDADeviceApproxTranscendentals, 1, 0, "using approximate 
transcendental functions")
 LANGOPT(GPURelocatableDeviceCode, 1, 0, "generate relocatable device code")
 LANGOPT(GPUAllowDeviceInit, 1, 0, "allowing device side global init functions 
for HIP")
-LANGOPT(GPUMaxThreadsPerBlock, 32, 256, "default max threads per block for 
kernel launch bounds for HIP")
+LANGOPT(GPUMaxThreadsPerBlock, 32, 1024, "default max threads per block for 
kernel launch bounds for HIP")
 
 LANGOPT(SYCL  , 1, 0, "SYCL")
 LANGOPT(SYCLIsDevice  , 1, 0, "Generate code for SYCL device")


Index: clang/test/CodeGenCUDA/kernel-amdgcn.cu
===
--- clang/test/CodeGenCUDA/kernel-amdgcn.cu
+++ clang/test/CodeGenCUDA/kernel-amdgcn.cu
@@ -39,4 +39,4 @@
   launch((void*)D.Empty());
   return 0;
 }
-// CHECK: attributes #[[ATTR]] = {{.*}}"amdgpu-flat-work-group-size"="1,256"
+// CHECK: attributes #[[ATTR]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"
Index: clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
===
--- clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
+++ clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
@@ -39,7 +39,7 @@
 // NAMD-NOT: "amdgpu-num-vgpr"
 // NAMD-NOT: "amdgpu-num-sgpr"
 
-// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,256"{{.*}}"uniform-work-group-size"="true"
+// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"{{.*}}"uniform-work-group-size"="true"
 // MAX1024-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"
 // CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = {{.*}}"amdgpu-flat-work-group-size"="32,64"
 // CHECK-DAG: attributes [[WAVES_PER_EU_2]] = {{.*}}"amdgpu-waves-per-eu"="2"
Index: clang/lib/CodeGen/TargetInfo.cpp
===
--- clang/lib/CodeGen/TargetInfo.cpp
+++ clang/lib/CodeGen/TargetInfo.cpp
@@ -8119,9 +8119,13 @@
   assert

[PATCH] D76795: [HIP] Change default --gpu-max-threads-per-block value to 1024

2020-03-25 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl marked 2 inline comments as done.
yaxunl added inline comments.



Comment at: clang/lib/CodeGen/TargetInfo.cpp:8123
+// --gpu-max-threads-per-block=n or its default value for HIP.
+const unsigned OpenCLMaxWorkGroupSize = 256;
+const unsigned MaxWorkGroupSize =

b-sumner wrote:
> I'd like to see the word default, e.g. OpenCLDefaultMaxWorkGroupSize, used 
> more since that is what this is about.  Ideally the option would have been 
> named gpu-default-max-threads-per-block, but I suppose I can see why it was 
> shortened.
changed variable names.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D76795/new/

https://reviews.llvm.org/D76795



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D76795: [HIP] Change default --gpu-max-threads-per-block value to 1024

2020-03-25 Thread Brian Sumner via Phabricator via cfe-commits
b-sumner added inline comments.



Comment at: clang/lib/CodeGen/TargetInfo.cpp:8123
+// --gpu-max-threads-per-block=n or its default value for HIP.
+const unsigned OpenCLMaxWorkGroupSize = 256;
+const unsigned MaxWorkGroupSize =

I'd like to see the word default, e.g. OpenCLDefaultMaxWorkGroupSize, used more 
since that is what this is about.  Ideally the option would have been named 
gpu-default-max-threads-per-block, but I suppose I can see why it was shortened.


CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D76795/new/

https://reviews.llvm.org/D76795



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D76795: [HIP] Change default --gpu-max-threads-per-block value to 1024

2020-03-25 Thread Yaxun Liu via Phabricator via cfe-commits
yaxunl created this revision.
yaxunl added reviewers: b-sumner, tra.
Herald added subscribers: kerbowa, nhaehnle, jvesely.

This better matches CUDA behavior.


https://reviews.llvm.org/D76795

Files:
  clang/include/clang/Basic/LangOptions.def
  clang/lib/CodeGen/TargetInfo.cpp
  clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
  clang/test/CodeGenCUDA/kernel-amdgcn.cu


Index: clang/test/CodeGenCUDA/kernel-amdgcn.cu
===
--- clang/test/CodeGenCUDA/kernel-amdgcn.cu
+++ clang/test/CodeGenCUDA/kernel-amdgcn.cu
@@ -39,4 +39,4 @@
   launch((void*)D.Empty());
   return 0;
 }
-// CHECK: attributes #[[ATTR]] = {{.*}}"amdgpu-flat-work-group-size"="1,256"
+// CHECK: attributes #[[ATTR]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"
Index: clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
===
--- clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
+++ clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
@@ -39,7 +39,7 @@
 // NAMD-NOT: "amdgpu-num-vgpr"
 // NAMD-NOT: "amdgpu-num-sgpr"
 
-// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = 
{{.*}}"amdgpu-flat-work-group-size"="1,256"{{.*}}"uniform-work-group-size"="true"
+// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = 
{{.*}}"amdgpu-flat-work-group-size"="1,1024"{{.*}}"uniform-work-group-size"="true"
 // MAX1024-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = 
{{.*}}"amdgpu-flat-work-group-size"="1,1024"
 // CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = 
{{.*}}"amdgpu-flat-work-group-size"="32,64"
 // CHECK-DAG: attributes [[WAVES_PER_EU_2]] = {{.*}}"amdgpu-waves-per-eu"="2"
Index: clang/lib/CodeGen/TargetInfo.cpp
===
--- clang/lib/CodeGen/TargetInfo.cpp
+++ clang/lib/CodeGen/TargetInfo.cpp
@@ -8119,9 +8119,12 @@
   assert(Max == 0 && "Max must be zero");
   } else if (IsOpenCLKernel || IsHIPKernel) {
 // By default, restrict the maximum size to a value specified by
-// --gpu-max-threads-per-block=n or its default value.
-std::string AttrVal =
-std::string("1,") + 
llvm::utostr(M.getLangOpts().GPUMaxThreadsPerBlock);
+// --gpu-max-threads-per-block=n or its default value for HIP.
+const unsigned OpenCLMaxWorkGroupSize = 256;
+const unsigned MaxWorkGroupSize =
+IsOpenCLKernel ? OpenCLMaxWorkGroupSize
+   : M.getLangOpts().GPUMaxThreadsPerBlock;
+std::string AttrVal = std::string("1,") + llvm::utostr(MaxWorkGroupSize);
 F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
   }
 
Index: clang/include/clang/Basic/LangOptions.def
===
--- clang/include/clang/Basic/LangOptions.def
+++ clang/include/clang/Basic/LangOptions.def
@@ -231,7 +231,7 @@
 LANGOPT(CUDADeviceApproxTranscendentals, 1, 0, "using approximate 
transcendental functions")
 LANGOPT(GPURelocatableDeviceCode, 1, 0, "generate relocatable device code")
 LANGOPT(GPUAllowDeviceInit, 1, 0, "allowing device side global init functions 
for HIP")
-LANGOPT(GPUMaxThreadsPerBlock, 32, 256, "default max threads per block for 
kernel launch bounds for HIP")
+LANGOPT(GPUMaxThreadsPerBlock, 32, 1024, "default max threads per block for 
kernel launch bounds for HIP")
 
 LANGOPT(SYCL  , 1, 0, "SYCL")
 LANGOPT(SYCLIsDevice  , 1, 0, "Generate code for SYCL device")


Index: clang/test/CodeGenCUDA/kernel-amdgcn.cu
===
--- clang/test/CodeGenCUDA/kernel-amdgcn.cu
+++ clang/test/CodeGenCUDA/kernel-amdgcn.cu
@@ -39,4 +39,4 @@
   launch((void*)D.Empty());
   return 0;
 }
-// CHECK: attributes #[[ATTR]] = {{.*}}"amdgpu-flat-work-group-size"="1,256"
+// CHECK: attributes #[[ATTR]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"
Index: clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
===
--- clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
+++ clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu
@@ -39,7 +39,7 @@
 // NAMD-NOT: "amdgpu-num-vgpr"
 // NAMD-NOT: "amdgpu-num-sgpr"
 
-// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,256"{{.*}}"uniform-work-group-size"="true"
+// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"{{.*}}"uniform-work-group-size"="true"
 // MAX1024-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"
 // CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = {{.*}}"amdgpu-flat-work-group-size"="32,64"
 // CHECK-DAG: attributes [[WAVES_PER_EU_2]] = {{.*}}"amdgpu-waves-per-eu"="2"
Index: clang/lib/CodeGen/TargetInfo.cpp
===
--- clang/lib/CodeGen/TargetInfo.cpp
+++ clang/lib/CodeGen/TargetInfo.cpp
@@ -8119,9 +8119,12 @@
   assert(Max == 0