[PATCH] D76795: [HIP] Change default --gpu-max-threads-per-block value to 1024
This revision was automatically updated to reflect the committed changes. yaxunl marked an inline comment as done. Closed by commit rG04abbb3a7818: [HIP] Change default --gpu-max-threads-per-block value to 1024 (authored by yaxunl). Herald added a project: clang. Repository: rG LLVM Github Monorepo CHANGES SINCE LAST ACTION https://reviews.llvm.org/D76795/new/ https://reviews.llvm.org/D76795 Files: clang/include/clang/Basic/LangOptions.def clang/lib/CodeGen/TargetInfo.cpp clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu clang/test/CodeGenCUDA/kernel-amdgcn.cu Index: clang/test/CodeGenCUDA/kernel-amdgcn.cu === --- clang/test/CodeGenCUDA/kernel-amdgcn.cu +++ clang/test/CodeGenCUDA/kernel-amdgcn.cu @@ -39,4 +39,4 @@ launch((void*)D.Empty()); return 0; } -// CHECK: attributes #[[ATTR]] = {{.*}}"amdgpu-flat-work-group-size"="1,256" +// CHECK: attributes #[[ATTR]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024" Index: clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu === --- clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu +++ clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu @@ -39,7 +39,7 @@ // NAMD-NOT: "amdgpu-num-vgpr" // NAMD-NOT: "amdgpu-num-sgpr" -// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,256"{{.*}}"uniform-work-group-size"="true" +// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"{{.*}}"uniform-work-group-size"="true" // MAX1024-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024" // CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = {{.*}}"amdgpu-flat-work-group-size"="32,64" // CHECK-DAG: attributes [[WAVES_PER_EU_2]] = {{.*}}"amdgpu-waves-per-eu"="2" Index: clang/lib/CodeGen/TargetInfo.cpp === --- clang/lib/CodeGen/TargetInfo.cpp +++ clang/lib/CodeGen/TargetInfo.cpp @@ -8815,9 +8815,13 @@ assert(Max == 0 && "Max must be zero"); } else if (IsOpenCLKernel || IsHIPKernel) { // By default, restrict the maximum size to 
a value specified by -// --gpu-max-threads-per-block=n or its default value. +// --gpu-max-threads-per-block=n or its default value for HIP. +const unsigned OpenCLDefaultMaxWorkGroupSize = 256; +const unsigned DefaultMaxWorkGroupSize = +IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize + : M.getLangOpts().GPUMaxThreadsPerBlock; std::string AttrVal = -std::string("1,") + llvm::utostr(M.getLangOpts().GPUMaxThreadsPerBlock); +std::string("1,") + llvm::utostr(DefaultMaxWorkGroupSize); F->addFnAttr("amdgpu-flat-work-group-size", AttrVal); } Index: clang/include/clang/Basic/LangOptions.def === --- clang/include/clang/Basic/LangOptions.def +++ clang/include/clang/Basic/LangOptions.def @@ -238,7 +238,7 @@ LANGOPT(CUDADeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions") LANGOPT(GPURelocatableDeviceCode, 1, 0, "generate relocatable device code") LANGOPT(GPUAllowDeviceInit, 1, 0, "allowing device side global init functions for HIP") -LANGOPT(GPUMaxThreadsPerBlock, 32, 256, "default max threads per block for kernel launch bounds for HIP") +LANGOPT(GPUMaxThreadsPerBlock, 32, 1024, "default max threads per block for kernel launch bounds for HIP") LANGOPT(SYCL , 1, 0, "SYCL") LANGOPT(SYCLIsDevice , 1, 0, "Generate code for SYCL device") Index: clang/test/CodeGenCUDA/kernel-amdgcn.cu === --- clang/test/CodeGenCUDA/kernel-amdgcn.cu +++ clang/test/CodeGenCUDA/kernel-amdgcn.cu @@ -39,4 +39,4 @@ launch((void*)D.Empty()); return 0; } -// CHECK: attributes #[[ATTR]] = {{.*}}"amdgpu-flat-work-group-size"="1,256" +// CHECK: attributes #[[ATTR]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024" Index: clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu === --- clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu +++ clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu @@ -39,7 +39,7 @@ // NAMD-NOT: "amdgpu-num-vgpr" // NAMD-NOT: "amdgpu-num-sgpr" -// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = 
{{.*}}"amdgpu-flat-work-group-size"="1,256"{{.*}}"uniform-work-group-size"="true" +// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"{{.*}}"uniform-work-group-size"="true" // MAX1024-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024" // CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = {{.*}}"amdgpu-flat-work-group-size"="32,64" // CHECK-DAG: attributes [[WAVES_PER_EU_2]] = {{.*}}"amdgpu-waves-per-eu"="2" Index:
[PATCH] D76795: [HIP] Change default --gpu-max-threads-per-block value to 1024
b-sumner added a comment. Thanks. This looks fine to me. CHANGES SINCE LAST ACTION https://reviews.llvm.org/D76795/new/ https://reviews.llvm.org/D76795 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D76795: [HIP] Change default --gpu-max-threads-per-block value to 1024
yaxunl updated this revision to Diff 252661. yaxunl added a comment. change variable names CHANGES SINCE LAST ACTION https://reviews.llvm.org/D76795/new/ https://reviews.llvm.org/D76795 Files: clang/include/clang/Basic/LangOptions.def clang/lib/CodeGen/TargetInfo.cpp clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu clang/test/CodeGenCUDA/kernel-amdgcn.cu Index: clang/test/CodeGenCUDA/kernel-amdgcn.cu === --- clang/test/CodeGenCUDA/kernel-amdgcn.cu +++ clang/test/CodeGenCUDA/kernel-amdgcn.cu @@ -39,4 +39,4 @@ launch((void*)D.Empty()); return 0; } -// CHECK: attributes #[[ATTR]] = {{.*}}"amdgpu-flat-work-group-size"="1,256" +// CHECK: attributes #[[ATTR]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024" Index: clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu === --- clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu +++ clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu @@ -39,7 +39,7 @@ // NAMD-NOT: "amdgpu-num-vgpr" // NAMD-NOT: "amdgpu-num-sgpr" -// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,256"{{.*}}"uniform-work-group-size"="true" +// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"{{.*}}"uniform-work-group-size"="true" // MAX1024-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024" // CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = {{.*}}"amdgpu-flat-work-group-size"="32,64" // CHECK-DAG: attributes [[WAVES_PER_EU_2]] = {{.*}}"amdgpu-waves-per-eu"="2" Index: clang/lib/CodeGen/TargetInfo.cpp === --- clang/lib/CodeGen/TargetInfo.cpp +++ clang/lib/CodeGen/TargetInfo.cpp @@ -8119,9 +8119,13 @@ assert(Max == 0 && "Max must be zero"); } else if (IsOpenCLKernel || IsHIPKernel) { // By default, restrict the maximum size to a value specified by -// --gpu-max-threads-per-block=n or its default value. +// --gpu-max-threads-per-block=n or its default value for HIP. 
+const unsigned OpenCLDefaultMaxWorkGroupSize = 256; +const unsigned DefaultMaxWorkGroupSize = +IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize + : M.getLangOpts().GPUMaxThreadsPerBlock; std::string AttrVal = -std::string("1,") + llvm::utostr(M.getLangOpts().GPUMaxThreadsPerBlock); +std::string("1,") + llvm::utostr(DefaultMaxWorkGroupSize); F->addFnAttr("amdgpu-flat-work-group-size", AttrVal); } Index: clang/include/clang/Basic/LangOptions.def === --- clang/include/clang/Basic/LangOptions.def +++ clang/include/clang/Basic/LangOptions.def @@ -231,7 +231,7 @@ LANGOPT(CUDADeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions") LANGOPT(GPURelocatableDeviceCode, 1, 0, "generate relocatable device code") LANGOPT(GPUAllowDeviceInit, 1, 0, "allowing device side global init functions for HIP") -LANGOPT(GPUMaxThreadsPerBlock, 32, 256, "default max threads per block for kernel launch bounds for HIP") +LANGOPT(GPUMaxThreadsPerBlock, 32, 1024, "default max threads per block for kernel launch bounds for HIP") LANGOPT(SYCL , 1, 0, "SYCL") LANGOPT(SYCLIsDevice , 1, 0, "Generate code for SYCL device") Index: clang/test/CodeGenCUDA/kernel-amdgcn.cu === --- clang/test/CodeGenCUDA/kernel-amdgcn.cu +++ clang/test/CodeGenCUDA/kernel-amdgcn.cu @@ -39,4 +39,4 @@ launch((void*)D.Empty()); return 0; } -// CHECK: attributes #[[ATTR]] = {{.*}}"amdgpu-flat-work-group-size"="1,256" +// CHECK: attributes #[[ATTR]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024" Index: clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu === --- clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu +++ clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu @@ -39,7 +39,7 @@ // NAMD-NOT: "amdgpu-num-vgpr" // NAMD-NOT: "amdgpu-num-sgpr" -// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,256"{{.*}}"uniform-work-group-size"="true" +// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = 
{{.*}}"amdgpu-flat-work-group-size"="1,1024"{{.*}}"uniform-work-group-size"="true" // MAX1024-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024" // CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = {{.*}}"amdgpu-flat-work-group-size"="32,64" // CHECK-DAG: attributes [[WAVES_PER_EU_2]] = {{.*}}"amdgpu-waves-per-eu"="2" Index: clang/lib/CodeGen/TargetInfo.cpp === --- clang/lib/CodeGen/TargetInfo.cpp +++ clang/lib/CodeGen/TargetInfo.cpp @@ -8119,9 +8119,13 @@ assert
[PATCH] D76795: [HIP] Change default --gpu-max-threads-per-block value to 1024
yaxunl marked 2 inline comments as done. yaxunl added inline comments. Comment at: clang/lib/CodeGen/TargetInfo.cpp:8123 +// --gpu-max-threads-per-block=n or its default value for HIP. +const unsigned OpenCLMaxWorkGroupSize = 256; +const unsigned MaxWorkGroupSize = b-sumner wrote: "I'd like to see the word default, e.g. OpenCLDefaultMaxWorkGroupSize, used more since that is what this is about. Ideally the option would have been named gpu-default-max-threads-per-block, but I suppose I can see why it was shortened." Reply: Changed variable names. CHANGES SINCE LAST ACTION https://reviews.llvm.org/D76795/new/ https://reviews.llvm.org/D76795 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D76795: [HIP] Change default --gpu-max-threads-per-block value to 1024
b-sumner added inline comments. Comment at: clang/lib/CodeGen/TargetInfo.cpp:8123 +// --gpu-max-threads-per-block=n or its default value for HIP. +const unsigned OpenCLMaxWorkGroupSize = 256; +const unsigned MaxWorkGroupSize = I'd like to see the word default, e.g. OpenCLDefaultMaxWorkGroupSize, used more since that is what this is about. Ideally the option would have been named gpu-default-max-threads-per-block, but I suppose I can see why it was shortened. CHANGES SINCE LAST ACTION https://reviews.llvm.org/D76795/new/ https://reviews.llvm.org/D76795 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D76795: [HIP] Change default --gpu-max-threads-per-block value to 1024
yaxunl created this revision. yaxunl added reviewers: b-sumner, tra. Herald added subscribers: kerbowa, nhaehnle, jvesely. This better matches CUDA behavior. https://reviews.llvm.org/D76795 Files: clang/include/clang/Basic/LangOptions.def clang/lib/CodeGen/TargetInfo.cpp clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu clang/test/CodeGenCUDA/kernel-amdgcn.cu Index: clang/test/CodeGenCUDA/kernel-amdgcn.cu === --- clang/test/CodeGenCUDA/kernel-amdgcn.cu +++ clang/test/CodeGenCUDA/kernel-amdgcn.cu @@ -39,4 +39,4 @@ launch((void*)D.Empty()); return 0; } -// CHECK: attributes #[[ATTR]] = {{.*}}"amdgpu-flat-work-group-size"="1,256" +// CHECK: attributes #[[ATTR]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024" Index: clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu === --- clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu +++ clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu @@ -39,7 +39,7 @@ // NAMD-NOT: "amdgpu-num-vgpr" // NAMD-NOT: "amdgpu-num-sgpr" -// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,256"{{.*}}"uniform-work-group-size"="true" +// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"{{.*}}"uniform-work-group-size"="true" // MAX1024-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024" // CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = {{.*}}"amdgpu-flat-work-group-size"="32,64" // CHECK-DAG: attributes [[WAVES_PER_EU_2]] = {{.*}}"amdgpu-waves-per-eu"="2" Index: clang/lib/CodeGen/TargetInfo.cpp === --- clang/lib/CodeGen/TargetInfo.cpp +++ clang/lib/CodeGen/TargetInfo.cpp @@ -8119,9 +8119,12 @@ assert(Max == 0 && "Max must be zero"); } else if (IsOpenCLKernel || IsHIPKernel) { // By default, restrict the maximum size to a value specified by -// --gpu-max-threads-per-block=n or its default value. 
-std::string AttrVal = -std::string("1,") + llvm::utostr(M.getLangOpts().GPUMaxThreadsPerBlock); +// --gpu-max-threads-per-block=n or its default value for HIP. +const unsigned OpenCLMaxWorkGroupSize = 256; +const unsigned MaxWorkGroupSize = +IsOpenCLKernel ? OpenCLMaxWorkGroupSize + : M.getLangOpts().GPUMaxThreadsPerBlock; +std::string AttrVal = std::string("1,") + llvm::utostr(MaxWorkGroupSize); F->addFnAttr("amdgpu-flat-work-group-size", AttrVal); } Index: clang/include/clang/Basic/LangOptions.def === --- clang/include/clang/Basic/LangOptions.def +++ clang/include/clang/Basic/LangOptions.def @@ -231,7 +231,7 @@ LANGOPT(CUDADeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions") LANGOPT(GPURelocatableDeviceCode, 1, 0, "generate relocatable device code") LANGOPT(GPUAllowDeviceInit, 1, 0, "allowing device side global init functions for HIP") -LANGOPT(GPUMaxThreadsPerBlock, 32, 256, "default max threads per block for kernel launch bounds for HIP") +LANGOPT(GPUMaxThreadsPerBlock, 32, 1024, "default max threads per block for kernel launch bounds for HIP") LANGOPT(SYCL , 1, 0, "SYCL") LANGOPT(SYCLIsDevice , 1, 0, "Generate code for SYCL device") Index: clang/test/CodeGenCUDA/kernel-amdgcn.cu === --- clang/test/CodeGenCUDA/kernel-amdgcn.cu +++ clang/test/CodeGenCUDA/kernel-amdgcn.cu @@ -39,4 +39,4 @@ launch((void*)D.Empty()); return 0; } -// CHECK: attributes #[[ATTR]] = {{.*}}"amdgpu-flat-work-group-size"="1,256" +// CHECK: attributes #[[ATTR]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024" Index: clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu === --- clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu +++ clang/test/CodeGenCUDA/amdgpu-kernel-attrs.cu @@ -39,7 +39,7 @@ // NAMD-NOT: "amdgpu-num-vgpr" // NAMD-NOT: "amdgpu-num-sgpr" -// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,256"{{.*}}"uniform-work-group-size"="true" +// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = 
{{.*}}"amdgpu-flat-work-group-size"="1,1024"{{.*}}"uniform-work-group-size"="true" // MAX1024-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024" // CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = {{.*}}"amdgpu-flat-work-group-size"="32,64" // CHECK-DAG: attributes [[WAVES_PER_EU_2]] = {{.*}}"amdgpu-waves-per-eu"="2" Index: clang/lib/CodeGen/TargetInfo.cpp === --- clang/lib/CodeGen/TargetInfo.cpp +++ clang/lib/CodeGen/TargetInfo.cpp @@ -8119,9 +8119,12 @@ assert(Max == 0