yaxunl created this revision.
yaxunl added reviewers: tra, b-sumner, MaskRay, arsenm, scchan.
Herald added subscribers: jdoerfert, kerbowa, jvesely.
Herald added a project: All.
yaxunl requested review of this revision.
Herald added a subscriber: wdng.

By default, Clang assumes HIP kernels are launched with a uniform block size,
which is the case for kernels launched through the triple-chevron syntax
(`<<<...>>>`) or hipLaunchKernelGGL. Clang adds the uniform-work-group-size
function attribute to HIP kernels so that the backend can optimize based on
this assumption.
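For illustration, both launch forms below request a fixed block size of 256,
so every block (work-group) in the grid is uniform (a minimal sketch; the
kernel and launch configuration are hypothetical):

  #include <hip/hip_runtime.h>

  __global__ void vecAdd(float *a, const float *b, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
      a[i] += b[i];
  }

  void launch(float *a, const float *b, int n) {
    int blocks = (n + 255) / 256;
    // Every block has exactly 256 work-items, so the block size is
    // uniform across the grid for either launch form:
    vecAdd<<<dim3(blocks), dim3(256)>>>(a, b, n);
    hipLaunchKernelGGL(vecAdd, dim3(blocks), dim3(256), 0, 0, a, b, n);
  }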

However, in some rare cases, HIP kernels may be launched through
hipExtModuleLaunchKernel, where a global work size is specified, which can
result in a non-uniform block size.
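Unlike the launches above, hipExtModuleLaunchKernel takes a global work size
in work-items rather than a grid size in blocks, so a global size that is not
a multiple of the local size leaves a partial trailing block. A sketch
(the sizes and the kernelParams setup are hypothetical):

  #include <hip/hip_ext.h>

  void launchNonUniform(hipFunction_t f, void **args) {
    // ceil(1000/256) = 4 blocks; the last block covers only
    // 1000 - 3*256 = 232 work-items, so block sizes differ.
    hipExtModuleLaunchKernel(f,
                             /*globalWorkSizeX=*/1000, 1, 1,
                             /*localWorkSizeX=*/256, 1, 1,
                             /*sharedMemBytes=*/0, /*hStream=*/nullptr,
                             /*kernelParams=*/args, /*extra=*/nullptr);
  }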

To support non-uniform block sizes for HIP kernels, an option
`-f[no-]hip-uniform-block` is added. By default it is on.
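For example, a program whose kernels may be launched with non-uniform block
sizes would be compiled as (illustrative command line; the file name is
hypothetical):

  clang --cuda-gpu-arch=gfx906 -fno-hip-uniform-block foo.hip

With -fno-hip-uniform-block, kernels are emitted without the
"uniform-work-group-size"="true" attribute, so the backend no longer assumes
uniform block sizes.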


https://reviews.llvm.org/D155213

Files:
  clang/include/clang/Basic/LangOptions.def
  clang/include/clang/Driver/Options.td
  clang/lib/CodeGen/CGCall.cpp
  clang/lib/CodeGen/Targets/AMDGPU.cpp
  clang/lib/Driver/ToolChains/Clang.cpp
  clang/test/CodeGenHIP/default-attributes.hip
  clang/test/Driver/hip-options.hip

Index: clang/test/Driver/hip-options.hip
===================================================================
--- clang/test/Driver/hip-options.hip
+++ clang/test/Driver/hip-options.hip
@@ -169,3 +169,28 @@
 // RUN: %clang -### -nogpuinc -nogpulib -fhip-fp32-correctly-rounded-divide-sqrt \
 // RUN:   --cuda-gpu-arch=gfx906  %s 2>&1 | FileCheck -check-prefixes=CRDS %s
 // CRDS-NOT: "-f{{(no-)?}}hip-fp32-correctly-rounded-divide-sqrt"
+
+// Check -fno-hip-uniform-block is passed to clang -cc1 but
+// (default) -fhip-uniform-block is not.
+
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -nogpuinc -nogpulib -fno-hip-uniform-block \
+// RUN:   --cuda-gpu-arch=gfx906 %s 2>&1 | FileCheck -check-prefix=NOUNIBLK %s
+
+// NOUNIBLK: "-cc1"{{.*}} "-triple" "amdgcn-amd-amdhsa" {{.*}} "-fno-hip-uniform-block"
+// NOUNIBLK: "-cc1"{{.*}} "-triple" "x86_64-unknown-linux-gnu" {{.*}} "-fno-hip-uniform-block"
+
+// RUN: %clang -### -nogpuinc -nogpulib -fhip-uniform-block \
+// RUN:   --cuda-gpu-arch=gfx906 %s 2>&1 | FileCheck -check-prefix=UNIBLK %s
+
+// RUN: %clang -### -nogpuinc -nogpulib \
+// RUN:   --cuda-gpu-arch=gfx906 %s 2>&1 | FileCheck -check-prefix=UNIBLK %s
+
+// UNIBLK-NOT: "-f{{(no-)?}}hip-uniform-block"
+
+// Check no warnings for -f[no-]hip-uniform-block.
+
+// RUN: %clang -fdriver-only -Werror --target=x86_64-unknown-linux-gnu -nogpuinc -nogpulib -fno-hip-uniform-block \
+// RUN:   -fhip-uniform-block --cuda-gpu-arch=gfx906 %s 2>&1 | count 0
+
+// RUN: %clang -fdriver-only -Werror --target=x86_64-unknown-linux-gnu -nostdinc -nostdlib -fno-hip-uniform-block \
+// RUN:   -fhip-uniform-block -x c++ %s 2>&1 | count 0
Index: clang/test/CodeGenHIP/default-attributes.hip
===================================================================
--- clang/test/CodeGenHIP/default-attributes.hip
+++ clang/test/CodeGenHIP/default-attributes.hip
@@ -5,6 +5,9 @@
 // RUN: %clang_cc1 -O3 -triple amdgcn-amd-amdhsa -x hip -fno-ident -fcuda-is-device \
 // RUN:    -emit-llvm -o - %s | FileCheck -check-prefix=OPT %s
 
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -x hip -fno-ident -fcuda-is-device -fno-hip-uniform-block \
+// RUN:    -emit-llvm -o - %s | FileCheck -check-prefix=NOUNIBLK %s
+
 #define __device__ __attribute__((device))
 #define __global__ __attribute__((global))
 
@@ -20,6 +23,12 @@
 // OPT-NEXT:  entry:
 // OPT-NEXT:    ret void
 //
+// NOUNIBLK: Function Attrs: convergent mustprogress noinline nounwind optnone
+// NOUNIBLK-LABEL: define {{[^@]+}}@_Z4funcv
+// NOUNIBLK-SAME: () #[[ATTR0:[0-9]+]] {
+// NOUNIBLK-NEXT:  entry:
+// NOUNIBLK-NEXT:    ret void
+//
 __device__ void func() {
 
 }
@@ -36,21 +45,34 @@
 // OPT-NEXT:  entry:
 // OPT-NEXT:    ret void
 //
+// NOUNIBLK: Function Attrs: convergent mustprogress noinline norecurse nounwind optnone
+// NOUNIBLK-LABEL: define {{[^@]+}}@_Z6kernelv
+// NOUNIBLK-SAME: () #[[ATTR1:[0-9]+]] {
+// NOUNIBLK-NEXT:  entry:
+// NOUNIBLK-NEXT:    ret void
+//
 __global__ void kernel() {
 
 }
 //.
-// OPTNONE: attributes #0 = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-// OPTNONE: attributes #1 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
+// OPTNONE: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// OPTNONE: attributes #[[ATTR1]] = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
+//.
+// OPT: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// OPT: attributes #[[ATTR1]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
+//.
+// NOUNIBLK: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// NOUNIBLK: attributes #[[ATTR1]] = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
 //.
-// OPT: attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-// OPT: attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
+// OPTNONE: [[META0:![0-9]+]] = !{i32 1, !"amdgpu_code_object_version", i32 400}
+// OPTNONE: [[META1:![0-9]+]] = !{i32 1, !"amdgpu_printf_kind", !"hostcall"}
+// OPTNONE: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
 //.
-// OPTNONE: !0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
-// OPTNONE: !1 = !{i32 1, !"amdgpu_printf_kind", !"hostcall"}
-// OPTNONE: !2 = !{i32 1, !"wchar_size", i32 4}
+// OPT: [[META0:![0-9]+]] = !{i32 1, !"amdgpu_code_object_version", i32 400}
+// OPT: [[META1:![0-9]+]] = !{i32 1, !"amdgpu_printf_kind", !"hostcall"}
+// OPT: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
 //.
-// OPT: !0 = !{i32 1, !"amdgpu_code_object_version", i32 400}
-// OPT: !1 = !{i32 1, !"amdgpu_printf_kind", !"hostcall"}
-// OPT: !2 = !{i32 1, !"wchar_size", i32 4}
+// NOUNIBLK: [[META0:![0-9]+]] = !{i32 1, !"amdgpu_code_object_version", i32 400}
+// NOUNIBLK: [[META1:![0-9]+]] = !{i32 1, !"amdgpu_printf_kind", !"hostcall"}
+// NOUNIBLK: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
 //.
Index: clang/lib/Driver/ToolChains/Clang.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Clang.cpp
+++ clang/lib/Driver/ToolChains/Clang.cpp
@@ -7220,6 +7220,11 @@
   if (IsHIP) {
     CmdArgs.push_back("-fcuda-allow-variadic-functions");
     Args.AddLastArg(CmdArgs, options::OPT_fgpu_default_stream_EQ);
+    Args.addOptOutFlag(CmdArgs, options::OPT_fhip_uniform_block,
+                       options::OPT_fno_hip_uniform_block);
+  } else {
+    Args.claimAllArgs(options::OPT_fhip_uniform_block,
+                      options::OPT_fno_hip_uniform_block);
   }
 
   if (IsCudaDevice || IsHIPDevice) {
Index: clang/lib/CodeGen/Targets/AMDGPU.cpp
===================================================================
--- clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -401,13 +401,6 @@
   if (FD)
     setFunctionDeclAttributes(FD, F, M);
 
-  const bool IsHIPKernel =
-      M.getLangOpts().HIP && FD && FD->hasAttr<CUDAGlobalAttr>();
-
-  // TODO: This should be moved to language specific attributes instead.
-  if (IsHIPKernel)
-    F->addFnAttr("uniform-work-group-size", "true");
-
   if (M.getContext().getTargetInfo().allowAMDGPUUnsafeFPAtomics())
     F->addFnAttr("amdgpu-unsafe-fp-atomics", "true");
 
Index: clang/lib/CodeGen/CGCall.cpp
===================================================================
--- clang/lib/CodeGen/CGCall.cpp
+++ clang/lib/CodeGen/CGCall.cpp
@@ -2402,6 +2402,9 @@
                                llvm::toStringRef(CodeGenOpts.UniformWGSize));
       }
     }
+
+    if (TargetDecl->hasAttr<CUDAGlobalAttr>() && getLangOpts().HIPUniformBlock)
+      FuncAttrs.addAttribute("uniform-work-group-size", "true");
   }
 
   // Attach "no-builtins" attributes to:
Index: clang/include/clang/Driver/Options.td
===================================================================
--- clang/include/clang/Driver/Options.td
+++ clang/include/clang/Driver/Options.td
@@ -1089,6 +1089,10 @@
   NegFlag<SetFalse, [], "Don't specify">,
   BothFlags<[], " that kernel argument names are preserved (HIP only)">>,
   ShouldParseIf<hip.KeyPath>;
+defm hip_uniform_block : BoolFOption<"hip-uniform-block",
+  LangOpts<"HIPUniformBlock">, DefaultTrue,
+  PosFlag<SetTrue, [], "Assume">, NegFlag<SetFalse, [CC1Option], "Don't assume">,
+  BothFlags<[], " that kernels are launched with uniform block sizes">>;
 def hipspv_pass_plugin_EQ : Joined<["--"], "hipspv-pass-plugin=">,
   Group<Link_Group>, MetaVarName<"<dsopath>">,
   HelpText<"path to a pass plugin for HIP to SPIR-V passes.">;
Index: clang/include/clang/Basic/LangOptions.def
===================================================================
--- clang/include/clang/Basic/LangOptions.def
+++ clang/include/clang/Basic/LangOptions.def
@@ -278,6 +278,7 @@
 ENUM_LANGOPT(SYCLVersion  , SYCLMajorVersion, 2, SYCL_None, "Version of the SYCL standard used")
 
 LANGOPT(HIPUseNewLaunchAPI, 1, 0, "Use new kernel launching API for HIP")
+LANGOPT(HIPUniformBlock, 1, 0, "Assume that HIP kernels are launched with uniform block sizes")
 
 LANGOPT(SizedDeallocation , 1, 0, "sized deallocation")
 LANGOPT(AlignedAllocation , 1, 0, "aligned allocation")