yaxunl created this revision. yaxunl added reviewers: tra, b-sumner, MaskRay, arsenm, scchan. Herald added subscribers: jdoerfert, kerbowa, jvesely. Herald added a project: All. yaxunl requested review of this revision. Herald added a subscriber: wdng.
By default, clang assumes HIP kernels are launched with uniform block size, which is the case for kernels launched through triple chevron or hipLaunchKernelGGL. Clang adds uniform-work-group-size function attribute to HIP kernels to allow the backend to do optimizations on that. However, in some rare cases, HIP kernels can be launched through hipExtModuleLaunchKernel where global work size is specified, which may result in non-uniform block size. To be able to support non-uniform block size for HIP kernels, an option `-f[no-]hip-uniform-block` is added. By default it is on. https://reviews.llvm.org/D155213 Files: clang/include/clang/Basic/LangOptions.def clang/include/clang/Driver/Options.td clang/lib/CodeGen/CGCall.cpp clang/lib/CodeGen/Targets/AMDGPU.cpp clang/lib/Driver/ToolChains/Clang.cpp clang/test/CodeGenHIP/default-attributes.hip clang/test/Driver/hip-options.hip
Index: clang/test/Driver/hip-options.hip =================================================================== --- clang/test/Driver/hip-options.hip +++ clang/test/Driver/hip-options.hip @@ -169,3 +169,28 @@ // RUN: %clang -### -nogpuinc -nogpulib -fhip-fp32-correctly-rounded-divide-sqrt \ // RUN: --cuda-gpu-arch=gfx906 %s 2>&1 | FileCheck -check-prefixes=CRDS %s // CRDS-NOT: "-f{{(no-)?}}hip-fp32-correctly-rounded-divide-sqrt" + +// Check -fno-hip-uniform-block is passed to clang -cc1 but +// (default) -fhip-uniform-block is not. + +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -nogpuinc -nogpulib -fno-hip-uniform-block \ +// RUN: --cuda-gpu-arch=gfx906 %s 2>&1 | FileCheck -check-prefix=NOUNIBLK %s + +// NOUNIBLK: "-cc1"{{.*}} "-triple" "amdgcn-amd-amdhsa" {{.*}} "-fno-hip-uniform-block" +// NOUNIBLK: "-cc1"{{.*}} "-triple" "x86_64-unknown-linux-gnu" {{.*}} "-fno-hip-uniform-block" + +// RUN: %clang -### -nogpuinc -nogpulib -fhip-uniform-block \ +// RUN: --cuda-gpu-arch=gfx906 %s 2>&1 | FileCheck -check-prefix=UNIBLK %s + +// RUN: %clang -### -nogpuinc -nogpulib \ +// RUN: --cuda-gpu-arch=gfx906 %s 2>&1 | FileCheck -check-prefix=UNIBLK %s + +// UNIBLK-NOT: "-f{{(no-)?}}hip-uniform-block" + +// Check no warnings for -f[no-]hip-uniform-block. 
+ +// RUN: %clang -fdriver-only -Werror --target=x86_64-unknown-linux-gnu -nogpuinc -nogpulib -fno-hip-uniform-block \ +// RUN: -fhip-uniform-block --cuda-gpu-arch=gfx906 %s 2>&1 | count 0 + +// RUN: %clang -fdriver-only -Werror --target=x86_64-unknown-linux-gnu -nostdinc -nostdlib -fno-hip-uniform-block \ +// RUN: -fhip-uniform-block -x c++ %s 2>&1 | count 0 Index: clang/test/CodeGenHIP/default-attributes.hip =================================================================== --- clang/test/CodeGenHIP/default-attributes.hip +++ clang/test/CodeGenHIP/default-attributes.hip @@ -5,6 +5,9 @@ // RUN: %clang_cc1 -O3 -triple amdgcn-amd-amdhsa -x hip -fno-ident -fcuda-is-device \ // RUN: -emit-llvm -o - %s | FileCheck -check-prefix=OPT %s +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -x hip -fno-ident -fcuda-is-device -fno-hip-uniform-block \ +// RUN: -emit-llvm -o - %s | FileCheck -check-prefix=NOUNIBLK %s + #define __device__ __attribute__((device)) #define __global__ __attribute__((global)) @@ -20,6 +23,12 @@ // OPT-NEXT: entry: // OPT-NEXT: ret void // +// NOUNIBLK: Function Attrs: convergent mustprogress noinline nounwind optnone +// NOUNIBLK-LABEL: define {{[^@]+}}@_Z4funcv +// NOUNIBLK-SAME: () #[[ATTR0:[0-9]+]] { +// NOUNIBLK-NEXT: entry: +// NOUNIBLK-NEXT: ret void +// __device__ void func() { } @@ -36,21 +45,34 @@ // OPT-NEXT: entry: // OPT-NEXT: ret void // +// NOUNIBLK: Function Attrs: convergent mustprogress noinline norecurse nounwind optnone +// NOUNIBLK-LABEL: define {{[^@]+}}@_Z6kernelv +// NOUNIBLK-SAME: () #[[ATTR1:[0-9]+]] { +// NOUNIBLK-NEXT: entry: +// NOUNIBLK-NEXT: ret void +// __global__ void kernel() { } //. 
-// OPTNONE: attributes #0 = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } -// OPTNONE: attributes #1 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } +// OPTNONE: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// OPTNONE: attributes #[[ATTR1]] = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } +//. +// OPT: attributes #[[ATTR0]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// OPT: attributes #[[ATTR1]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } +//. +// NOUNIBLK: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// NOUNIBLK: attributes #[[ATTR1]] = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } //. 
-// OPT: attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } -// OPT: attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } +// OPTNONE: [[META0:![0-9]+]] = !{i32 1, !"amdgpu_code_object_version", i32 400} +// OPTNONE: [[META1:![0-9]+]] = !{i32 1, !"amdgpu_printf_kind", !"hostcall"} +// OPTNONE: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} //. -// OPTNONE: !0 = !{i32 1, !"amdgpu_code_object_version", i32 400} -// OPTNONE: !1 = !{i32 1, !"amdgpu_printf_kind", !"hostcall"} -// OPTNONE: !2 = !{i32 1, !"wchar_size", i32 4} +// OPT: [[META0:![0-9]+]] = !{i32 1, !"amdgpu_code_object_version", i32 400} +// OPT: [[META1:![0-9]+]] = !{i32 1, !"amdgpu_printf_kind", !"hostcall"} +// OPT: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} //. -// OPT: !0 = !{i32 1, !"amdgpu_code_object_version", i32 400} -// OPT: !1 = !{i32 1, !"amdgpu_printf_kind", !"hostcall"} -// OPT: !2 = !{i32 1, !"wchar_size", i32 4} +// NOUNIBLK: [[META0:![0-9]+]] = !{i32 1, !"amdgpu_code_object_version", i32 400} +// NOUNIBLK: [[META1:![0-9]+]] = !{i32 1, !"amdgpu_printf_kind", !"hostcall"} +// NOUNIBLK: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} //. 
Index: clang/lib/Driver/ToolChains/Clang.cpp =================================================================== --- clang/lib/Driver/ToolChains/Clang.cpp +++ clang/lib/Driver/ToolChains/Clang.cpp @@ -7220,6 +7220,11 @@ if (IsHIP) { CmdArgs.push_back("-fcuda-allow-variadic-functions"); Args.AddLastArg(CmdArgs, options::OPT_fgpu_default_stream_EQ); + Args.addOptOutFlag(CmdArgs, options::OPT_fhip_uniform_block, + options::OPT_fno_hip_uniform_block); + } else { + Args.claimAllArgs(options::OPT_fhip_uniform_block, + options::OPT_fno_hip_uniform_block); } if (IsCudaDevice || IsHIPDevice) { Index: clang/lib/CodeGen/Targets/AMDGPU.cpp =================================================================== --- clang/lib/CodeGen/Targets/AMDGPU.cpp +++ clang/lib/CodeGen/Targets/AMDGPU.cpp @@ -401,13 +401,6 @@ if (FD) setFunctionDeclAttributes(FD, F, M); - const bool IsHIPKernel = - M.getLangOpts().HIP && FD && FD->hasAttr<CUDAGlobalAttr>(); - - // TODO: This should be moved to language specific attributes instead. 
- if (IsHIPKernel) - F->addFnAttr("uniform-work-group-size", "true"); - if (M.getContext().getTargetInfo().allowAMDGPUUnsafeFPAtomics()) F->addFnAttr("amdgpu-unsafe-fp-atomics", "true"); Index: clang/lib/CodeGen/CGCall.cpp =================================================================== --- clang/lib/CodeGen/CGCall.cpp +++ clang/lib/CodeGen/CGCall.cpp @@ -2402,6 +2402,9 @@ llvm::toStringRef(CodeGenOpts.UniformWGSize)); } } + + if (TargetDecl->hasAttr<CUDAGlobalAttr>() && getLangOpts().HIPUniformBlock) + FuncAttrs.addAttribute("uniform-work-group-size", "true"); } // Attach "no-builtins" attributes to: Index: clang/include/clang/Driver/Options.td =================================================================== --- clang/include/clang/Driver/Options.td +++ clang/include/clang/Driver/Options.td @@ -1089,6 +1089,10 @@ NegFlag<SetFalse, [], "Don't specify">, BothFlags<[], " that kernel argument names are preserved (HIP only)">>, ShouldParseIf<hip.KeyPath>; +defm hip_uniform_block : BoolFOption<"hip-uniform-block", + LangOpts<"HIPUniformBlock">, DefaultTrue, + PosFlag<SetTrue, [], "Assume">, NegFlag<SetFalse, [CC1Option], "Don't assume">, + BothFlags<[], " that kernels are launched with uniform block sizes">>; def hipspv_pass_plugin_EQ : Joined<["--"], "hipspv-pass-plugin=">, Group<Link_Group>, MetaVarName<"<dsopath>">, HelpText<"path to a pass plugin for HIP to SPIR-V passes.">; Index: clang/include/clang/Basic/LangOptions.def =================================================================== --- clang/include/clang/Basic/LangOptions.def +++ clang/include/clang/Basic/LangOptions.def @@ -278,6 +278,7 @@ ENUM_LANGOPT(SYCLVersion , SYCLMajorVersion, 2, SYCL_None, "Version of the SYCL standard used") LANGOPT(HIPUseNewLaunchAPI, 1, 0, "Use new kernel launching API for HIP") +LANGOPT(HIPUniformBlock, 1, 0, "Assume that HIP kernels are launched with uniform block sizes") LANGOPT(SizedDeallocation , 1, 0, "sized deallocation") LANGOPT(AlignedAllocation , 1, 0, "aligned 
allocation")
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits