jhuber6 updated this revision to Diff 424537.
jhuber6 added a comment.
Adding warning for using both -fno-gpu-rdc and -foffload-new-driver. I think
this is a good warning to have for now while this is being worked in as opt-in.
Once this has matured I plan on adding the necessary logic to handle RDC and
non-RDC builds correctly with this. But for the purposes of this patch just
warning is fine.
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D120272/new/
https://reviews.llvm.org/D120272
Files:
clang/include/clang/Basic/Cuda.h
clang/include/clang/Basic/DiagnosticDriverKinds.td
clang/lib/Driver/Driver.cpp
clang/lib/Driver/ToolChains/Clang.cpp
clang/lib/Driver/ToolChains/Cuda.cpp
clang/lib/Driver/ToolChains/HIPAMD.cpp
clang/test/Driver/cuda-openmp-driver.cu
clang/test/Driver/cuda-phases.cu
Index: clang/test/Driver/cuda-phases.cu
===================================================================
--- clang/test/Driver/cuda-phases.cu
+++ clang/test/Driver/cuda-phases.cu
@@ -217,3 +217,32 @@
// DASM2-DAG: [[P8:[0-9]+]]: backend, {[[P7]]}, assembler, (device-[[T]], [[ARCH2]])
// DASM2-DAG: [[P9:[0-9]+]]: offload, "device-[[T]] ([[TRIPLE]]:[[ARCH2]])" {[[P8]]}, assembler
// DASM2-NOT: host
+
+//
+// Test the phases generated when using the new offloading driver.
+//
+// RUN: %clang -### -target powerpc64le-ibm-linux-gnu -ccc-print-phases -foffload-new-driver \
+// RUN: --offload-arch=sm_52 --offload-arch=sm_70 %s 2>&1 | FileCheck --check-prefix=NEW_DRIVER %s
+// NEW_DRIVER: 0: input, "[[INPUT:.*]]", cuda, (host-cuda)
+// NEW_DRIVER: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
+// NEW_DRIVER: 2: compiler, {1}, ir, (host-cuda)
+// NEW_DRIVER: 3: input, "[[INPUT]]", cuda, (device-cuda, sm_52)
+// NEW_DRIVER: 4: preprocessor, {3}, cuda-cpp-output, (device-cuda, sm_52)
+// NEW_DRIVER: 5: compiler, {4}, ir, (device-cuda, sm_52)
+// NEW_DRIVER: 6: backend, {5}, assembler, (device-cuda, sm_52)
+// NEW_DRIVER: 7: assembler, {6}, object, (device-cuda, sm_52)
+// NEW_DRIVER: 8: offload, "device-cuda (nvptx64-nvidia-cuda:sm_52)" {7}, object
+// NEW_DRIVER: 9: offload, "device-cuda (nvptx64-nvidia-cuda:sm_52)" {6}, assembler
+// NEW_DRIVER: 10: linker, {8, 9}, cuda-fatbin, (device-cuda, sm_52)
+// NEW_DRIVER: 11: input, "[[INPUT]]", cuda, (device-cuda, sm_70)
+// NEW_DRIVER: 12: preprocessor, {11}, cuda-cpp-output, (device-cuda, sm_70)
+// NEW_DRIVER: 13: compiler, {12}, ir, (device-cuda, sm_70)
+// NEW_DRIVER: 14: backend, {13}, assembler, (device-cuda, sm_70)
+// NEW_DRIVER: 15: assembler, {14}, object, (device-cuda, sm_70)
+// NEW_DRIVER: 16: offload, "device-cuda (nvptx64-nvidia-cuda:sm_70)" {15}, object
+// NEW_DRIVER: 17: offload, "device-cuda (nvptx64-nvidia-cuda:sm_70)" {14}, assembler
+// NEW_DRIVER: 18: linker, {16, 17}, cuda-fatbin, (device-cuda, sm_70)
+// NEW_DRIVER: 19: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, "device-cuda (nvptx64-nvidia-cuda:sm_52)" {10}, "device-cuda (nvptx64-nvidia-cuda:sm_70)" {18}, ir
+// NEW_DRIVER: 20: backend, {19}, assembler, (host-cuda)
+// NEW_DRIVER: 21: assembler, {20}, object, (host-cuda)
+// NEW_DRIVER: 22: clang-linker-wrapper, {21}, image, (host-cuda)
Index: clang/test/Driver/cuda-openmp-driver.cu
===================================================================
--- /dev/null
+++ clang/test/Driver/cuda-openmp-driver.cu
@@ -0,0 +1,19 @@
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+
+// RUN: %clang -### -target x86_64-linux-gnu -nocudalib -ccc-print-bindings -fgpu-rdc \
+// RUN: -foffload-new-driver --offload-arch=sm_35 --offload-arch=sm_70 %s 2>&1 \
+// RUN: | FileCheck -check-prefix BINDINGS %s
+
+// BINDINGS: "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT:.+]]"], output: "[[PTX_SM_35:.+]]"
+// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[PTX_SM_35]]"], output: "[[CUBIN_SM_35:.+]]"
+// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "NVPTX::Linker", inputs: ["[[CUBIN_SM_35]]", "[[PTX_SM_35]]"], output: "[[FATBIN_SM_35:.+]]"
+// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "clang", inputs: ["[[INPUT]]"], output: "[[PTX_SM_70:.+]]"
+// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "NVPTX::Assembler", inputs: ["[[PTX_SM_70:.+]]"], output: "[[CUBIN_SM_70:.+]]"
+// BINDINGS-NEXT: "nvptx64-nvidia-cuda" - "NVPTX::Linker", inputs: ["[[CUBIN_SM_70]]", "[[PTX_SM_70:.+]]"], output: "[[FATBIN_SM_70:.+]]"
+// BINDINGS-NEXT: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT]]", "[[FATBIN_SM_35]]", "[[FATBIN_SM_70]]"], output: "[[HOST_OBJ:.+]]"
+// BINDINGS-NEXT: "x86_64-unknown-linux-gnu" - "Offload::Linker", inputs: ["[[HOST_OBJ]]"], output: "a.out"
+
+// RUN: %clang -### -nocudalib -foffload-new-driver %s 2>&1 | FileCheck -check-prefix RDC %s
+// RDC: clang{{.*}}"-cc1" "-triple" "nvptx64-nvidia-cuda"{{.*}}"-fgpu-rdc"
+// RDC: ptxas{{.*}}-c
Index: clang/lib/Driver/ToolChains/HIPAMD.cpp
===================================================================
--- clang/lib/Driver/ToolChains/HIPAMD.cpp
+++ clang/lib/Driver/ToolChains/HIPAMD.cpp
@@ -222,7 +222,8 @@
CC1Args.push_back("-fcuda-approx-transcendentals");
if (!DriverArgs.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
- false))
+ false) ||
+ DriverArgs.hasArg(options::OPT_foffload_new_driver))
CC1Args.append({"-mllvm", "-amdgpu-internalize-symbols"});
StringRef MaxThreadsPerBlock =
Index: clang/lib/Driver/ToolChains/Cuda.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Cuda.cpp
+++ clang/lib/Driver/ToolChains/Cuda.cpp
@@ -461,8 +461,9 @@
options::OPT_fnoopenmp_relocatable_target,
/*Default=*/true);
else if (JA.isOffloading(Action::OFK_Cuda))
- Relocatable = Args.hasFlag(options::OPT_fgpu_rdc,
- options::OPT_fno_gpu_rdc, /*Default=*/false);
+ Relocatable = Args.hasArg(options::OPT_foffload_new_driver) ||
+ Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
+ /*Default=*/false);
if (Relocatable)
CmdArgs.push_back("-c");
Index: clang/lib/Driver/ToolChains/Clang.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Clang.cpp
+++ clang/lib/Driver/ToolChains/Clang.cpp
@@ -6222,7 +6222,12 @@
}
if (IsCuda || IsHIP) {
- if (Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false))
+ if (Args.hasArg(options::OPT_fno_gpu_rdc) && IsCudaDevice &&
+ Args.hasArg(options::OPT_foffload_new_driver))
+ D.Diag(diag::warn_drv_no_rdc_new_driver)
+ << "SampleUse with PGO options";
+ if (Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false) ||
+ Args.hasArg(options::OPT_foffload_new_driver))
CmdArgs.push_back("-fgpu-rdc");
if (Args.hasFlag(options::OPT_fgpu_defer_diag,
options::OPT_fno_gpu_defer_diag, false))
@@ -6887,7 +6892,8 @@
if ((IsCuda || IsHIP) && CudaDeviceInput) {
CmdArgs.push_back("-fcuda-include-gpubinary");
CmdArgs.push_back(CudaDeviceInput->getFilename());
- if (Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc, false))
+ if (Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
+ false))
CmdArgs.push_back("-fgpu-rdc");
}
@@ -8203,14 +8209,17 @@
ArgStringList CmdArgs;
// Pass the CUDA path to the linker wrapper tool.
- for (auto &I : llvm::make_range(OpenMPTCRange.first, OpenMPTCRange.second)) {
- const ToolChain *TC = I.second;
- if (TC->getTriple().isNVPTX()) {
- CudaInstallationDetector CudaInstallation(D, TheTriple, Args);
- if (CudaInstallation.isValid())
- CmdArgs.push_back(Args.MakeArgString(
- "--cuda-path=" + CudaInstallation.getInstallPath()));
- break;
+ for (Action::OffloadKind Kind : {Action::OFK_Cuda, Action::OFK_OpenMP}) {
+ auto TCRange = C.getOffloadToolChains(Kind);
+ for (auto &I : llvm::make_range(TCRange.first, TCRange.second)) {
+ const ToolChain *TC = I.second;
+ if (TC->getTriple().isNVPTX()) {
+ CudaInstallationDetector CudaInstallation(D, TheTriple, Args);
+ if (CudaInstallation.isValid())
+ CmdArgs.push_back(Args.MakeArgString(
+ "--cuda-path=" + CudaInstallation.getInstallPath()));
+ break;
+ }
}
}
Index: clang/lib/Driver/Driver.cpp
===================================================================
--- clang/lib/Driver/Driver.cpp
+++ clang/lib/Driver/Driver.cpp
@@ -4139,6 +4139,101 @@
Args.ClaimAllArgs(options::OPT_cuda_compile_host_device);
}
+/// Returns the canonical name for the offloading architecture when using HIP or
+/// CUDA.
+static StringRef getCanonicalArchString(Compilation &C,
+ llvm::opt::DerivedArgList &Args,
+ StringRef ArchStr,
+ Action::OffloadKind Kind) {
+ if (Kind == Action::OFK_Cuda) {
+ CudaArch Arch = StringToCudaArch(ArchStr);
+ if (Arch == CudaArch::UNKNOWN || !IsNVIDIAGpuArch(Arch)) {
+ C.getDriver().Diag(clang::diag::err_drv_cuda_bad_gpu_arch) << ArchStr;
+ return StringRef();
+ }
+ return Args.MakeArgStringRef(CudaArchToString(Arch));
+ } else if (Kind == Action::OFK_HIP) {
+ llvm::StringMap<bool> Features;
+ // getHIPOffloadTargetTriple() is known to return valid value as it has
+ // been called successfully in the CreateOffloadingDeviceToolChains().
+ auto Arch = parseTargetID(
+ *getHIPOffloadTargetTriple(C.getDriver(), C.getInputArgs()), ArchStr,
+ &Features);
+ if (!Arch) {
+ C.getDriver().Diag(clang::diag::err_drv_bad_target_id) << ArchStr;
+ C.setContainsError();
+ return StringRef();
+ }
+ return Args.MakeArgStringRef(
+ getCanonicalTargetID(Arch.getValue(), Features));
+ }
+ return StringRef();
+}
+
+/// Checks if the set offloading architectures does not conflict. Returns the
+/// incompatible pair if a conflict occurs.
+static llvm::Optional<std::pair<llvm::StringRef, llvm::StringRef>>
+getConflictOffloadArchCombination(const llvm::DenseSet<StringRef> &Archs,
+ Action::OffloadKind Kind) {
+ if (Kind != Action::OFK_HIP)
+ return None;
+
+ std::set<StringRef> ArchSet;
+ llvm::copy(Archs, std::inserter(ArchSet, ArchSet.begin()));
+ return getConflictTargetIDCombination(ArchSet);
+}
+
+/// Returns the set of bound architectures active for this compilation kind.
+/// This function returns a set of bound architectures, if there are no bound
+/// architctures we return a set containing only the empty string.
+static llvm::DenseSet<StringRef>
+getOffloadArchs(Compilation &C, llvm::opt::DerivedArgList &Args,
+ Action::OffloadKind Kind) {
+
+ // If this is OpenMP offloading we don't use a bound architecture.
+ if (Kind == Action::OFK_OpenMP)
+ return llvm::DenseSet<StringRef>{StringRef()};
+
+ // --offload and --offload-arch options are mutually exclusive.
+ if (Args.hasArgNoClaim(options::OPT_offload_EQ) &&
+ Args.hasArgNoClaim(options::OPT_offload_arch_EQ,
+ options::OPT_no_offload_arch_EQ)) {
+ C.getDriver().Diag(diag::err_opt_not_valid_with_opt)
+ << "--offload"
+ << (Args.hasArgNoClaim(options::OPT_offload_arch_EQ)
+ ? "--offload-arch"
+ : "--no-offload-arch");
+ }
+
+ llvm::DenseSet<StringRef> Archs;
+ for (auto &Arg : Args) {
+ if (Arg->getOption().matches(options::OPT_offload_arch_EQ)) {
+ Archs.insert(getCanonicalArchString(C, Args, Arg->getValue(), Kind));
+ } else if (Arg->getOption().matches(options::OPT_no_offload_arch_EQ)) {
+ if (Arg->getValue() == StringRef("all"))
+ Archs.clear();
+ else
+ Archs.erase(getCanonicalArchString(C, Args, Arg->getValue(), Kind));
+ }
+ }
+
+ if (auto ConflictingArchs = getConflictOffloadArchCombination(Archs, Kind)) {
+ C.getDriver().Diag(clang::diag::err_drv_bad_offload_arch_combo)
+ << ConflictingArchs.getValue().first
+ << ConflictingArchs.getValue().second;
+ C.setContainsError();
+ }
+
+ if (Archs.empty()) {
+ if (Kind == Action::OFK_Cuda)
+ Archs.insert(CudaArchToString(CudaArch::CudaDefault));
+ else if (Kind == Action::OFK_HIP)
+ Archs.insert(CudaArchToString(CudaArch::HIPDefault));
+ }
+
+ return Archs;
+}
+
Action *Driver::BuildOffloadingActions(Compilation &C,
llvm::opt::DerivedArgList &Args,
const InputTy &Input,
@@ -4151,7 +4246,8 @@
types::ID InputType = Input.first;
const Arg *InputArg = Input.second;
- const Action::OffloadKind OffloadKinds[] = {Action::OFK_OpenMP};
+ const Action::OffloadKind OffloadKinds[] = {
+ Action::OFK_OpenMP, Action::OFK_Cuda, Action::OFK_HIP};
for (Action::OffloadKind Kind : OffloadKinds) {
SmallVector<const ToolChain *, 2> ToolChains;
@@ -4164,7 +4260,13 @@
if (ToolChains.empty())
continue;
- for (unsigned I = 0; I < ToolChains.size(); ++I)
+ // Get the product of all bound architectures and toolchains.
+ SmallVector<std::pair<const ToolChain *, StringRef>> TCAndArchs;
+ for (const ToolChain *TC : ToolChains)
+ for (StringRef Arch : getOffloadArchs(C, Args, Kind))
+ TCAndArchs.push_back(std::make_pair(TC, Arch));
+
+ for (unsigned I = 0, E = TCAndArchs.size(); I != E; ++I)
DeviceActions.push_back(C.MakeAction<InputAction>(*InputArg, InputType));
if (DeviceActions.empty())
@@ -4178,27 +4280,41 @@
break;
}
- auto TC = ToolChains.begin();
+ auto TCAndArch = TCAndArchs.begin();
for (Action *&A : DeviceActions) {
A = ConstructPhaseAction(C, Args, Phase, A, Kind);
if (isa<CompileJobAction>(A) && Kind == Action::OFK_OpenMP) {
+ // OpenMP offloading has a dependency on the host compile action to
+ // identify which declarations need to be emitted. This shouldn't be
+ // collapsed with any other actions so we can use it in the device.
HostAction->setCannotBeCollapsedWithNextDependentAction();
OffloadAction::HostDependence HDep(
*HostAction, *C.getSingleOffloadToolChain<Action::OFK_Host>(),
- /*BourdArch=*/nullptr, Action::OFK_OpenMP);
+ /*BoundArch=*/nullptr, Kind);
OffloadAction::DeviceDependences DDep;
- DDep.add(*A, **TC, /*BoundArch=*/nullptr, Kind);
+ DDep.add(*A, *TCAndArch->first, /*BoundArch=*/nullptr, Kind);
A = C.MakeAction<OffloadAction>(HDep, DDep);
+ } else if (isa<AssembleJobAction>(A) && Kind == Action::OFK_Cuda) {
+ // The Cuda toolchain uses fatbinary as the linker phase to bundle the
+ // PTX and Cubin output.
+ ActionList FatbinActions;
+ for (Action *A : {A, A->getInputs()[0]}) {
+ OffloadAction::DeviceDependences DDep;
+ DDep.add(*A, *TCAndArch->first, TCAndArch->second.data(), Kind);
+ FatbinActions.emplace_back(
+ C.MakeAction<OffloadAction>(DDep, A->getType()));
+ }
+ A = C.MakeAction<LinkJobAction>(FatbinActions, types::TY_CUDA_FATBIN);
}
- ++TC;
+ ++TCAndArch;
}
}
- auto TC = ToolChains.begin();
+ auto TCAndArch = TCAndArchs.begin();
for (Action *A : DeviceActions) {
- DDeps.add(*A, **TC, /*BoundArch=*/nullptr, Kind);
- TC++;
+ DDeps.add(*A, *TCAndArch->first, TCAndArch->second.data(), Kind);
+ ++TCAndArch;
}
}
@@ -4303,7 +4419,7 @@
return C.MakeAction<BackendJobAction>(Input, Output);
}
if (isUsingLTO(/* IsOffload */ true) &&
- TargetDeviceOffloadKind == Action::OFK_OpenMP) {
+ TargetDeviceOffloadKind != Action::OFK_None) {
types::ID Output =
Args.hasArg(options::OPT_S) ? types::TY_LTO_IR : types::TY_LTO_BC;
return C.MakeAction<BackendJobAction>(Input, Output);
Index: clang/include/clang/Basic/DiagnosticDriverKinds.td
===================================================================
--- clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -63,6 +63,8 @@
"cannot find libdevice for %0; provide path to different CUDA installation "
"via '--cuda-path', or pass '-nocudalib' to build without linking with "
"libdevice">;
+def warn_drv_no_rdc_new_driver : Warning<
+ "Using '-foffload-new-driver' overrides '-fno-gpu-rdc'">;
def err_drv_no_rocm_device_lib : Error<
"cannot find ROCm device library%select{| for %1|for ABI version %1}0; provide its path via "
Index: clang/include/clang/Basic/Cuda.h
===================================================================
--- clang/include/clang/Basic/Cuda.h
+++ clang/include/clang/Basic/Cuda.h
@@ -100,6 +100,9 @@
Generic, // A processor model named 'generic' if the target backend defines a
// public one.
LAST,
+
+ CudaDefault = CudaArch::SM_35,
+ HIPDefault = CudaArch::GFX803,
};
static inline bool IsNVIDIAGpuArch(CudaArch A) {
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits