https://github.com/yxsamliu updated https://github.com/llvm/llvm-project/pull/201135
>From a8daecd6b38bed93bd6449643e85286d51ceef19 Mon Sep 17 00:00:00 2001 From: "Yaxun (Sam) Liu" <[email protected]> Date: Tue, 2 Jun 2026 10:12:23 -0400 Subject: [PATCH] [HIP][AMDGPU] Use non-LTO pipeline for non-RDC in the linker wrapper Non-RDC HIP does not need LTO, but the new offload driver compiles all AMDGPU device code through the LTO pipeline. That makes non-RDC builds pay full LTO codegen cost for no benefit. Fix this in clang-linker-wrapper instead of the driver, so device codegen still runs in the wrapper's parallel device-link step (kept fast by --offload-jobs) rather than being serialized back in the driver. The driver passes a new --no-lto flag for the non-RDC fat-binary job (unless the user asked for -foffload-lto). With it, the wrapper drops -flto and, because the device images are bitcode stored in object-extension files, also passes -x ir so clang actually compiles them (cc1 -emit-obj) instead of handing the bitcode to lld for an LTO link. --- clang/lib/Driver/ToolChains/Clang.cpp | 11 +++++++++++ clang/test/Driver/hip-toolchain-no-rdc.hip | 9 +++++++++ clang/test/Driver/hipspv-toolchain.hip | 2 +- .../linker-wrapper-hip-no-rdc.c | 12 ++++++++++++ .../clang-linker-wrapper/ClangLinkerWrapper.cpp | 16 ++++++++++++++-- .../clang-linker-wrapper/LinkerWrapperOpts.td | 3 +++ 6 files changed, 50 insertions(+), 3 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 7657afb14f077..dcd262a084753 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -9830,6 +9830,17 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA, JA.getType() == types::TY_Image); if (JA.getType() == types::TY_HIP_FATBIN) { CmdArgs.push_back("--emit-fatbin-only"); + // Non-RDC HIP uses the conventional non-LTO pipeline unless the user opts + // into offload LTO. The device backend then runs in the linker wrapper's + // parallel device-link step rather than being deferred to the LTO link. + // Profile generation still needs LTO so the device profile runtime is + // linked and optimized together with the device code. + bool UsesProfileGenerate = Args.hasArg( + options::OPT_fprofile_generate, options::OPT_fprofile_generate_EQ, + options::OPT_fprofile_instr_generate, + options::OPT_fprofile_instr_generate_EQ); + if (C.getDriver().getOffloadLTOMode() == LTOK_None && !UsesProfileGenerate) + CmdArgs.push_back("--no-lto"); CmdArgs.append({"-o", Output.getFilename()}); for (auto Input : Inputs) CmdArgs.push_back(Input.getFilename()); diff --git a/clang/test/Driver/hip-toolchain-no-rdc.hip b/clang/test/Driver/hip-toolchain-no-rdc.hip index f4cd703547ac0..d018cfe48b72e 100644 --- a/clang/test/Driver/hip-toolchain-no-rdc.hip +++ b/clang/test/Driver/hip-toolchain-no-rdc.hip @@ -46,6 +46,13 @@ // RUN: %s -nogpuinc -nogpulib \ // RUN: 2>&1 | FileCheck -check-prefixes=AMDGCNSPIRV-NEW %s +// Profile generation needs LTO, so the linker wrapper must not get --no-lto. +// RUN: %clang -### --target=x86_64-linux-gnu -fno-gpu-rdc -fprofile-generate \ +// RUN: -x hip --cuda-gpu-arch=gfx900 -nogpuinc -nogpulib --offload-new-driver \ +// RUN: %s 2>&1 | FileCheck -check-prefixes=PGO -implicit-check-not="--no-lto" %s +// PGO: {{".*clang-linker-wrapper}} +// PGO-SAME: "--emit-fatbin-only" + // // Compile device code in a.cu to code object for gfx803. // @@ -109,6 +116,7 @@ // NEW: [[WRAPPER:".*clang-linker-wrapper]]" // NEW-SAME: "--host-triple=x86_64-unknown-linux-gnu" // NEW-SAME: "--emit-fatbin-only" +// NEW-SAME: "--no-lto" // NEW-SAME: "-o" "[[HIPFB_A:.*.hipfb]]" "[[PACKAGE_A]]" // CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu" @@ -181,6 +189,7 @@ // NEW: [[WRAPPER:".*clang-linker-wrapper]]" // NEW-SAME: "--emit-fatbin-only" +// NEW-SAME: "--no-lto" // NEW-SAME: "-o" "[[HIPFB_B:.*.hipfb]]" "[[PACKAGE_B]]" // CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu" diff --git a/clang/test/Driver/hipspv-toolchain.hip b/clang/test/Driver/hipspv-toolchain.hip index d2a7e9a3aeb3a..8eeb6e60cba50 100644 --- a/clang/test/Driver/hipspv-toolchain.hip +++ b/clang/test/Driver/hipspv-toolchain.hip @@ -38,7 +38,7 @@ // NEW-SAME: "--image=file=[[OBJ_DEV]],triple=[[TRIPLE]],arch=generic,kind=hip" // NEW: {{".*clang-linker-wrapper"}} "--device-compiler=[[TRIPLE]]=--hip-path=[[HIP_PATH]]" -// NEW-SAME: "--emit-fatbin-only" "-o" "[[BUNDLE:.*hipfb]]" +// NEW-SAME: "--emit-fatbin-only" "--no-lto" "-o" "[[BUNDLE:.*hipfb]]" // CHECK: [[CLANG]] "-cc1" "-triple" {{".*"}} "-aux-triple" "[[TRIPLE]]" // CHECK-SAME: "-emit-obj" diff --git a/clang/test/OffloadTools/clang-linker-wrapper/linker-wrapper-hip-no-rdc.c b/clang/test/OffloadTools/clang-linker-wrapper/linker-wrapper-hip-no-rdc.c index aae330bd3f6de..5c5b7b1eabfab 100644 --- a/clang/test/OffloadTools/clang-linker-wrapper/linker-wrapper-hip-no-rdc.c +++ b/clang/test/OffloadTools/clang-linker-wrapper/linker-wrapper-hip-no-rdc.c @@ -55,3 +55,15 @@ __attribute__((visibility("protected"), used)) int x; // RUN: test -s %t.gfx9-4-generic-xnack+.co // RUN: test -f %t.gfx1200.co // RUN: test -s %t.gfx1200.co + +// Without --no-lto the AMDGPU device compilation uses the LTO pipeline +// (-flto). +// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --wrapper-verbose --dry-run --emit-fatbin-only --linker-path=/usr/bin/ld %t.out -o %t.lto.hipfb 2>&1 | FileCheck %s --check-prefix=LTO +// LTO: clang{{.*}} -mcpu=gfx1200{{.*}} -flto + +// With --no-lto the AMDGPU device compilation uses the conventional non-LTO +// pipeline: -flto must not be passed, and '-x ir' must be passed so Clang +// compiles the bitcode (stored in an object-extension file) instead of +// handing it to the LTO link. +// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --wrapper-verbose --dry-run --no-lto --emit-fatbin-only --linker-path=/usr/bin/ld %t.out -o %t.nolto.hipfb 2>&1 | FileCheck %s --check-prefix=NO-LTO --implicit-check-not=-flto +// NO-LTO: clang{{.*}} -mcpu=gfx1200{{.*}} -x ir diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp index 24900a43dbcc2..c35aceff6aa43 100644 --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -535,8 +535,14 @@ Expected<StringRef> clang(ArrayRef<StringRef> InputFiles, const ArgList &Args, Triple.isAMDGPU() ? CmdArgs.push_back(Args.MakeArgString("-mcpu=" + Arch)) : CmdArgs.push_back(Args.MakeArgString("-march=" + Arch)); - // AMDGPU is always in LTO mode currently. - if (Triple.isAMDGPU()) + // AMDGPU defaults to the LTO pipeline. Non-RDC HIP uses the conventional + // non-LTO pipeline so device codegen still runs here, in parallel, instead + // of being deferred to the LTO link. + // FIXME: This is a stop-gap for non-RDC. Longer term, RDC and non-RDC should + // share a unified interface so runtime libraries can be provided to non-RDC + // compilations without relying on -mlink-builtin-bitcode. + bool NonLTOAMDGPU = Triple.isAMDGPU() && Args.hasArg(OPT_no_lto); + if (Triple.isAMDGPU() && !NonLTOAMDGPU) CmdArgs.push_back("-flto"); // Forward all of the `--offload-opt` and `-mllvm` options to the device. @@ -548,6 +554,12 @@ Expected<StringRef> clang(ArrayRef<StringRef> InputFiles, const ArgList &Args, if (!Triple.isNVPTX() && !Triple.isSPIRV()) CmdArgs.push_back("-Wl,--no-undefined"); + // The device inputs are bitcode stored in files with an object extension. + // Force the IR input language so Clang runs the compile and backend phases + // instead of treating them as linker inputs, which would defer codegen to + // the LTO link and defeat the non-LTO pipeline. + if (NonLTOAMDGPU) + CmdArgs.append({"-x", "ir"}); for (StringRef InputFile : InputFiles) CmdArgs.push_back(InputFile); diff --git a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td index 53b6c596de291..87a26ca90a66f 100644 --- a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td +++ b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td @@ -39,6 +39,9 @@ def print_wrapped_module : Flag<["--"], "print-wrapped-module">, HelpText<"Print the wrapped module's IR for testing">; def save_temps : Flag<["--"], "save-temps">, Flags<[WrapperOnlyOption]>, HelpText<"Save intermediate results">; +def no_lto : Flag<["--"], "no-lto">, + Flags<[WrapperOnlyOption]>, + HelpText<"Use the non-LTO device compilation pipeline">; def compress : Flag<["--"], "compress">, Flags<[WrapperOnlyOption]>, HelpText<"Compress bundled files">; def compression_level_eq : Joined<["--"], "compression-level=">, _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
