https://github.com/yxsamliu updated 
https://github.com/llvm/llvm-project/pull/201135

From a8daecd6b38bed93bd6449643e85286d51ceef19 Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu" <[email protected]>
Date: Tue, 2 Jun 2026 10:12:23 -0400
Subject: [PATCH] [HIP][AMDGPU] Use non-LTO pipeline for non-RDC in the linker
 wrapper

Non-RDC HIP does not need LTO, but the new offload driver compiles all
AMDGPU device code through the LTO pipeline. That makes non-RDC builds pay
full LTO codegen cost for no benefit.

Fix this in clang-linker-wrapper instead of the driver, so device codegen
still runs in the wrapper's parallel device-link step (kept fast by
--offload-jobs) rather than being serialized back in the driver.

The driver passes a new --no-lto flag for the non-RDC fat-binary job
(unless the user asked for -foffload-lto or enabled profile generation,
which still needs LTO). With it, the wrapper drops -flto and, because the
device images are bitcode stored in object-extension files, also passes
-x ir so clang actually compiles them (cc1 -emit-obj) instead of handing
the bitcode to lld for an LTO link.
---
 clang/lib/Driver/ToolChains/Clang.cpp            | 11 +++++++++++
 clang/test/Driver/hip-toolchain-no-rdc.hip       |  9 +++++++++
 clang/test/Driver/hipspv-toolchain.hip           |  2 +-
 .../linker-wrapper-hip-no-rdc.c                  | 12 ++++++++++++
 .../clang-linker-wrapper/ClangLinkerWrapper.cpp  | 16 ++++++++++++++--
 .../clang-linker-wrapper/LinkerWrapperOpts.td    |  3 +++
 6 files changed, 50 insertions(+), 3 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 7657afb14f077..dcd262a084753 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -9830,6 +9830,17 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
          JA.getType() == types::TY_Image);
   if (JA.getType() == types::TY_HIP_FATBIN) {
     CmdArgs.push_back("--emit-fatbin-only");
+    // Non-RDC HIP uses the conventional non-LTO pipeline unless the user opts
+    // into offload LTO. The device backend then runs in the linker wrapper's
+    // parallel device-link step rather than being deferred to the LTO link.
+    // Profile generation still needs LTO so the device profile runtime is
+    // linked and optimized together with the device code.
+    bool UsesProfileGenerate = Args.hasArg(
+        options::OPT_fprofile_generate, options::OPT_fprofile_generate_EQ,
+        options::OPT_fprofile_instr_generate,
+        options::OPT_fprofile_instr_generate_EQ);
+    if (C.getDriver().getOffloadLTOMode() == LTOK_None && !UsesProfileGenerate)
+      CmdArgs.push_back("--no-lto");
     CmdArgs.append({"-o", Output.getFilename()});
     for (auto Input : Inputs)
       CmdArgs.push_back(Input.getFilename());
diff --git a/clang/test/Driver/hip-toolchain-no-rdc.hip b/clang/test/Driver/hip-toolchain-no-rdc.hip
index f4cd703547ac0..d018cfe48b72e 100644
--- a/clang/test/Driver/hip-toolchain-no-rdc.hip
+++ b/clang/test/Driver/hip-toolchain-no-rdc.hip
@@ -46,6 +46,13 @@
 // RUN:   %s -nogpuinc -nogpulib \
 // RUN: 2>&1 | FileCheck -check-prefixes=AMDGCNSPIRV-NEW %s
 
+// Profile generation needs LTO, so the linker wrapper must not get --no-lto.
+// RUN: %clang -### --target=x86_64-linux-gnu -fno-gpu-rdc -fprofile-generate \
+// RUN:   -x hip --cuda-gpu-arch=gfx900 -nogpuinc -nogpulib --offload-new-driver \
+// RUN:   %s 2>&1 | FileCheck -check-prefixes=PGO -implicit-check-not="--no-lto" %s
+// PGO: {{".*clang-linker-wrapper}}
+// PGO-SAME: "--emit-fatbin-only"
+
 //
 // Compile device code in a.cu to code object for gfx803.
 //
@@ -109,6 +116,7 @@
 // NEW: [[WRAPPER:".*clang-linker-wrapper]]"
 // NEW-SAME: "--host-triple=x86_64-unknown-linux-gnu"
 // NEW-SAME: "--emit-fatbin-only"
+// NEW-SAME: "--no-lto"
 // NEW-SAME: "-o" "[[HIPFB_A:.*.hipfb]]" "[[PACKAGE_A]]"
 
 // CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
@@ -181,6 +189,7 @@
 
 // NEW: [[WRAPPER:".*clang-linker-wrapper]]"
 // NEW-SAME: "--emit-fatbin-only"
+// NEW-SAME: "--no-lto"
 // NEW-SAME: "-o" "[[HIPFB_B:.*.hipfb]]" "[[PACKAGE_B]]"
 
 // CHECK: [[CLANG]] "-cc1" "-triple" "x86_64-unknown-linux-gnu"
diff --git a/clang/test/Driver/hipspv-toolchain.hip b/clang/test/Driver/hipspv-toolchain.hip
index d2a7e9a3aeb3a..8eeb6e60cba50 100644
--- a/clang/test/Driver/hipspv-toolchain.hip
+++ b/clang/test/Driver/hipspv-toolchain.hip
@@ -38,7 +38,7 @@
 // NEW-SAME: "--image=file=[[OBJ_DEV]],triple=[[TRIPLE]],arch=generic,kind=hip"
 
 // NEW: {{".*clang-linker-wrapper"}} "--device-compiler=[[TRIPLE]]=--hip-path=[[HIP_PATH]]"
-// NEW-SAME: "--emit-fatbin-only" "-o" "[[BUNDLE:.*hipfb]]"
+// NEW-SAME: "--emit-fatbin-only" "--no-lto" "-o" "[[BUNDLE:.*hipfb]]"
 
 // CHECK: [[CLANG]] "-cc1" "-triple" {{".*"}} "-aux-triple" "[[TRIPLE]]"
 // CHECK-SAME: "-emit-obj"
diff --git a/clang/test/OffloadTools/clang-linker-wrapper/linker-wrapper-hip-no-rdc.c b/clang/test/OffloadTools/clang-linker-wrapper/linker-wrapper-hip-no-rdc.c
index aae330bd3f6de..5c5b7b1eabfab 100644
--- a/clang/test/OffloadTools/clang-linker-wrapper/linker-wrapper-hip-no-rdc.c
+++ b/clang/test/OffloadTools/clang-linker-wrapper/linker-wrapper-hip-no-rdc.c
@@ -55,3 +55,15 @@ __attribute__((visibility("protected"), used)) int x;
 // RUN: test -s %t.gfx9-4-generic-xnack+.co
 // RUN: test -f %t.gfx1200.co
 // RUN: test -s %t.gfx1200.co
+
+// Without --no-lto the AMDGPU device compilation uses the LTO pipeline
+// (-flto).
+// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --wrapper-verbose --dry-run --emit-fatbin-only --linker-path=/usr/bin/ld %t.out -o %t.lto.hipfb 2>&1 | FileCheck %s --check-prefix=LTO
+// LTO: clang{{.*}} -mcpu=gfx1200{{.*}} -flto
+
+// With --no-lto the AMDGPU device compilation uses the conventional non-LTO
+// pipeline: -flto must not be passed, and '-x ir' must be passed so Clang
+// compiles the bitcode (stored in an object-extension file) instead of
+// handing it to the LTO link.
+// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --wrapper-verbose --dry-run --no-lto --emit-fatbin-only --linker-path=/usr/bin/ld %t.out -o %t.nolto.hipfb 2>&1 | FileCheck %s --check-prefix=NO-LTO --implicit-check-not=-flto
+// NO-LTO: clang{{.*}} -mcpu=gfx1200{{.*}} -x ir
diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index 24900a43dbcc2..c35aceff6aa43 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -535,8 +535,14 @@ Expected<StringRef> clang(ArrayRef<StringRef> InputFiles, const ArgList &Args,
     Triple.isAMDGPU() ? CmdArgs.push_back(Args.MakeArgString("-mcpu=" + Arch))
                       : CmdArgs.push_back(Args.MakeArgString("-march=" + Arch));
 
-  // AMDGPU is always in LTO mode currently.
-  if (Triple.isAMDGPU())
+  // AMDGPU defaults to the LTO pipeline. Non-RDC HIP uses the conventional
+  // non-LTO pipeline so device codegen still runs here, in parallel, instead
+  // of being deferred to the LTO link.
+  // FIXME: This is a stop-gap for non-RDC. Longer term, RDC and non-RDC should
+  // share a unified interface so runtime libraries can be provided to non-RDC
+  // compilations without relying on -mlink-builtin-bitcode.
+  bool NonLTOAMDGPU = Triple.isAMDGPU() && Args.hasArg(OPT_no_lto);
+  if (Triple.isAMDGPU() && !NonLTOAMDGPU)
     CmdArgs.push_back("-flto");
 
   // Forward all of the `--offload-opt` and `-mllvm` options to the device.
@@ -548,6 +554,12 @@ Expected<StringRef> clang(ArrayRef<StringRef> InputFiles, const ArgList &Args,
   if (!Triple.isNVPTX() && !Triple.isSPIRV())
     CmdArgs.push_back("-Wl,--no-undefined");
 
+  // The device inputs are bitcode stored in files with an object extension.
+  // Force the IR input language so Clang runs the compile and backend phases
+  // instead of treating them as linker inputs, which would defer codegen to
+  // the LTO link and defeat the non-LTO pipeline.
+  if (NonLTOAMDGPU)
+    CmdArgs.append({"-x", "ir"});
   for (StringRef InputFile : InputFiles)
     CmdArgs.push_back(InputFile);
 
diff --git a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td
index 53b6c596de291..87a26ca90a66f 100644
--- a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td
+++ b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td
@@ -39,6 +39,9 @@ def print_wrapped_module : Flag<["--"], "print-wrapped-module">,
   HelpText<"Print the wrapped module's IR for testing">;
 def save_temps : Flag<["--"], "save-temps">,
   Flags<[WrapperOnlyOption]>, HelpText<"Save intermediate results">;
+def no_lto : Flag<["--"], "no-lto">,
+             Flags<[WrapperOnlyOption]>,
+             HelpText<"Use the non-LTO device compilation pipeline">;
 def compress : Flag<["--"], "compress">,
   Flags<[WrapperOnlyOption]>, HelpText<"Compress bundled files">;
 def compression_level_eq : Joined<["--"], "compression-level=">,

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to