llvmorg-github-actions[bot] wrote:

<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-llvm-transforms

Author: Henry Jiang (mustartt)

<details>
<summary>Changes</summary>

When `-funique-internal-linkage-names` is supplied, a unique suffix is appended 
even for a static function with an asm label, when it really shouldn't have.

```c
static int impl(int x) asm("__impl"); // 
__impl.uniq.68358509610070717889884130747296293671
static int impl(int x) { return x + 1; }
```


---

Patch is 30.65 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/202004.diff


8 Files Affected:

- (modified) clang/lib/CodeGen/CodeGenModule.cpp (+6) 
- (modified) clang/test/CodeGen/unique-internal-linkage-names.c (+11) 
- (modified) clang/test/CodeGen/unique-internal-linkage-names.cpp (+8) 
- (modified) llvm/lib/CodeGen/CodeGenPrepare.cpp (+133) 
- (modified) llvm/lib/CodeGen/MachineSink.cpp (+254-2) 
- (added) llvm/test/CodeGen/AArch64/machine-sink-diamond-load-coexec.mir (+51) 
- (added) llvm/test/CodeGen/AArch64/machine-sink-diamond-load.mir (+64) 
- (added) llvm/test/Transforms/CodeGenPrepare/AArch64/sink-diamond-load.ll 
(+68) 


``````````diff
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp 
b/clang/lib/CodeGen/CodeGenModule.cpp
index 236738e9975d3..1c6ceafdb3c88 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -2257,7 +2257,13 @@ static void AppendCPUSpecificCPUDispatchMangling(const 
CodeGenModule &CGM,
 static bool isUniqueInternalLinkageDecl(GlobalDecl GD,
                                         CodeGenModule &CGM) {
   const Decl *D = GD.getDecl();
+  // An explicit asm label takes full control of the final symbol name and may
+  // be referenced under that exact name from elsewhere (e.g. a separate
+  // declaration in this or another translation unit). Appending the unique
+  // suffix would rename the definition out from under those references, so
+  // leave asm-labeled decls alone.
   return !CGM.getModuleNameHash().empty() && isa<FunctionDecl>(D) &&
+         !D->hasAttr<AsmLabelAttr>() &&
          (CGM.getFunctionLinkage(GD) == llvm::GlobalValue::InternalLinkage);
 }
 
diff --git a/clang/test/CodeGen/unique-internal-linkage-names.c 
b/clang/test/CodeGen/unique-internal-linkage-names.c
index 0fd5e516eec41..888de0e176b81 100644
--- a/clang/test/CodeGen/unique-internal-linkage-names.c
+++ b/clang/test/CodeGen/unique-internal-linkage-names.c
@@ -5,6 +5,15 @@
 inline void overloaded_external() {}
 extern void overloaded_external();
 
+// A prototyped static function gets a unique suffix...
+// CHECK: define internal i32 @_ZL7uniquedv.__uniq.{{[0-9]+}}(
+static int uniqued(void) { return 0; }
+
+// Check that a static function with asm label keeps its original name.
+// CHECK: define internal i32 @"\01custom_label"
+static int asm_label(void) asm("custom_label");
+static int asm_label(void) { return 0; }
+
 // CHECK: define internal void @overloaded_internal() [[ATTR:#[0-9]+]] {
 static void overloaded_internal() {}
 extern void overloaded_internal();
@@ -12,6 +21,8 @@ extern void overloaded_internal();
 void markUsed() {
   overloaded_external();
   overloaded_internal();
+  uniqued();
+  asm_label();
 }
 
 // CHECK: attributes [[ATTR]] =
diff --git a/clang/test/CodeGen/unique-internal-linkage-names.cpp 
b/clang/test/CodeGen/unique-internal-linkage-names.cpp
index e847cea9d273c..070a6eb4960d6 100644
--- a/clang/test/CodeGen/unique-internal-linkage-names.cpp
+++ b/clang/test/CodeGen/unique-internal-linkage-names.cpp
@@ -54,6 +54,11 @@ void test() {
   A a;
 }
 
+// Check a static function with an asm label must keep original name.
+static int asm_label() asm("custom_label");
+static int asm_label() { return 0; }
+int call_asm_label() { return asm_label(); }
+
 // PLAIN: @_ZL4glob = internal global
 // PLAIN: @_ZZ8retAnonMvE5fGlob = internal global
 // PLAIN: @_ZN12_GLOBAL__N_16anon_mE = internal global
@@ -62,6 +67,7 @@ void test() {
 // PLAIN: define internal ptr @_ZL4mverv.resolver()
 // PLAIN: define internal void @_ZN12_GLOBAL__N_11AC1Ev
 // PLAIN: define internal void @_ZN12_GLOBAL__N_11AD1Ev
+// PLAIN: define internal noundef i32 @custom_label()
 // PLAIN: define internal noundef i32 @_ZL4mverv()
 // PLAIN: define internal noundef i32 @_ZL4mverv.sse4.2()
 // PLAIN-NOT: "sample-profile-suffix-elision-policy"
@@ -73,6 +79,8 @@ void test() {
 // UNIQUE: define internal ptr @_ZL4mverv.[[MODHASH]].resolver()
 // UNIQUE: define internal void 
@_ZN12_GLOBAL__N_11AC1Ev.__uniq.68358509610070717889884130747296293671
 // UNIQUE: define internal void 
@_ZN12_GLOBAL__N_11AD1Ev.__uniq.68358509610070717889884130747296293671
+// An explicit asm label keeps the user-specified name with no unique suffix.
+// UNIQUE: define internal noundef i32 @custom_label()
 // UNIQUE: define internal noundef i32 @_ZL4mverv.[[MODHASH]]()
 // UNIQUE: define internal noundef i32 @_ZL4mverv.[[MODHASH]].sse4.2
 // UNIQUE: attributes #[[#ATTR]] = { 
{{.*}}"sample-profile-suffix-elision-policy"{{.*}} }
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp 
b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 74a0502d8cb7c..a2d861471347f 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -140,6 +140,32 @@ static cl::opt<bool> DisableBranchOpts(
     "disable-cgp-branch-opts", cl::Hidden, cl::init(false),
     cl::desc("Disable branch optimizations in CodeGenPrepare"));
 
+// A load that dominates a diamond and is consumed in more than one arm has had
+// its live range stretched across the conditional (typically by SimplifyCFG's
+// hoistCommonCodeFromSuccessors). The clone is dynamically neutral -- exactly
+// one arm runs per pass -- so its only cost is static code size. Sinking it
+// with duplication into each arm shortens the live range before ISel and lets
+// later passes (e.g. load-pair formation) treat each arm independently. Gated
+// off by default.
+static cl::opt<bool> EnableSinkDiamondLoads(
+    "cgp-sink-diamond-loads", cl::Hidden, cl::init(false),
+    cl::desc("Sink-with-duplication a load that dominates a diamond into its "
+             "using arms to shorten its live range across the branch"));
+
+static cl::opt<unsigned> SinkDiamondLoadsMaxFanout(
+    "cgp-sink-diamond-loads-max-fanout", cl::Hidden, cl::init(2),
+    cl::desc("Maximum number of arm clones for diamond-load sinking in CGP"));
+
+static cl::opt<unsigned> SinkDiamondLoadsChainDepth(
+    "cgp-sink-diamond-loads-chain", cl::Hidden, cl::init(4),
+    cl::desc("Sink a diamond load when its value feeds a dependency chain of "
+             "at least this depth in an arm (e.g. a reduction)"));
+
+static cl::opt<bool> ForceSinkDiamondLoads(
+    "cgp-sink-diamond-loads-force", cl::Hidden, cl::init(false),
+    cl::desc("Skip the profitability heuristics for CGP diamond-load sinking "
+             "(testing only); safety/legality checks still apply"));
+
 static cl::opt<bool>
     DisableGCOpts("disable-cgp-gc-opts", cl::Hidden, cl::init(false),
                   cl::desc("Disable GC optimizations in CodeGenPrepare"));
@@ -431,6 +457,7 @@ class CodeGenPrepare {
   bool optimizeExt(Instruction *&I);
   bool optimizeExtUses(Instruction *I);
   bool optimizeLoadExt(LoadInst *Load);
+  bool sinkLoadAcrossDiamond(LoadInst *Load);
   bool optimizeShiftInst(BinaryOperator *BO);
   bool optimizeFunnelShift(IntrinsicInst *Fsh);
   bool optimizeSelectInst(SelectInst *SI);
@@ -7394,6 +7421,110 @@ bool CodeGenPrepare::optimizeExtUses(Instruction *I) {
   return MadeChange;
 }
 
+// True if the value V feeds a dependency chain of at least MinDepth
+// instructions. Such a value sits early in a long computation (e.g. a
+// reduction), where pinning a register for it across the branch is most
+// costly. Explores all users level by level (bounded), so it is not fooled by
+// a reduction DAG whose nodes have several users.
+static bool feedsDeepChain(Value *V, unsigned MinDepth) {
+  SmallVector<Value *, 16> Frontier{V};
+  SmallPtrSet<Value *, 32> Visited{V};
+  auto IsChainStep = [](Instruction *UI) {
+    return UI && !UI->getType()->isVoidTy() && !UI->isTerminator() &&
+           !isa<PHINode>(UI);
+  };
+  for (unsigned Depth = 0; Depth < MinDepth; ++Depth) {
+    SmallVector<Value *, 16> Next;
+    for (Value *Cur : Frontier)
+      for (User *U : Cur->users())
+        if (auto *UI = dyn_cast<Instruction>(U); IsChainStep(UI))
+          if (Visited.insert(UI).second && Visited.size() < 512)
+            Next.push_back(UI);
+    if (Next.empty())
+      return false;
+    Frontier = std::move(Next);
+  }
+  return true;
+}
+
+// Sink a load that dominates a diamond by cloning it into each arm that
+// consumes its result, then erasing the original. This shortens the loaded
+// value's live range when SimplifyCFG hoisted a load common to the arms up 
into
+// their shared predecessor, stretching it across the conditional. The clone is
+// dynamically neutral (exactly one arm runs per pass), so its only cost is
+// static code size. Gated by -cgp-sink-diamond-loads.
+bool CodeGenPrepare::sinkLoadAcrossDiamond(LoadInst *Load) {
+  if (!EnableSinkDiamondLoads)
+    return false;
+
+  // Only plain loads: volatile/atomic loads have observable position.
+  if (!Load->isSimple())
+    return false;
+
+  BasicBlock *BB = Load->getParent();
+
+  // Collect the distinct arms that use the load. Require a use in more than 
one
+  // successor (a single-successor use is ordinary sinking), no use in BB
+  // itself, no PHI use (those want the value in a predecessor), and each using
+  // arm to be a direct successor of BB whose *only* predecessor is BB. That
+  // sole-predecessor condition makes the arms mutually exclusive (exactly one
+  // clone runs per pass) and means the only path from the load to a clone site
+  // is BB's tail -> arm, so memory safety only has to consider BB's tail.
+  SmallPtrSet<BasicBlock *, 4> SuccSet(succ_begin(BB), succ_end(BB));
+  SmallSetVector<BasicBlock *, 4> Arms;
+  for (User *U : Load->users()) {
+    auto *UI = dyn_cast<Instruction>(U);
+    if (!UI || isa<PHINode>(UI))
+      return false;
+    BasicBlock *UseBB = UI->getParent();
+    if (UseBB == BB || !SuccSet.contains(UseBB))
+      return false;
+    Arms.insert(UseBB);
+  }
+  if (Arms.size() < 2 || Arms.size() > SinkDiamondLoadsMaxFanout)
+    return false;
+  for (BasicBlock *Arm : Arms)
+    if (Arm->getUniquePredecessor() != BB || Arm->isEHPad())
+      return false;
+
+  // Memory safety: cloning moves the load later. Given sole-predecessor arms,
+  // the only instructions between the load and its clones are BB's tail; 
reject
+  // if any of them may write memory or otherwise has observable side effects.
+  // (Moving a pure load to execute later -- or not at all on some path -- is
+  // fine; only an intervening aliasing write would change its value.)
+  for (Instruction &I : make_range(std::next(Load->getIterator()), BB->end()))
+    if (I.mayWriteToMemory() || I.mayHaveSideEffects())
+      return false;
+
+  // Profitability. The clone is dynamically neutral, so the bar is low: fire
+  // when the load recurs in a loop and feeds a meaningful computation in an 
arm
+  // (a reduction-style chain), where the cross-branch live range hurts the
+  // schedule. -force skips this for testing.
+  if (!ForceSinkDiamondLoads) {
+    if (!LI->getLoopFor(BB))
+      return false;
+    if (!feedsDeepChain(Load, SinkDiamondLoadsChainDepth))
+      return false;
+  }
+
+  // Clone the load into each arm before the arm's first use, and rewrite that
+  // arm's uses to the local clone.
+  for (BasicBlock *Arm : Arms) {
+    Instruction *InsertPt = &*Arm->getFirstInsertionPt();
+    auto *Clone = cast<LoadInst>(Load->clone());
+    Clone->insertBefore(InsertPt->getIterator());
+    if (Load->hasName())
+      Clone->setName(Load->getName() + ".sunk");
+    Load->replaceUsesWithIf(Clone, [&](Use &U) {
+      auto *UI = cast<Instruction>(U.getUser());
+      return UI->getParent() == Arm;
+    });
+  }
+
+  Load->eraseFromParent();
+  return true;
+}
+
 // Find loads whose uses only use some of the loaded value's bits.  Add an 
"and"
 // just after the load if the target can fold this into one extload 
instruction,
 // with the hope of eliminating some of the other later "and" instructions 
using
@@ -8975,6 +9106,8 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, 
ModifyDT &ModifiedDT) {
       return true;
 
   if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+    if (sinkLoadAcrossDiamond(LI))
+      return true; // LI has been erased.
     LI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
     bool Modified = optimizeLoadExt(LI);
     unsigned AS = LI->getPointerAddressSpace();
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index b34ebc3a80886..28d127e436e1f 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -113,7 +113,49 @@ static cl::opt<unsigned> SinkIntoCycleLimit(
         "The maximum number of instructions considered for cycle sinking."),
     cl::init(50), cl::Hidden);
 
+// When a value-producing load dominates a diamond and its result is consumed 
in
+// more than one successor, ordinary sinking cannot move it (there is no single
+// sink target). Hoisting such a load -- as SimplifyCFG's
+// hoistCommonCodeFromSuccessors does -- is dynamically neutral but stretches
+// the loaded value's live range across the conditional, which hurts under
+// register pressure. When enabled, sink-with-duplication clones the load into
+// each using successor and removes the dominating copy, shortening the live
+// range. Gated off by default.
+static cl::opt<bool> EnableDiamondLoadSink(
+    "machine-sink-diamond-loads", cl::Hidden, cl::init(false),
+    cl::desc("Sink-with-duplication a load that dominates a diamond into its "
+             "using successors to shorten its live range across the branch"));
+
+// A clone into mutually-exclusive successors is dynamically neutral -- exactly
+// one copy runs per pass -- so unlike ordinary sinking its only cost is static
+// code size. The profitability bar is therefore low: it fires when the clone
+// shortens a live range at all, cheaply, in a loop, without pushing any target
+// over its own pressure limit. These knobs tune that bar; -force skips it
+// entirely for testing the mechanics.
+static cl::opt<bool> ForceDiamondLoadSink(
+    "machine-sink-diamond-loads-force", cl::Hidden, cl::init(false),
+    cl::desc("Skip the profitability heuristics for diamond-load sinking "
+             "(testing only); safety/legality checks still apply"));
+
+static cl::opt<unsigned> DiamondLoadSinkMaxFanout(
+    "machine-sink-diamond-loads-max-fanout", cl::Hidden, cl::init(2),
+    cl::desc("Maximum number of successor clones for diamond-load sinking "
+             "(bounds static code growth)"));
+
+static cl::opt<unsigned> DiamondLoadSinkSpanThreshold(
+    "machine-sink-diamond-loads-span", cl::Hidden, cl::init(8),
+    cl::desc("Sink a diamond load when the value would otherwise stay live "
+             "across at least this many instructions (def's block tail plus "
+             "the depth of the first use in an arm)"));
+
+static cl::opt<unsigned> DiamondLoadSinkChainDepth(
+    "machine-sink-diamond-loads-chain", cl::Hidden, cl::init(4),
+    cl::desc("Sink a diamond load when its value feeds a dependency chain of "
+             "at least this depth in an arm (e.g. a reduction)"));
+
 STATISTIC(NumSunk, "Number of machine instructions sunk");
+STATISTIC(NumDiamondLoadsSunk,
+          "Number of diamond loads sunk-with-duplication into successors");
 STATISTIC(NumCycleSunk, "Number of machine instructions sunk into a cycle");
 STATISTIC(NumSplit, "Number of critical edges split");
 STATISTIC(NumCoalesces, "Number of copies coalesced");
@@ -245,6 +287,13 @@ class MachineSinking {
   bool SinkInstruction(MachineInstr &MI, bool &SawStore,
                        AllSuccsCache &AllSuccessors);
 
+  /// Sink a load that dominates a diamond by cloning it into each successor
+  /// block that consumes its result, then erasing the original. This shortens
+  /// the loaded value's live range when it would otherwise span the
+  /// conditional. Returns true if the load was sunk. Gated by
+  /// -machine-sink-diamond-loads.
+  bool sinkDiamondLoadToUsers(MachineInstr &MI, MachineBasicBlock *MBB);
+
   /// If we sink a COPY inst, some debug users of it's destination may no
   /// longer be dominated by the COPY, and will eventually be dropped.
   /// This is easily rectified by forwarding the non-dominated debug uses
@@ -1875,8 +1924,13 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, 
bool &SawStore,
       FindSuccToSinkTo(MI, ParentBlock, BreakPHIEdge, AllSuccessors);
 
   // If there are no outputs, it must have side-effects.
-  if (!SuccToSinkTo)
-    return false;
+  if (!SuccToSinkTo) {
+    // FindSuccToSinkTo bails when the def is consumed by more than one
+    // successor (no single sink target). For a load that dominates a diamond,
+    // we can instead clone it into each using successor to shorten its live
+    // range. This is a no-op for non-candidates.
+    return sinkDiamondLoadToUsers(MI, ParentBlock);
+  }
 
   // If the instruction to move defines a dead physical register which is live
   // when leaving the basic block, don't move it because it could turn into a
@@ -2001,6 +2055,204 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, 
bool &SawStore,
   return true;
 }
 
+// Count non-debug instructions strictly after MI to the end of its block.
+static unsigned numNonDebugInstrsAfter(MachineInstr &MI) {
+  unsigned N = 0;
+  for (MachineInstr &O : make_range(std::next(MachineBasicBlock::iterator(MI)),
+                                    MI.getParent()->end()))
+    if (!O.isDebugOrPseudoInstr())
+      ++N;
+  return N;
+}
+
+// Number of non-debug instructions from the top of S up to the first
+// instruction that reads Reg. Returns ~0u if Reg is not read in S.
+static unsigned firstUseDepthInBlock(Register Reg, MachineBasicBlock &S) {
+  unsigned N = 0;
+  for (MachineInstr &I : S) {
+    if (I.isDebugOrPseudoInstr())
+      continue;
+    if (I.readsRegister(Reg, /*TRI=*/nullptr))
+      return N;
+    ++N;
+  }
+  return ~0u;
+}
+
+// True if the value in Reg feeds a dependency chain of at least MinDepth
+// value-producing instructions. Such a value sits early in a long computation
+// (e.g. a reduction), so pinning a register for it across the branch is most
+// costly -- exactly where shortening the live range pays off. Follows the 
first
+// single-def virtual-register user at each step, bounded.
+static bool feedsDeepChain(Register Reg, const MachineRegisterInfo *MRI,
+                           unsigned MinDepth) {
+  Register Cur = Reg;
+  for (unsigned Depth = 0; Depth < MinDepth + 1;) {
+    MachineInstr *Next = nullptr;
+    for (MachineInstr &U : MRI->use_nodbg_instructions(Cur)) {
+      if (U.getNumExplicitDefs() != 1)
+        continue;
+      const MachineOperand &D = *U.defs().begin();
+      if (D.isReg() && D.getReg().isVirtual() && !D.getSubReg()) {
+        Next = &U;
+        break;
+      }
+    }
+    if (!Next)
+      return false;
+    if (++Depth >= MinDepth)
+      return true;
+    Cur = Next->defs().begin()->getReg();
+  }
+  return false;
+}
+
+bool MachineSinking::sinkDiamondLoadToUsers(MachineInstr &MI,
+                                            MachineBasicBlock *MBB) {
+  if (!EnableDiamondLoadSink)
+    return false;
+
+  // Only consider plain, side-effect-free loads the target is willing to sink.
+  if (!MI.mayLoad() || MI.mayStore() || MI.hasOrderedMemoryRef() ||
+      MI.isCall() || MI.hasUnmodeledSideEffects() || MI.isConvergent())
+    return false;
+  if (!TII->shouldSink(MI))
+    return false;
+
+  // Require exactly one explicit def: a virtual register, not dead, no subreg,
+  // with a single definition. Physreg uses are only safe to duplicate if they
+  // cannot be redefined on the path (mirrors FindSuccToSinkTo).
+  Register DefReg;
+  for (const MachineOperand &MO : MI.operands()) {
+    if (!MO.isReg() || !MO.getReg())
+      continue;
+    if (MO.isDef()) {
+      if (DefReg.isValid() || !MO.getReg().isVirtual() || MO.isDead() ||
+          MO.getSubReg())
+        return false;
+      DefReg = MO.getReg();
+    } else if (MO.getReg().isPhysical()) {
+      if (!MRI->isConstantPhysReg(MO.getReg()) && !TII->isIgnorableUse(MO))
+        return false;
+    }
+  }
+  if (!DefReg.isValid() || !MRI->hasOneDef(DefReg))
+    return false;
+  const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
+  if (!TII->isSafeToMoveRegClassDefs(RC))
+    return false;
+
+  // Collect the distinct successor blocks that use DefReg. Require a use in
+  // more than one successor (single-successor uses are handled by ordinary
+  // sinking), no PHI use (those want the value in a predecessor), no use in 
MBB
+  // itself, and every using block to be a direct successor of MBB.
+  SmallPtrSet<MachineBasicBlock *, 4> SuccSet(MBB->succ_begin(),
+                                              MBB->succ_end());
+  SmallSetVector<MachineBasicBlock *, 4> UsingBlocks;
+  for (MachineInstr &Use : MRI->use_nodbg_instructions(DefReg)) {
+    if (Use.isPHI())
+      return false;
+    MachineBasicBlock *UseBB = Use.getParent();
+    if (UseBB == MBB || !SuccSet.contains(UseBB))
+      return false;
+    UsingBlocks.insert(UseBB);
+  }
+  if (UsingBlocks.size() < 2 || UsingBlocks.size() > DiamondLoadSinkMaxFanout)
+    return false;
+
+  // Each using block must have MBB as it...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/202004
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to