llvmorg-github-actions[bot] wrote:

<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-loongarch

Author: llvmbot

<details>
<summary>Changes</summary>

Backport 19e915fc5c91645ccc4050180e9daabec30358c4

Requested by: @<!-- -->heiher

---

Patch is 51.69 KiB, truncated to 20.00 KiB below, full version: 
https://github.com/llvm/llvm-project/pull/199637.diff


4 Files Affected:

- (modified) llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp (+183-44) 
- (modified) llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h (+17) 
- (added) llvm/test/CodeGen/LoongArch/musttail-call.ll (+20) 
- (added) llvm/test/CodeGen/LoongArch/musttail-indirect-args.ll (+907) 


``````````diff
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp 
b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 2cfe3b2bc1a99..7d3d333efe046 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -8172,9 +8172,22 @@ SDValue LoongArchTargetLowering::LowerFormalArguments(
           "GHC calling convention requires the F and D extensions");
   }
 
+  const Function &Func = MF.getFunction();
   EVT PtrVT = getPointerTy(DAG.getDataLayout());
   MVT GRLenVT = Subtarget.getGRLenVT();
   unsigned GRLenInBytes = Subtarget.getGRLen() / 8;
+
+  // Check if this function has any musttail calls. If so, incoming indirect
+  // arg pointers must be saved in virtual registers so they survive across
+  // basic blocks (the SelectionDAG is cleared between BBs). Only do this
+  // when needed to avoid adding register pressure to non-musttail functions.
+  bool HasMusttail = llvm::any_of(Func, [](const BasicBlock &BB) {
+    return llvm::any_of(BB, [](const Instruction &I) {
+      if (const auto *CI = dyn_cast<CallInst>(&I))
+        return CI->isMustTailCall();
+      return false;
+    });
+  });
   // Used with varargs to acumulate store chains.
   std::vector<SDValue> OutChains;
 
@@ -8205,6 +8218,14 @@ SDValue LoongArchTargetLowering::LowerFormalArguments(
       InVals.push_back(DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue,
                                    MachinePointerInfo()));
       unsigned ArgIndex = Ins[InsIdx].OrigArgIndex;
+      if (HasMusttail) {
+        LoongArchMachineFunctionInfo *LAFI =
+            MF.getInfo<LoongArchMachineFunctionInfo>();
+        Register VReg =
+            MF.getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass);
+        Chain = DAG.getCopyToReg(Chain, DL, VReg, ArgValue);
+        LAFI->setIncomingIndirectArg(ArgIndex, VReg);
+      }
       unsigned ArgPartOffset = Ins[InsIdx].PartOffset;
       assert(ArgPartOffset == 0);
       while (i + 1 != e && Ins[InsIdx + 1].OrigArgIndex == ArgIndex) {
@@ -8335,6 +8356,27 @@ bool 
LoongArchTargetLowering::isEligibleForTailCallOptimization(
   auto &Caller = MF.getFunction();
   auto CallerCC = Caller.getCallingConv();
 
+  bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
+
+  // Byval parameters hand the function a pointer directly into the stack area
+  // we want to reuse during a tail call. Working around this *is* possible
+  // but less efficient and uglier in LowerCall. For musttail, there is no
+  // workaround today: a byval arg requires a local copy that becomes invalid
+  // after the tail call deallocates the caller's frame, so rejecting here
+  // (and triggering reportFatalInternalError in LowerCall) is safer than
+  // miscompiling.
+  for (auto &Arg : Outs)
+    if (Arg.Flags.isByVal())
+      return false;
+
+  // musttail bypasses the remaining checks: the checks either reject cases
+  // we handle specially (indirect args are forwarded via incoming pointers,
+  // stack-passed args reuse the matching incoming layout, sret is forwarded
+  // like any other pointer arg) or are optimizations not applicable to
+  // mandatory tail calls.
+  if (IsMustTail)
+    return true;
+
   // Do not tail call opt if the stack is used to pass parameters.
   if (CCInfo.getStackSize() != 0)
     return false;
@@ -8351,11 +8393,6 @@ bool 
LoongArchTargetLowering::isEligibleForTailCallOptimization(
   if (IsCallerStructRet || IsCalleeStructRet)
     return false;
 
-  // Do not tail call opt if either the callee or caller has a byval argument.
-  for (auto &Arg : Outs)
-    if (Arg.Flags.isByVal())
-      return false;
-
   // The callee has to preserve all registers the caller needs to preserve.
   const LoongArchRegisterInfo *TRI = Subtarget.getRegisterInfo();
   const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
@@ -8488,47 +8525,149 @@ LoongArchTargetLowering::LowerCall(CallLoweringInfo 
&CLI,
     // Promote the value if needed.
     // For now, only handle fully promoted and indirect arguments.
     if (VA.getLocInfo() == CCValAssign::Indirect) {
-      // Store the argument in a stack slot and pass its address.
-      Align StackAlign =
-          std::max(getPrefTypeAlign(Outs[OutIdx].ArgVT, DAG),
-                   getPrefTypeAlign(ArgValue.getValueType(), DAG));
-      TypeSize StoredSize = ArgValue.getValueType().getStoreSize();
-      // If the original argument was split and passed by reference, we need to
-      // store the required parts of it here (and pass just one address).
-      unsigned ArgIndex = Outs[OutIdx].OrigArgIndex;
-      unsigned ArgPartOffset = Outs[OutIdx].PartOffset;
-      assert(ArgPartOffset == 0);
-      // Calculate the total size to store. We don't have access to what we're
-      // actually storing other than performing the loop and collecting the
-      // info.
-      SmallVector<std::pair<SDValue, SDValue>> Parts;
-      while (i + 1 != e && Outs[OutIdx + 1].OrigArgIndex == ArgIndex) {
-        SDValue PartValue = OutVals[OutIdx + 1];
-        unsigned PartOffset = Outs[OutIdx + 1].PartOffset - ArgPartOffset;
-        SDValue Offset = DAG.getIntPtrConstant(PartOffset, DL);
-        EVT PartVT = PartValue.getValueType();
+      // For musttail calls, reuse incoming indirect pointers instead of
+      // creating new stack temporaries. The incoming pointers point to the
+      // caller's caller's frame, which remains valid after a tail call.
+      if (IsTailCall && CLI.CB && CLI.CB->isMustTailCall()) {
+        LoongArchMachineFunctionInfo *LAFI =
+            MF.getInfo<LoongArchMachineFunctionInfo>();
+        unsigned CallArgIdx = Outs[OutIdx].OrigArgIndex;
+
+        // Resolve which formal parameter is being passed at this call
+        // position.
+        //
+        // FIXME: Ins[].OrigArgIndex is Argument::getArgNo() (unfiltered),
+        // but Outs[].OrigArgIndex is an index into a filtered arg list
+        // (empty types removed, via CallLoweringInfo in the target-
+        // independent layer). IncomingIndirectArgs is keyed by the
+        // caller's unfiltered Argument::getArgNo(), so we have to walk
+        // the caller's formals (same filter) to translate the index.
+        // This target-independent asymmetry should be normalized so
+        // backends do not need to re-derive the mapping.
+        //
+        // Steps:
+        // 1. Find the call operand at filtered position CallArgIdx.
+        // 2. If it is an Argument, use getArgNo() directly (same filter
+        //    for caller formals and call operands).
+        // 3. Otherwise (computed value), walk the caller's formals and
+        //    skip empty types to map the filtered index to getArgNo().
+        const Argument *FormalArg = nullptr;
+        unsigned FilteredIdx = 0;
+        for (const auto &CallArg : CLI.CB->args()) {
+          if (CallArg->getType()->isEmptyTy())
+            continue;
+          if (FilteredIdx == CallArgIdx) {
+            FormalArg = dyn_cast<Argument>(CallArg);
+            break;
+          }
+          ++FilteredIdx;
+        }
 
-        StoredSize += PartVT.getStoreSize();
-        StackAlign = std::max(StackAlign, getPrefTypeAlign(PartVT, DAG));
-        Parts.push_back(std::make_pair(PartValue, Offset));
-        ++i;
-        ++OutIdx;
-      }
-      SDValue SpillSlot = DAG.CreateStackTemporary(StoredSize, StackAlign);
-      int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
-      MemOpChains.push_back(
-          DAG.getStore(Chain, DL, ArgValue, SpillSlot,
-                       MachinePointerInfo::getFixedStack(MF, FI)));
-      for (const auto &Part : Parts) {
-        SDValue PartValue = Part.first;
-        SDValue PartOffset = Part.second;
-        SDValue Address =
-            DAG.getNode(ISD::ADD, DL, PtrVT, SpillSlot, PartOffset);
+        // For forwarded args, getArgNo() gives the unfiltered index directly.
+        // For computed args, walk the caller's formals to resolve it.
+        unsigned FormalArgIdx = CallArgIdx;
+        if (FormalArg) {
+          FormalArgIdx = FormalArg->getArgNo();
+        } else {
+          FilteredIdx = 0;
+          for (const auto &Arg : MF.getFunction().args()) {
+            if (Arg.getType()->isEmptyTy())
+              continue;
+            if (FilteredIdx == CallArgIdx) {
+              FormalArgIdx = Arg.getArgNo();
+              break;
+            }
+            ++FilteredIdx;
+          }
+        }
+
+        Register VReg = LAFI->getIncomingIndirectArg(FormalArgIdx);
+        SDValue CopyOp = DAG.getCopyFromReg(Chain, DL, VReg, PtrVT);
+        // Thread the CopyFromReg output chain through MemOpChains so the
+        // TokenFactor below sequences the copy with any stores we emit
+        // for this argument.
+        MemOpChains.push_back(CopyOp.getValue(1));
+        SDValue IncomingPtr = CopyOp;
+
+        if (!FormalArg) {
+          // Computed value: store into the incoming indirect pointer for the
+          // same-position formal parameter (musttail guarantees matching
+          // prototypes, so types match). The pointer survives the tail call
+          // since it points to the caller's caller's frame.
+          //
+          // The data-flow edge through IncomingPtr already prevents the
+          // store from being scheduled before the CopyFromReg. Threading
+          // CopyOp.getValue(1) (the copy's output chain) into the store
+          // makes that ordering explicit on the chain edge as well, which
+          // is the convention for memory ops chaining off their producers.
+          MemOpChains.push_back(
+              DAG.getStore(CopyOp.getValue(1), DL, ArgValue, IncomingPtr,
+                           MachinePointerInfo::getUnknownStack(MF)));
+          // Store any split parts at their respective offsets.
+          unsigned ArgPartOffset = Outs[OutIdx].PartOffset;
+          while (i + 1 != e && Outs[OutIdx + 1].OrigArgIndex == CallArgIdx) {
+            SDValue PartValue = OutVals[OutIdx + 1];
+            unsigned PartOffset = Outs[OutIdx + 1].PartOffset - ArgPartOffset;
+            SDValue Offset = DAG.getIntPtrConstant(PartOffset, DL);
+            SDValue Addr =
+                DAG.getNode(ISD::ADD, DL, PtrVT, IncomingPtr, Offset);
+            MemOpChains.push_back(
+                DAG.getStore(CopyOp.getValue(1), DL, PartValue, Addr,
+                             MachinePointerInfo::getUnknownStack(MF)));
+            ++i;
+            ++OutIdx;
+          }
+        }
+        ArgValue = IncomingPtr;
+
+        // Skip any remaining split parts (for forwarded args, they are
+        // covered by the forwarded pointer).
+        while (i + 1 != e && Outs[OutIdx + 1].OrigArgIndex == CallArgIdx) {
+          ++i;
+          ++OutIdx;
+        }
+      } else {
+        // Store the argument in a stack slot and pass its address.
+        Align StackAlign =
+            std::max(getPrefTypeAlign(Outs[OutIdx].ArgVT, DAG),
+                     getPrefTypeAlign(ArgValue.getValueType(), DAG));
+        TypeSize StoredSize = ArgValue.getValueType().getStoreSize();
+        // If the original argument was split and passed by reference, we need
+        // to store the required parts of it here (and pass just one address).
+        unsigned ArgIndex = Outs[OutIdx].OrigArgIndex;
+        unsigned ArgPartOffset = Outs[OutIdx].PartOffset;
+        assert(ArgPartOffset == 0);
+        // Calculate the total size to store. We don't have access to what 
we're
+        // actually storing other than performing the loop and collecting the
+        // info.
+        SmallVector<std::pair<SDValue, SDValue>> Parts;
+        while (i + 1 != e && Outs[OutIdx + 1].OrigArgIndex == ArgIndex) {
+          SDValue PartValue = OutVals[OutIdx + 1];
+          unsigned PartOffset = Outs[OutIdx + 1].PartOffset - ArgPartOffset;
+          SDValue Offset = DAG.getIntPtrConstant(PartOffset, DL);
+          EVT PartVT = PartValue.getValueType();
+          StoredSize += PartVT.getStoreSize();
+          StackAlign = std::max(StackAlign, getPrefTypeAlign(PartVT, DAG));
+          Parts.push_back(std::make_pair(PartValue, Offset));
+          ++i;
+          ++OutIdx;
+        }
+        SDValue SpillSlot = DAG.CreateStackTemporary(StoredSize, StackAlign);
+        int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
         MemOpChains.push_back(
-            DAG.getStore(Chain, DL, PartValue, Address,
+            DAG.getStore(Chain, DL, ArgValue, SpillSlot,
                          MachinePointerInfo::getFixedStack(MF, FI)));
+        for (const auto &Part : Parts) {
+          SDValue PartValue = Part.first;
+          SDValue PartOffset = Part.second;
+          SDValue Address =
+              DAG.getNode(ISD::ADD, DL, PtrVT, SpillSlot, PartOffset);
+          MemOpChains.push_back(
+              DAG.getStore(Chain, DL, PartValue, Address,
+                           MachinePointerInfo::getFixedStack(MF, FI)));
+        }
+        ArgValue = SpillSlot;
       }
-      ArgValue = SpillSlot;
     } else {
       ArgValue = convertValVTToLocVT(DAG, ArgValue, VA, DL);
     }
@@ -8542,8 +8681,8 @@ LoongArchTargetLowering::LowerCall(CallLoweringInfo &CLI,
       RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgValue));
     } else {
       assert(VA.isMemLoc() && "Argument not register or memory");
-      assert(!IsTailCall && "Tail call not allowed if stack is used "
-                            "for passing parameters");
+      assert((!IsTailCall || (CLI.CB && CLI.CB->isMustTailCall())) &&
+             "Tail call not allowed if stack is used for passing parameters");
 
       // Work out the address of the stack slot.
       if (!StackPtr.getNode())
diff --git a/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h 
b/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h
index 904985c189dba..7bf7171198e8a 100644
--- a/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h
+++ b/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h
@@ -14,6 +14,7 @@
 #define LLVM_LIB_TARGET_LOONGARCH_LOONGARCHMACHINEFUNCTIONINFO_H
 
 #include "LoongArchSubtarget.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 
@@ -32,6 +33,13 @@ class LoongArchMachineFunctionInfo : public 
MachineFunctionInfo {
   /// Size of stack frame to save callee saved registers
   unsigned CalleeSavedStackSize = 0;
 
+  /// Incoming indirect argument pointers saved as virtual registers, keyed by
+  /// formal parameter index. Used for musttail forwarding of indirect args.
+  /// Virtual registers (not SDValues) are used because the SelectionDAG is
+  /// cleared between basic blocks, and musttail calls may be in non-entry
+  /// blocks.
+  DenseMap<unsigned, Register> IncomingIndirectArgs;
+
   /// FrameIndex of the spill slot when there is no scavenged register in
   /// insertIndirectBranch.
   int BranchRelaxationSpillFrameIndex = -1;
@@ -63,6 +71,15 @@ class LoongArchMachineFunctionInfo : public 
MachineFunctionInfo {
   unsigned getCalleeSavedStackSize() const { return CalleeSavedStackSize; }
   void setCalleeSavedStackSize(unsigned Size) { CalleeSavedStackSize = Size; }
 
+  void setIncomingIndirectArg(unsigned ArgIndex, Register Reg) {
+    IncomingIndirectArgs[ArgIndex] = Reg;
+  }
+  Register getIncomingIndirectArg(unsigned ArgIndex) const {
+    auto It = IncomingIndirectArgs.find(ArgIndex);
+    assert(It != IncomingIndirectArgs.end() && "No incoming indirect arg");
+    return It->second;
+  }
+
   int getBranchRelaxationSpillFrameIndex() {
     return BranchRelaxationSpillFrameIndex;
   }
diff --git a/llvm/test/CodeGen/LoongArch/musttail-call.ll 
b/llvm/test/CodeGen/LoongArch/musttail-call.ll
new file mode 100644
index 0000000000000..0fe77ed802b7a
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/musttail-call.ll
@@ -0,0 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 5
+; RUN: llc -mtriple=loongarch32 %s -o - | FileCheck %s --check-prefix=LA32
+; RUN: llc -mtriple=loongarch64 %s -o - | FileCheck %s --check-prefix=LA64
+
+%struct.A = type { i32 }
+
+declare void @callee_musttail(ptr sret(%struct.A) %a)
+define void @caller_musttail(ptr sret(%struct.A) %a) {
+; LA32-LABEL: caller_musttail:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    b callee_musttail
+;
+; LA64-LABEL: caller_musttail:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    pcaddu18i $t8, %call36(callee_musttail)
+; LA64-NEXT:    jr $t8
+entry:
+  musttail call void @callee_musttail(ptr sret(%struct.A) %a)
+  ret void
+}
diff --git a/llvm/test/CodeGen/LoongArch/musttail-indirect-args.ll 
b/llvm/test/CodeGen/LoongArch/musttail-indirect-args.ll
new file mode 100644
index 0000000000000..d088d6065aa07
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/musttail-indirect-args.ll
@@ -0,0 +1,907 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 5
+; RUN: llc -mtriple=loongarch32 %s -o - | FileCheck %s --check-prefix=LA32
+; RUN: llc -mtriple=loongarch64 %s -o - | FileCheck %s --check-prefix=LA64
+
+; Test that musttail with indirect args (fp128 on LA32) forwards the incoming
+; pointer instead of creating a new stack temporary. Without this fix, the
+; pointer would dangle after the tail call deallocates the caller's frame.
+
+declare i32 @callee_musttail_indirect(fp128 %a)
+
+; fp128 is indirect on LA32 (too large for registers), direct on LA64.
+; On LA32, musttail must forward the incoming indirect pointer (a0) directly.
+define i32 @caller_musttail_indirect(fp128 %a) nounwind {
+; LA32-LABEL: caller_musttail_indirect:
+; LA32:       # %bb.0:
+; LA32-NEXT:    b callee_musttail_indirect
+;
+; LA64-LABEL: caller_musttail_indirect:
+; LA64:       # %bb.0:
+; LA64-NEXT:    pcaddu18i $t8, %call36(callee_musttail_indirect)
+; LA64-NEXT:    jr $t8
+  %call = musttail call i32 @callee_musttail_indirect(fp128 %a)
+  ret i32 %call
+}
+
+; Verify that non-musttail tail call with indirect args does NOT tail call
+; (this is the PR #184972 fix - indirect args are unsafe for regular tail 
calls).
+define void @caller_no_musttail_indirect() nounwind {
+; LA32-LABEL: caller_no_musttail_indirect:
+; LA32:       # %bb.0:
+; LA32-NEXT:    addi.w $sp, $sp, -32
+; LA32-NEXT:    st.w $ra, $sp, 28 # 4-byte Folded Spill
+; LA32-NEXT:    lu12i.w $a0, 262128
+; LA32-NEXT:    st.w $a0, $sp, 12
+; LA32-NEXT:    st.w $zero, $sp, 8
+; LA32-NEXT:    st.w $zero, $sp, 4
+; LA32-NEXT:    addi.w $a0, $sp, 0
+; LA32-NEXT:    st.w $zero, $sp, 0
+; LA32-NEXT:    bl callee_musttail_indirect
+; LA32-NEXT:    ld.w $ra, $sp, 28 # 4-byte Folded Reload
+; LA32-NEXT:    addi.w $sp, $sp, 32
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: caller_no_musttail_indirect:
+; LA64:       # %bb.0:
+; LA64-NEXT:    ori $a0, $zero, 0
+; LA64-NEXT:    lu32i.d $a0, -65536
+; LA64-NEXT:    lu52i.d $a1, $a0, 1023
+; LA64-NEXT:    move $a0, $zero
+; LA64-NEXT:    pcaddu18i $t8, %call36(callee_musttail_indirect)
+; LA64-NEXT:    jr $t8
+  %call = tail call i32 @callee_musttail_indirect(fp128 
0xL00000000000000003FFF000000000000)
+  ret void
+}
+
+; Verify that non-musttail tail call forwarding an indirect arg from the
+; caller's own parameters also does NOT tail call (the arg lives on the
+; caller's frame, which would be deallocated).
+define i32 @caller_no_musttail_forward_indirect(fp128 %a) nounwind {
+; LA32-LABEL: caller_no_musttail_forward_indirect:
+; LA32:       # %bb.0:
+; LA32-NEXT:    addi.w $sp, $sp, -32
+; LA32-NEXT:    st.w $ra, $sp, 28 # 4-byte Folded Spill
+; LA32-NEXT:    ld.w $a1, $a0, 0
+; LA32-NEXT:    ld.w $a2, $a0, 4
+; LA32-NEXT:    ld.w $a3, $a0, 8
+; LA32-NEXT:    ld.w $a0, $a0, 12
+; LA32-NEXT:    st.w $a0, $sp, 12
+; LA32-NEXT:    st.w $a3, $sp, 8
+; LA32-NEXT:    st.w $a2, $sp, 4
+; LA32-NEXT:    addi.w $a0, $sp, 0
+; LA32-NEXT:    st.w $a1, $sp, 0
+; LA32-NEXT:    bl callee_musttail_indirect
+; LA32-NEXT:    ld.w $ra, $sp, 28 # 4-byte Folded Reload
+; LA32-NEXT:    addi.w $sp, $sp, 32
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: caller_no_musttail_forward_indirect:
+; LA64:       # %bb.0:
+; LA64-NEXT:    pcaddu18i $t8, %call36(callee_musttail_indirect)
+; LA64-NEXT:    jr $t8
+  %call = tail call i32 @callee_musttail_indirect(fp128 %a)
+  ret i32 %call
+}
+
+; Test musttail with two indirect fp128 args on LA32. Both pointers must be
+; forwarded. Exercises the D...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/199637
_______________________________________________
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

Reply via email to