https://github.com/llvmbot updated 
https://github.com/llvm/llvm-project/pull/199637

>From 6f31a225da01c38787cf67f72f4b4422b3a5f022 Mon Sep 17 00:00:00 2001
From: hev <[email protected]>
Date: Tue, 26 May 2026 16:47:55 +0800
Subject: [PATCH] [LoongArch] Fix musttail with indirect arguments by
 forwarding incoming pointers (#198965)

When a `musttail` call passes arguments indirectly (fp128 or i128 on
LA32), the backend allocates a stack temporary and hands the callee a
pointer. The tail call then deallocates the caller's frame, leaving that
pointer dangling.

Fix by forwarding the incoming indirect pointers instead. They point to
the caller's caller's frame, which stays valid after the tail call.
Forwarded formal parameters reuse the pointer directly; computed values
get stored into the incoming buffer first.

The pointers are saved in virtual registers (`CopyToReg`/`CopyFromReg`)
rather than SDValues. The SelectionDAG is cleared between basic blocks
and musttail calls can appear in non-entry blocks, so storing raw
SDValues across BBs is unsound (this was the bug that led to the revert
in 501417baa60f). The vreg save only fires when the function has
musttail calls; other functions see no codegen change.

Non-musttail tail calls with indirect args are still rejected.

This uses the same strategy as the corresponding RISC-V fix (#185094).

(cherry picked from commit 19e915fc5c91645ccc4050180e9daabec30358c4)
---
 .../LoongArch/LoongArchISelLowering.cpp       | 227 ++++-
 .../LoongArch/LoongArchMachineFunctionInfo.h  |  17 +
 llvm/test/CodeGen/LoongArch/musttail-call.ll  |  20 +
 .../LoongArch/musttail-indirect-args.ll       | 907 ++++++++++++++++++
 4 files changed, 1127 insertions(+), 44 deletions(-)
 create mode 100644 llvm/test/CodeGen/LoongArch/musttail-call.ll
 create mode 100644 llvm/test/CodeGen/LoongArch/musttail-indirect-args.ll

diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp 
b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 2cfe3b2bc1a99..7d3d333efe046 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -8172,9 +8172,22 @@ SDValue LoongArchTargetLowering::LowerFormalArguments(
           "GHC calling convention requires the F and D extensions");
   }
 
+  const Function &Func = MF.getFunction();
   EVT PtrVT = getPointerTy(DAG.getDataLayout());
   MVT GRLenVT = Subtarget.getGRLenVT();
   unsigned GRLenInBytes = Subtarget.getGRLen() / 8;
+
+  // Check if this function has any musttail calls. If so, incoming indirect
+  // arg pointers must be saved in virtual registers so they survive across
+  // basic blocks (the SelectionDAG is cleared between BBs). Only do this
+  // when needed to avoid adding register pressure to non-musttail functions.
+  bool HasMusttail = llvm::any_of(Func, [](const BasicBlock &BB) {
+    return llvm::any_of(BB, [](const Instruction &I) {
+      if (const auto *CI = dyn_cast<CallInst>(&I))
+        return CI->isMustTailCall();
+      return false;
+    });
+  });
   // Used with varargs to acumulate store chains.
   std::vector<SDValue> OutChains;
 
@@ -8205,6 +8218,14 @@ SDValue LoongArchTargetLowering::LowerFormalArguments(
       InVals.push_back(DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue,
                                    MachinePointerInfo()));
       unsigned ArgIndex = Ins[InsIdx].OrigArgIndex;
+      if (HasMusttail) {
+        LoongArchMachineFunctionInfo *LAFI =
+            MF.getInfo<LoongArchMachineFunctionInfo>();
+        Register VReg =
+            MF.getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass);
+        Chain = DAG.getCopyToReg(Chain, DL, VReg, ArgValue);
+        LAFI->setIncomingIndirectArg(ArgIndex, VReg);
+      }
       unsigned ArgPartOffset = Ins[InsIdx].PartOffset;
       assert(ArgPartOffset == 0);
       while (i + 1 != e && Ins[InsIdx + 1].OrigArgIndex == ArgIndex) {
@@ -8335,6 +8356,27 @@ bool 
LoongArchTargetLowering::isEligibleForTailCallOptimization(
   auto &Caller = MF.getFunction();
   auto CallerCC = Caller.getCallingConv();
 
+  bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
+
+  // Byval parameters hand the function a pointer directly into the stack area
+  // we want to reuse during a tail call. Working around this *is* possible
+  // but less efficient and uglier in LowerCall. For musttail, there is no
+  // workaround today: a byval arg requires a local copy that becomes invalid
+  // after the tail call deallocates the caller's frame, so rejecting here
+  // (and triggering reportFatalInternalError in LowerCall) is safer than
+  // miscompiling.
+  for (auto &Arg : Outs)
+    if (Arg.Flags.isByVal())
+      return false;
+
+  // musttail bypasses the remaining checks: the checks either reject cases
+  // we handle specially (indirect args are forwarded via incoming pointers,
+  // stack-passed args reuse the matching incoming layout, sret is forwarded
+  // like any other pointer arg) or are optimizations not applicable to
+  // mandatory tail calls.
+  if (IsMustTail)
+    return true;
+
   // Do not tail call opt if the stack is used to pass parameters.
   if (CCInfo.getStackSize() != 0)
     return false;
@@ -8351,11 +8393,6 @@ bool 
LoongArchTargetLowering::isEligibleForTailCallOptimization(
   if (IsCallerStructRet || IsCalleeStructRet)
     return false;
 
-  // Do not tail call opt if either the callee or caller has a byval argument.
-  for (auto &Arg : Outs)
-    if (Arg.Flags.isByVal())
-      return false;
-
   // The callee has to preserve all registers the caller needs to preserve.
   const LoongArchRegisterInfo *TRI = Subtarget.getRegisterInfo();
   const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
@@ -8488,47 +8525,149 @@ LoongArchTargetLowering::LowerCall(CallLoweringInfo 
&CLI,
     // Promote the value if needed.
     // For now, only handle fully promoted and indirect arguments.
     if (VA.getLocInfo() == CCValAssign::Indirect) {
-      // Store the argument in a stack slot and pass its address.
-      Align StackAlign =
-          std::max(getPrefTypeAlign(Outs[OutIdx].ArgVT, DAG),
-                   getPrefTypeAlign(ArgValue.getValueType(), DAG));
-      TypeSize StoredSize = ArgValue.getValueType().getStoreSize();
-      // If the original argument was split and passed by reference, we need to
-      // store the required parts of it here (and pass just one address).
-      unsigned ArgIndex = Outs[OutIdx].OrigArgIndex;
-      unsigned ArgPartOffset = Outs[OutIdx].PartOffset;
-      assert(ArgPartOffset == 0);
-      // Calculate the total size to store. We don't have access to what we're
-      // actually storing other than performing the loop and collecting the
-      // info.
-      SmallVector<std::pair<SDValue, SDValue>> Parts;
-      while (i + 1 != e && Outs[OutIdx + 1].OrigArgIndex == ArgIndex) {
-        SDValue PartValue = OutVals[OutIdx + 1];
-        unsigned PartOffset = Outs[OutIdx + 1].PartOffset - ArgPartOffset;
-        SDValue Offset = DAG.getIntPtrConstant(PartOffset, DL);
-        EVT PartVT = PartValue.getValueType();
+      // For musttail calls, reuse incoming indirect pointers instead of
+      // creating new stack temporaries. The incoming pointers point to the
+      // caller's caller's frame, which remains valid after a tail call.
+      if (IsTailCall && CLI.CB && CLI.CB->isMustTailCall()) {
+        LoongArchMachineFunctionInfo *LAFI =
+            MF.getInfo<LoongArchMachineFunctionInfo>();
+        unsigned CallArgIdx = Outs[OutIdx].OrigArgIndex;
+
+        // Resolve which formal parameter is being passed at this call
+        // position.
+        //
+        // FIXME: Ins[].OrigArgIndex is Argument::getArgNo() (unfiltered),
+        // but Outs[].OrigArgIndex is an index into a filtered arg list
+        // (empty types removed, via CallLoweringInfo in the target-
+        // independent layer). IncomingIndirectArgs is keyed by the
+        // caller's unfiltered Argument::getArgNo(), so we have to walk
+        // the caller's formals (same filter) to translate the index.
+        // This target-independent asymmetry should be normalized so
+        // backends do not need to re-derive the mapping.
+        //
+        // Steps:
+        // 1. Find the call operand at filtered position CallArgIdx.
+        // 2. If it is an Argument, use getArgNo() directly (same filter
+        //    for caller formals and call operands).
+        // 3. Otherwise (computed value), walk the caller's formals and
+        //    skip empty types to map the filtered index to getArgNo().
+        const Argument *FormalArg = nullptr;
+        unsigned FilteredIdx = 0;
+        for (const auto &CallArg : CLI.CB->args()) {
+          if (CallArg->getType()->isEmptyTy())
+            continue;
+          if (FilteredIdx == CallArgIdx) {
+            FormalArg = dyn_cast<Argument>(CallArg);
+            break;
+          }
+          ++FilteredIdx;
+        }
 
-        StoredSize += PartVT.getStoreSize();
-        StackAlign = std::max(StackAlign, getPrefTypeAlign(PartVT, DAG));
-        Parts.push_back(std::make_pair(PartValue, Offset));
-        ++i;
-        ++OutIdx;
-      }
-      SDValue SpillSlot = DAG.CreateStackTemporary(StoredSize, StackAlign);
-      int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
-      MemOpChains.push_back(
-          DAG.getStore(Chain, DL, ArgValue, SpillSlot,
-                       MachinePointerInfo::getFixedStack(MF, FI)));
-      for (const auto &Part : Parts) {
-        SDValue PartValue = Part.first;
-        SDValue PartOffset = Part.second;
-        SDValue Address =
-            DAG.getNode(ISD::ADD, DL, PtrVT, SpillSlot, PartOffset);
+        // For forwarded args, getArgNo() gives the unfiltered index directly.
+        // For computed args, walk the caller's formals to resolve it.
+        unsigned FormalArgIdx = CallArgIdx;
+        if (FormalArg) {
+          FormalArgIdx = FormalArg->getArgNo();
+        } else {
+          FilteredIdx = 0;
+          for (const auto &Arg : MF.getFunction().args()) {
+            if (Arg.getType()->isEmptyTy())
+              continue;
+            if (FilteredIdx == CallArgIdx) {
+              FormalArgIdx = Arg.getArgNo();
+              break;
+            }
+            ++FilteredIdx;
+          }
+        }
+
+        Register VReg = LAFI->getIncomingIndirectArg(FormalArgIdx);
+        SDValue CopyOp = DAG.getCopyFromReg(Chain, DL, VReg, PtrVT);
+        // Thread the CopyFromReg output chain through MemOpChains so the
+        // TokenFactor below sequences the copy with any stores we emit
+        // for this argument.
+        MemOpChains.push_back(CopyOp.getValue(1));
+        SDValue IncomingPtr = CopyOp;
+
+        if (!FormalArg) {
+          // Computed value: store into the incoming indirect pointer for the
+          // same-position formal parameter (musttail guarantees matching
+          // prototypes, so types match). The pointer survives the tail call
+          // since it points to the caller's caller's frame.
+          //
+          // The data-flow edge through IncomingPtr already prevents the
+          // store from being scheduled before the CopyFromReg. Threading
+          // CopyOp.getValue(1) (the copy's output chain) into the store
+          // makes that ordering explicit on the chain edge as well, which
+          // is the convention for memory ops chaining off their producers.
+          MemOpChains.push_back(
+              DAG.getStore(CopyOp.getValue(1), DL, ArgValue, IncomingPtr,
+                           MachinePointerInfo::getUnknownStack(MF)));
+          // Store any split parts at their respective offsets.
+          unsigned ArgPartOffset = Outs[OutIdx].PartOffset;
+          while (i + 1 != e && Outs[OutIdx + 1].OrigArgIndex == CallArgIdx) {
+            SDValue PartValue = OutVals[OutIdx + 1];
+            unsigned PartOffset = Outs[OutIdx + 1].PartOffset - ArgPartOffset;
+            SDValue Offset = DAG.getIntPtrConstant(PartOffset, DL);
+            SDValue Addr =
+                DAG.getNode(ISD::ADD, DL, PtrVT, IncomingPtr, Offset);
+            MemOpChains.push_back(
+                DAG.getStore(CopyOp.getValue(1), DL, PartValue, Addr,
+                             MachinePointerInfo::getUnknownStack(MF)));
+            ++i;
+            ++OutIdx;
+          }
+        }
+        ArgValue = IncomingPtr;
+
+        // Skip any remaining split parts (for forwarded args, they are
+        // covered by the forwarded pointer).
+        while (i + 1 != e && Outs[OutIdx + 1].OrigArgIndex == CallArgIdx) {
+          ++i;
+          ++OutIdx;
+        }
+      } else {
+        // Store the argument in a stack slot and pass its address.
+        Align StackAlign =
+            std::max(getPrefTypeAlign(Outs[OutIdx].ArgVT, DAG),
+                     getPrefTypeAlign(ArgValue.getValueType(), DAG));
+        TypeSize StoredSize = ArgValue.getValueType().getStoreSize();
+        // If the original argument was split and passed by reference, we need
+        // to store the required parts of it here (and pass just one address).
+        unsigned ArgIndex = Outs[OutIdx].OrigArgIndex;
+        unsigned ArgPartOffset = Outs[OutIdx].PartOffset;
+        assert(ArgPartOffset == 0);
+        // Calculate the total size to store. We don't have access to what we're
+        // actually storing other than performing the loop and collecting the
+        // info.
+        SmallVector<std::pair<SDValue, SDValue>> Parts;
+        while (i + 1 != e && Outs[OutIdx + 1].OrigArgIndex == ArgIndex) {
+          SDValue PartValue = OutVals[OutIdx + 1];
+          unsigned PartOffset = Outs[OutIdx + 1].PartOffset - ArgPartOffset;
+          SDValue Offset = DAG.getIntPtrConstant(PartOffset, DL);
+          EVT PartVT = PartValue.getValueType();
+          StoredSize += PartVT.getStoreSize();
+          StackAlign = std::max(StackAlign, getPrefTypeAlign(PartVT, DAG));
+          Parts.push_back(std::make_pair(PartValue, Offset));
+          ++i;
+          ++OutIdx;
+        }
+        SDValue SpillSlot = DAG.CreateStackTemporary(StoredSize, StackAlign);
+        int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
         MemOpChains.push_back(
-            DAG.getStore(Chain, DL, PartValue, Address,
+            DAG.getStore(Chain, DL, ArgValue, SpillSlot,
                          MachinePointerInfo::getFixedStack(MF, FI)));
+        for (const auto &Part : Parts) {
+          SDValue PartValue = Part.first;
+          SDValue PartOffset = Part.second;
+          SDValue Address =
+              DAG.getNode(ISD::ADD, DL, PtrVT, SpillSlot, PartOffset);
+          MemOpChains.push_back(
+              DAG.getStore(Chain, DL, PartValue, Address,
+                           MachinePointerInfo::getFixedStack(MF, FI)));
+        }
+        ArgValue = SpillSlot;
       }
-      ArgValue = SpillSlot;
     } else {
       ArgValue = convertValVTToLocVT(DAG, ArgValue, VA, DL);
     }
@@ -8542,8 +8681,8 @@ LoongArchTargetLowering::LowerCall(CallLoweringInfo &CLI,
       RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgValue));
     } else {
       assert(VA.isMemLoc() && "Argument not register or memory");
-      assert(!IsTailCall && "Tail call not allowed if stack is used "
-                            "for passing parameters");
+      assert((!IsTailCall || (CLI.CB && CLI.CB->isMustTailCall())) &&
+             "Tail call not allowed if stack is used for passing parameters");
 
       // Work out the address of the stack slot.
       if (!StackPtr.getNode())
diff --git a/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h 
b/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h
index 904985c189dba..7bf7171198e8a 100644
--- a/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h
+++ b/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h
@@ -14,6 +14,7 @@
 #define LLVM_LIB_TARGET_LOONGARCH_LOONGARCHMACHINEFUNCTIONINFO_H
 
 #include "LoongArchSubtarget.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 
@@ -32,6 +33,13 @@ class LoongArchMachineFunctionInfo : public 
MachineFunctionInfo {
   /// Size of stack frame to save callee saved registers
   unsigned CalleeSavedStackSize = 0;
 
+  /// Incoming indirect argument pointers saved as virtual registers, keyed by
+  /// formal parameter index. Used for musttail forwarding of indirect args.
+  /// Virtual registers (not SDValues) are used because the SelectionDAG is
+  /// cleared between basic blocks, and musttail calls may be in non-entry
+  /// blocks.
+  DenseMap<unsigned, Register> IncomingIndirectArgs;
+
   /// FrameIndex of the spill slot when there is no scavenged register in
   /// insertIndirectBranch.
   int BranchRelaxationSpillFrameIndex = -1;
@@ -63,6 +71,15 @@ class LoongArchMachineFunctionInfo : public 
MachineFunctionInfo {
   unsigned getCalleeSavedStackSize() const { return CalleeSavedStackSize; }
   void setCalleeSavedStackSize(unsigned Size) { CalleeSavedStackSize = Size; }
 
+  void setIncomingIndirectArg(unsigned ArgIndex, Register Reg) {
+    IncomingIndirectArgs[ArgIndex] = Reg;
+  }
+  Register getIncomingIndirectArg(unsigned ArgIndex) const {
+    auto It = IncomingIndirectArgs.find(ArgIndex);
+    assert(It != IncomingIndirectArgs.end() && "No incoming indirect arg");
+    return It->second;
+  }
+
   int getBranchRelaxationSpillFrameIndex() {
     return BranchRelaxationSpillFrameIndex;
   }
diff --git a/llvm/test/CodeGen/LoongArch/musttail-call.ll 
b/llvm/test/CodeGen/LoongArch/musttail-call.ll
new file mode 100644
index 0000000000000..0fe77ed802b7a
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/musttail-call.ll
@@ -0,0 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 5
+; RUN: llc -mtriple=loongarch32 %s -o - | FileCheck %s --check-prefix=LA32
+; RUN: llc -mtriple=loongarch64 %s -o - | FileCheck %s --check-prefix=LA64
+
+%struct.A = type { i32 }
+
+declare void @callee_musttail(ptr sret(%struct.A) %a)
+define void @caller_musttail(ptr sret(%struct.A) %a) {
+; LA32-LABEL: caller_musttail:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    b callee_musttail
+;
+; LA64-LABEL: caller_musttail:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    pcaddu18i $t8, %call36(callee_musttail)
+; LA64-NEXT:    jr $t8
+entry:
+  musttail call void @callee_musttail(ptr sret(%struct.A) %a)
+  ret void
+}
diff --git a/llvm/test/CodeGen/LoongArch/musttail-indirect-args.ll 
b/llvm/test/CodeGen/LoongArch/musttail-indirect-args.ll
new file mode 100644
index 0000000000000..d088d6065aa07
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/musttail-indirect-args.ll
@@ -0,0 +1,907 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 5
+; RUN: llc -mtriple=loongarch32 %s -o - | FileCheck %s --check-prefix=LA32
+; RUN: llc -mtriple=loongarch64 %s -o - | FileCheck %s --check-prefix=LA64
+
+; Test that musttail with indirect args (fp128 on LA32) forwards the incoming
+; pointer instead of creating a new stack temporary. Without this fix, the
+; pointer would dangle after the tail call deallocates the caller's frame.
+
+declare i32 @callee_musttail_indirect(fp128 %a)
+
+; fp128 is indirect on LA32 (too large for registers), direct on LA64.
+; On LA32, musttail must forward the incoming indirect pointer (a0) directly.
+define i32 @caller_musttail_indirect(fp128 %a) nounwind {
+; LA32-LABEL: caller_musttail_indirect:
+; LA32:       # %bb.0:
+; LA32-NEXT:    b callee_musttail_indirect
+;
+; LA64-LABEL: caller_musttail_indirect:
+; LA64:       # %bb.0:
+; LA64-NEXT:    pcaddu18i $t8, %call36(callee_musttail_indirect)
+; LA64-NEXT:    jr $t8
+  %call = musttail call i32 @callee_musttail_indirect(fp128 %a)
+  ret i32 %call
+}
+
+; Verify that non-musttail tail call with indirect args does NOT tail call
+; (this is the PR #184972 fix - indirect args are unsafe for regular tail calls).
+define void @caller_no_musttail_indirect() nounwind {
+; LA32-LABEL: caller_no_musttail_indirect:
+; LA32:       # %bb.0:
+; LA32-NEXT:    addi.w $sp, $sp, -32
+; LA32-NEXT:    st.w $ra, $sp, 28 # 4-byte Folded Spill
+; LA32-NEXT:    lu12i.w $a0, 262128
+; LA32-NEXT:    st.w $a0, $sp, 12
+; LA32-NEXT:    st.w $zero, $sp, 8
+; LA32-NEXT:    st.w $zero, $sp, 4
+; LA32-NEXT:    addi.w $a0, $sp, 0
+; LA32-NEXT:    st.w $zero, $sp, 0
+; LA32-NEXT:    bl callee_musttail_indirect
+; LA32-NEXT:    ld.w $ra, $sp, 28 # 4-byte Folded Reload
+; LA32-NEXT:    addi.w $sp, $sp, 32
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: caller_no_musttail_indirect:
+; LA64:       # %bb.0:
+; LA64-NEXT:    ori $a0, $zero, 0
+; LA64-NEXT:    lu32i.d $a0, -65536
+; LA64-NEXT:    lu52i.d $a1, $a0, 1023
+; LA64-NEXT:    move $a0, $zero
+; LA64-NEXT:    pcaddu18i $t8, %call36(callee_musttail_indirect)
+; LA64-NEXT:    jr $t8
+  %call = tail call i32 @callee_musttail_indirect(fp128 
0xL00000000000000003FFF000000000000)
+  ret void
+}
+
+; Verify that non-musttail tail call forwarding an indirect arg from the
+; caller's own parameters also does NOT tail call (the arg is copied to a
+; temporary on the caller's frame, which would be deallocated).
+define i32 @caller_no_musttail_forward_indirect(fp128 %a) nounwind {
+; LA32-LABEL: caller_no_musttail_forward_indirect:
+; LA32:       # %bb.0:
+; LA32-NEXT:    addi.w $sp, $sp, -32
+; LA32-NEXT:    st.w $ra, $sp, 28 # 4-byte Folded Spill
+; LA32-NEXT:    ld.w $a1, $a0, 0
+; LA32-NEXT:    ld.w $a2, $a0, 4
+; LA32-NEXT:    ld.w $a3, $a0, 8
+; LA32-NEXT:    ld.w $a0, $a0, 12
+; LA32-NEXT:    st.w $a0, $sp, 12
+; LA32-NEXT:    st.w $a3, $sp, 8
+; LA32-NEXT:    st.w $a2, $sp, 4
+; LA32-NEXT:    addi.w $a0, $sp, 0
+; LA32-NEXT:    st.w $a1, $sp, 0
+; LA32-NEXT:    bl callee_musttail_indirect
+; LA32-NEXT:    ld.w $ra, $sp, 28 # 4-byte Folded Reload
+; LA32-NEXT:    addi.w $sp, $sp, 32
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: caller_no_musttail_forward_indirect:
+; LA64:       # %bb.0:
+; LA64-NEXT:    pcaddu18i $t8, %call36(callee_musttail_indirect)
+; LA64-NEXT:    jr $t8
+  %call = tail call i32 @callee_musttail_indirect(fp128 %a)
+  ret i32 %call
+}
+
+; Test musttail with two indirect fp128 args on LA32. Both pointers must be
+; forwarded. Exercises the DenseMap with two distinct OrigArgIndex values.
+declare i32 @callee_musttail_two_indirect(fp128 %a, fp128 %b)
+
+define i32 @caller_musttail_two_indirect(fp128 %a, fp128 %b) nounwind {
+; LA32-LABEL: caller_musttail_two_indirect:
+; LA32:       # %bb.0:
+; LA32-NEXT:    b callee_musttail_two_indirect
+;
+; LA64-LABEL: caller_musttail_two_indirect:
+; LA64:       # %bb.0:
+; LA64-NEXT:    pcaddu18i $t8, %call36(callee_musttail_two_indirect)
+; LA64-NEXT:    jr $t8
+  %call = musttail call i32 @callee_musttail_two_indirect(fp128 %a, fp128 %b)
+  ret i32 %call
+}
+
+; Test musttail with mixed direct (i32 in register) + indirect (fp128) args.
+; Confirms OrigArgIndex lookup works when not all args are indirect.
+declare i32 @callee_musttail_mixed(i32 %x, fp128 %a)
+
+define i32 @caller_musttail_mixed(i32 %x, fp128 %a) nounwind {
+; LA32-LABEL: caller_musttail_mixed:
+; LA32:       # %bb.0:
+; LA32-NEXT:    b callee_musttail_mixed
+;
+; LA64-LABEL: caller_musttail_mixed:
+; LA64:       # %bb.0:
+; LA64-NEXT:    pcaddu18i $t8, %call36(callee_musttail_mixed)
+; LA64-NEXT:    jr $t8
+  %call = musttail call i32 @callee_musttail_mixed(i32 %x, fp128 %a)
+  ret i32 %call
+}
+
+; Test musttail with i128 on LA32 (indirect, split into 4 x i32 parts).
+declare i64 @callee_musttail_i128(i128 %a)
+
+define i64 @caller_musttail_i128(i128 %a) nounwind {
+; LA32-LABEL: caller_musttail_i128:
+; LA32:       # %bb.0:
+; LA32-NEXT:    b callee_musttail_i128
+;
+; LA64-LABEL: caller_musttail_i128:
+; LA64:       # %bb.0:
+; LA64-NEXT:    pcaddu18i $t8, %call36(callee_musttail_i128)
+; LA64-NEXT:    jr $t8
+  %call = musttail call i64 @callee_musttail_i128(i128 %a)
+  ret i64 %call
+}
+
+; Test musttail with i128 (indirect+split on LA32) plus a trailing i32 direct arg.
+; Exercises the split-skip logic followed by a normal register arg.
+declare i64 @callee_musttail_i128_and_i32(i128 %a, i32 %x)
+
+define i64 @caller_musttail_i128_and_i32(i128 %a, i32 %x) nounwind {
+; LA32-LABEL: caller_musttail_i128_and_i32:
+; LA32:       # %bb.0:
+; LA32-NEXT:    b callee_musttail_i128_and_i32
+;
+; LA64-LABEL: caller_musttail_i128_and_i32:
+; LA64:       # %bb.0:
+; LA64-NEXT:    pcaddu18i $t8, %call36(callee_musttail_i128_and_i32)
+; LA64-NEXT:    jr $t8
+  %call = musttail call i64 @callee_musttail_i128_and_i32(i128 %a, i32 %x)
+  ret i64 %call
+}
+
+; Test musttail with two indirect args SWAPPED. The pointers must be exchanged
+; before the tail call. This exercises the OrigArgIndex -> Argument::getArgNo()
+; resolution in LowerCall.
+define i32 @caller_musttail_two_indirect_swapped(fp128 %a, fp128 %b) nounwind {
+; LA32-LABEL: caller_musttail_two_indirect_swapped:
+; LA32:       # %bb.0:
+; LA32-NEXT:    move $a2, $a0
+; LA32-NEXT:    move $a0, $a1
+; LA32-NEXT:    move $a1, $a2
+; LA32-NEXT:    b callee_musttail_two_indirect
+;
+; LA64-LABEL: caller_musttail_two_indirect_swapped:
+; LA64:       # %bb.0:
+; LA64-NEXT:    move $a4, $a1
+; LA64-NEXT:    move $a5, $a0
+; LA64-NEXT:    move $a0, $a2
+; LA64-NEXT:    move $a1, $a3
+; LA64-NEXT:    move $a2, $a5
+; LA64-NEXT:    move $a3, $a4
+; LA64-NEXT:    pcaddu18i $t8, %call36(callee_musttail_two_indirect)
+; LA64-NEXT:    jr $t8
+  %call = musttail call i32 @callee_musttail_two_indirect(fp128 %b, fp128 %a)
+  ret i32 %call
+}
+
+; Test musttail with three indirect args rotated: call @f(%c, %a, %b).
+; All three pointers need to be shuffled.
+declare i32 @callee_musttail_three_indirect(fp128 %a, fp128 %b, fp128 %c)
+
+define i32 @caller_musttail_three_indirect_rotated(fp128 %a, fp128 %b, fp128 
%c) nounwind {
+; LA32-LABEL: caller_musttail_three_indirect_rotated:
+; LA32:       # %bb.0:
+; LA32-NEXT:    move $a3, $a1
+; LA32-NEXT:    move $a1, $a0
+; LA32-NEXT:    move $a0, $a2
+; LA32-NEXT:    move $a2, $a3
+; LA32-NEXT:    b callee_musttail_three_indirect
+;
+; LA64-LABEL: caller_musttail_three_indirect_rotated:
+; LA64:       # %bb.0:
+; LA64-NEXT:    move $a6, $a3
+; LA64-NEXT:    move $a7, $a2
+; LA64-NEXT:    move $a3, $a1
+; LA64-NEXT:    move $a2, $a0
+; LA64-NEXT:    move $a0, $a4
+; LA64-NEXT:    move $a1, $a5
+; LA64-NEXT:    move $a4, $a7
+; LA64-NEXT:    move $a5, $a6
+; LA64-NEXT:    pcaddu18i $t8, %call36(callee_musttail_three_indirect)
+; LA64-NEXT:    jr $t8
+  %call = musttail call i32 @callee_musttail_three_indirect(fp128 %c, fp128 
%a, fp128 %b)
+  ret i32 %call
+}
+
+; Test musttail with mixed direct + indirect args where the indirect args
+; are swapped but the direct arg stays in place.
+declare i32 @callee_musttail_mixed_two_indirect(i32 %x, fp128 %a, fp128 %b)
+
+define i32 @caller_musttail_mixed_swap_indirect(i32 %x, fp128 %a, fp128 %b) 
nounwind {
+; LA32-LABEL: caller_musttail_mixed_swap_indirect:
+; LA32:       # %bb.0:
+; LA32-NEXT:    move $a3, $a1
+; LA32-NEXT:    move $a1, $a2
+; LA32-NEXT:    move $a2, $a3
+; LA32-NEXT:    b callee_musttail_mixed_two_indirect
+;
+; LA64-LABEL: caller_musttail_mixed_swap_indirect:
+; LA64:       # %bb.0:
+; LA64-NEXT:    move $a5, $a2
+; LA64-NEXT:    move $a6, $a1
+; LA64-NEXT:    move $a1, $a3
+; LA64-NEXT:    move $a2, $a4
+; LA64-NEXT:    move $a3, $a6
+; LA64-NEXT:    move $a4, $a5
+; LA64-NEXT:    pcaddu18i $t8, %call36(callee_musttail_mixed_two_indirect)
+; LA64-NEXT:    jr $t8
+  %call = musttail call i32 @callee_musttail_mixed_two_indirect(i32 %x, fp128 
%b, fp128 %a)
+  ret i32 %call
+}
+
+; Test musttail with swapped i128 on LA32 (split indirect args).
+declare i64 @callee_musttail_two_i128(i128 %a, i128 %b)
+
+define i64 @caller_musttail_two_i128_swapped(i128 %a, i128 %b) nounwind {
+; LA32-LABEL: caller_musttail_two_i128_swapped:
+; LA32:       # %bb.0:
+; LA32-NEXT:    move $a2, $a0
+; LA32-NEXT:    move $a0, $a1
+; LA32-NEXT:    move $a1, $a2
+; LA32-NEXT:    b callee_musttail_two_i128
+;
+; LA64-LABEL: caller_musttail_two_i128_swapped:
+; LA64:       # %bb.0:
+; LA64-NEXT:    move $a4, $a1
+; LA64-NEXT:    move $a5, $a0
+; LA64-NEXT:    move $a0, $a2
+; LA64-NEXT:    move $a1, $a3
+; LA64-NEXT:    move $a2, $a5
+; LA64-NEXT:    move $a3, $a4
+; LA64-NEXT:    pcaddu18i $t8, %call36(callee_musttail_two_i128)
+; LA64-NEXT:    jr $t8
+  %call = musttail call i64 @callee_musttail_two_i128(i128 %b, i128 %a)
+  ret i64 %call
+}
+
+; Test musttail passing the same indirect arg to both positions.
+define i32 @caller_musttail_two_indirect_dup(fp128 %a, fp128 %b) nounwind {
+; LA32-LABEL: caller_musttail_two_indirect_dup:
+; LA32:       # %bb.0:
+; LA32-NEXT:    move $a1, $a0
+; LA32-NEXT:    b callee_musttail_two_indirect
+;
+; LA64-LABEL: caller_musttail_two_indirect_dup:
+; LA64:       # %bb.0:
+; LA64-NEXT:    move $a2, $a0
+; LA64-NEXT:    move $a3, $a1
+; LA64-NEXT:    pcaddu18i $t8, %call36(callee_musttail_two_indirect)
+; LA64-NEXT:    jr $t8
+  %call = musttail call i32 @callee_musttail_two_indirect(fp128 %a, fp128 %a)
+  ret i32 %call
+}
+
+; Test musttail with enough indirect args to spill to the stack (9 fp128 on
+; LA32 uses a0-a7 for the first 8 pointers, 9th goes on the stack).
+declare void @callee_musttail_nine_indirect(fp128, fp128, fp128, fp128, fp128, 
fp128, fp128, fp128, fp128)
+
+define void @caller_musttail_nine_indirect(fp128 %a, fp128 %b, fp128 %c, fp128 
%d, fp128 %e, fp128 %f, fp128 %g, fp128 %h, fp128 %i) nounwind {
+; LA32-LABEL: caller_musttail_nine_indirect:
+; LA32:       # %bb.0:
+; LA32-NEXT:    ld.w $t0, $sp, 0
+; LA32-NEXT:    st.w $t0, $sp, 0
+; LA32-NEXT:    b callee_musttail_nine_indirect
+;
+; LA64-LABEL: caller_musttail_nine_indirect:
+; LA64:       # %bb.0:
+; LA64-NEXT:    vld $vr0, $sp, 0
+; LA64-NEXT:    vld $vr1, $sp, 64
+; LA64-NEXT:    vld $vr2, $sp, 48
+; LA64-NEXT:    vld $vr3, $sp, 32
+; LA64-NEXT:    vld $vr4, $sp, 16
+; LA64-NEXT:    vst $vr1, $sp, 64
+; LA64-NEXT:    vst $vr2, $sp, 48
+; LA64-NEXT:    vst $vr3, $sp, 32
+; LA64-NEXT:    vst $vr4, $sp, 16
+; LA64-NEXT:    vst $vr0, $sp, 0
+; LA64-NEXT:    pcaddu18i $t8, %call36(callee_musttail_nine_indirect)
+; LA64-NEXT:    jr $t8
+  musttail call void @callee_musttail_nine_indirect(fp128 %a, fp128 %b, fp128 
%c, fp128 %d, fp128 %e, fp128 %f, fp128 %g, fp128 %h, fp128 %i)
+  ret void
+}
+
+; Test musttail swapping the first (register) and last (stack-spilled) args.
+define void @caller_musttail_nine_indirect_swap_first_last(fp128 %a, fp128 %b, 
fp128 %c, fp128 %d, fp128 %e, fp128 %f, fp128 %g, fp128 %h, fp128 %i) nounwind {
+; LA32-LABEL: caller_musttail_nine_indirect_swap_first_last:
+; LA32:       # %bb.0:
+; LA32-NEXT:    ld.w $t0, $sp, 0
+; LA32-NEXT:    st.w $a0, $sp, 0
+; LA32-NEXT:    move $a0, $t0
+; LA32-NEXT:    b callee_musttail_nine_indirect
+;
+; LA64-LABEL: caller_musttail_nine_indirect_swap_first_last:
+; LA64:       # %bb.0:
+; LA64-NEXT:    ld.d $t0, $sp, 64
+; LA64-NEXT:    ld.d $t1, $sp, 72
+; LA64-NEXT:    vld $vr0, $sp, 0
+; LA64-NEXT:    vld $vr1, $sp, 16
+; LA64-NEXT:    vld $vr2, $sp, 48
+; LA64-NEXT:    vld $vr3, $sp, 32
+; LA64-NEXT:    st.d $a1, $sp, 72
+; LA64-NEXT:    st.d $a0, $sp, 64
+; LA64-NEXT:    vst $vr2, $sp, 48
+; LA64-NEXT:    vst $vr3, $sp, 32
+; LA64-NEXT:    vst $vr1, $sp, 16
+; LA64-NEXT:    vst $vr0, $sp, 0
+; LA64-NEXT:    move $a0, $t0
+; LA64-NEXT:    move $a1, $t1
+; LA64-NEXT:    pcaddu18i $t8, %call36(callee_musttail_nine_indirect)
+; LA64-NEXT:    jr $t8
+  musttail call void @callee_musttail_nine_indirect(fp128 %i, fp128 %b, fp128 
%c, fp128 %d, fp128 %e, fp128 %f, fp128 %g, fp128 %h, fp128 %a)
+  ret void
+}
+
+; Test musttail where the indirect arg is a computed value, not a forwarded
+; formal parameter. The computed value must be stored into the incoming
+; indirect pointer before tail calling.
+define i32 @caller_musttail_computed(fp128 %a) nounwind {
+; LA32-LABEL: caller_musttail_computed:
+; LA32:       # %bb.0:
+; LA32-NEXT:    addi.w $sp, $sp, -64
+; LA32-NEXT:    st.w $ra, $sp, 60 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $fp, $sp, 56 # 4-byte Folded Spill
+; LA32-NEXT:    move $fp, $a0
+; LA32-NEXT:    ld.w $a3, $a0, 4
+; LA32-NEXT:    ld.w $a0, $a0, 8
+; LA32-NEXT:    ld.w $a1, $fp, 12
+; LA32-NEXT:    ld.w $a2, $fp, 0
+; LA32-NEXT:    st.w $a2, $sp, 8
+; LA32-NEXT:    st.w $a2, $sp, 24
+; LA32-NEXT:    st.w $a1, $sp, 20
+; LA32-NEXT:    st.w $a0, $sp, 16
+; LA32-NEXT:    st.w $a3, $sp, 12
+; LA32-NEXT:    st.w $a1, $sp, 36
+; LA32-NEXT:    st.w $a0, $sp, 32
+; LA32-NEXT:    addi.w $a0, $sp, 40
+; LA32-NEXT:    addi.w $a1, $sp, 24
+; LA32-NEXT:    addi.w $a2, $sp, 8
+; LA32-NEXT:    st.w $a3, $sp, 28
+; LA32-NEXT:    bl __addtf3
+; LA32-NEXT:    ld.w $a0, $sp, 40
+; LA32-NEXT:    ld.w $a1, $sp, 44
+; LA32-NEXT:    ld.w $a2, $sp, 48
+; LA32-NEXT:    ld.w $a3, $sp, 52
+; LA32-NEXT:    st.w $a0, $fp, 0
+; LA32-NEXT:    st.w $a1, $fp, 4
+; LA32-NEXT:    st.w $a2, $fp, 8
+; LA32-NEXT:    st.w $a3, $fp, 12
+; LA32-NEXT:    move $a0, $fp
+; LA32-NEXT:    ld.w $fp, $sp, 56 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $ra, $sp, 60 # 4-byte Folded Reload
+; LA32-NEXT:    addi.w $sp, $sp, 64
+; LA32-NEXT:    b callee_musttail_indirect
+;
+; LA64-LABEL: caller_musttail_computed:
+; LA64:       # %bb.0:
+; LA64-NEXT:    addi.d $sp, $sp, -16
+; LA64-NEXT:    st.d $ra, $sp, 8 # 8-byte Folded Spill
+; LA64-NEXT:    move $a2, $a0
+; LA64-NEXT:    move $a3, $a1
+; LA64-NEXT:    pcaddu18i $ra, %call36(__addtf3)
+; LA64-NEXT:    jirl $ra, $ra, 0
+; LA64-NEXT:    ld.d $ra, $sp, 8 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 16
+; LA64-NEXT:    pcaddu18i $t8, %call36(callee_musttail_indirect)
+; LA64-NEXT:    jr $t8
+  %sum = fadd fp128 %a, %a
+  %r = musttail call i32 @callee_musttail_indirect(fp128 %sum)
+  ret i32 %r
+}
+
+; Test musttail with a computed i128 on LA32 (split indirect). The add result
+; must be stored back into the incoming pointer.
+define i64 @caller_musttail_computed_i128(i128 %a) nounwind {
+; LA32-LABEL: caller_musttail_computed_i128:
+; LA32:       # %bb.0:
+; LA32-NEXT:    ld.w $a1, $a0, 0
+; LA32-NEXT:    ld.w $a2, $a0, 12
+; LA32-NEXT:    ld.w $a3, $a0, 4
+; LA32-NEXT:    ld.w $a4, $a0, 8
+; LA32-NEXT:    addi.w $a1, $a1, 1
+; LA32-NEXT:    sltui $a5, $a1, 1
+; LA32-NEXT:    add.w $a3, $a3, $a5
+; LA32-NEXT:    or $a5, $a1, $a3
+; LA32-NEXT:    sltui $a5, $a5, 1
+; LA32-NEXT:    add.w $a5, $a4, $a5
+; LA32-NEXT:    sltu $a4, $a5, $a4
+; LA32-NEXT:    add.w $a2, $a2, $a4
+; LA32-NEXT:    st.w $a1, $a0, 0
+; LA32-NEXT:    st.w $a3, $a0, 4
+; LA32-NEXT:    st.w $a5, $a0, 8
+; LA32-NEXT:    st.w $a2, $a0, 12
+; LA32-NEXT:    b callee_musttail_i128
+;
+; LA64-LABEL: caller_musttail_computed_i128:
+; LA64:       # %bb.0:
+; LA64-NEXT:    addi.d $a0, $a0, 1
+; LA64-NEXT:    sltui $a2, $a0, 1
+; LA64-NEXT:    add.d $a1, $a1, $a2
+; LA64-NEXT:    pcaddu18i $t8, %call36(callee_musttail_i128)
+; LA64-NEXT:    jr $t8
+  %sum = add i128 %a, 1
+  %r = musttail call i64 @callee_musttail_i128(i128 %sum)
+  ret i64 %r
+}
+
+; Test musttail with one computed and one forwarded indirect arg.
+; Position 0 gets the fadd result (stored into %a's incoming pointer),
+; position 1 gets %b's incoming pointer forwarded directly.
+define i32 @caller_musttail_computed_and_forwarded(fp128 %a, fp128 %b) nounwind {
+; LA32-LABEL: caller_musttail_computed_and_forwarded:
+; LA32:       # %bb.0:
+; LA32-NEXT:    addi.w $sp, $sp, -64
+; LA32-NEXT:    st.w $ra, $sp, 60 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $fp, $sp, 56 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s0, $sp, 52 # 4-byte Folded Spill
+; LA32-NEXT:    move $fp, $a1
+; LA32-NEXT:    move $s0, $a0
+; LA32-NEXT:    ld.w $a3, $a1, 4
+; LA32-NEXT:    ld.w $a0, $a1, 8
+; LA32-NEXT:    ld.w $a1, $a1, 12
+; LA32-NEXT:    ld.w $a2, $fp, 0
+; LA32-NEXT:    ld.w $a4, $s0, 4
+; LA32-NEXT:    ld.w $a5, $s0, 8
+; LA32-NEXT:    ld.w $a6, $s0, 12
+; LA32-NEXT:    ld.w $a7, $s0, 0
+; LA32-NEXT:    st.w $a7, $sp, 16
+; LA32-NEXT:    st.w $a6, $sp, 28
+; LA32-NEXT:    st.w $a5, $sp, 24
+; LA32-NEXT:    st.w $a4, $sp, 20
+; LA32-NEXT:    st.w $a2, $sp, 0
+; LA32-NEXT:    st.w $a1, $sp, 12
+; LA32-NEXT:    st.w $a0, $sp, 8
+; LA32-NEXT:    addi.w $a0, $sp, 32
+; LA32-NEXT:    addi.w $a1, $sp, 16
+; LA32-NEXT:    addi.w $a2, $sp, 0
+; LA32-NEXT:    st.w $a3, $sp, 4
+; LA32-NEXT:    bl __addtf3
+; LA32-NEXT:    ld.w $a0, $sp, 32
+; LA32-NEXT:    ld.w $a1, $sp, 36
+; LA32-NEXT:    ld.w $a2, $sp, 40
+; LA32-NEXT:    ld.w $a3, $sp, 44
+; LA32-NEXT:    st.w $a0, $s0, 0
+; LA32-NEXT:    st.w $a1, $s0, 4
+; LA32-NEXT:    st.w $a2, $s0, 8
+; LA32-NEXT:    st.w $a3, $s0, 12
+; LA32-NEXT:    move $a0, $s0
+; LA32-NEXT:    move $a1, $fp
+; LA32-NEXT:    ld.w $s0, $sp, 52 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $fp, $sp, 56 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $ra, $sp, 60 # 4-byte Folded Reload
+; LA32-NEXT:    addi.w $sp, $sp, 64
+; LA32-NEXT:    b callee_musttail_two_indirect
+;
+; LA64-LABEL: caller_musttail_computed_and_forwarded:
+; LA64:       # %bb.0:
+; LA64-NEXT:    addi.d $sp, $sp, -32
+; LA64-NEXT:    st.d $ra, $sp, 24 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $fp, $sp, 16 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s0, $sp, 8 # 8-byte Folded Spill
+; LA64-NEXT:    move $fp, $a3
+; LA64-NEXT:    move $s0, $a2
+; LA64-NEXT:    pcaddu18i $ra, %call36(__addtf3)
+; LA64-NEXT:    jirl $ra, $ra, 0
+; LA64-NEXT:    move $a2, $s0
+; LA64-NEXT:    move $a3, $fp
+; LA64-NEXT:    ld.d $s0, $sp, 8 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $fp, $sp, 16 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $ra, $sp, 24 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 32
+; LA64-NEXT:    pcaddu18i $t8, %call36(callee_musttail_two_indirect)
+; LA64-NEXT:    jr $t8
+  %sum = fadd fp128 %a, %b
+  %r = musttail call i32 @callee_musttail_two_indirect(fp128 %sum, fp128 %b)
+  ret i32 %r
+}
+
+; Test musttail with one forwarded and one computed indirect arg (reversed).
+; Position 0 forwards %a, position 1 gets the computed value.
+define i32 @caller_musttail_forwarded_and_computed(fp128 %a, fp128 %b) nounwind {
+; LA32-LABEL: caller_musttail_forwarded_and_computed:
+; LA32:       # %bb.0:
+; LA32-NEXT:    addi.w $sp, $sp, -64
+; LA32-NEXT:    st.w $ra, $sp, 60 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $fp, $sp, 56 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s0, $sp, 52 # 4-byte Folded Spill
+; LA32-NEXT:    move $fp, $a1
+; LA32-NEXT:    move $s0, $a0
+; LA32-NEXT:    ld.w $a3, $a1, 4
+; LA32-NEXT:    ld.w $a0, $a1, 8
+; LA32-NEXT:    ld.w $a1, $a1, 12
+; LA32-NEXT:    ld.w $a2, $fp, 0
+; LA32-NEXT:    ld.w $a4, $s0, 4
+; LA32-NEXT:    ld.w $a5, $s0, 8
+; LA32-NEXT:    ld.w $a6, $s0, 12
+; LA32-NEXT:    ld.w $a7, $s0, 0
+; LA32-NEXT:    st.w $a7, $sp, 16
+; LA32-NEXT:    st.w $a6, $sp, 28
+; LA32-NEXT:    st.w $a5, $sp, 24
+; LA32-NEXT:    st.w $a4, $sp, 20
+; LA32-NEXT:    st.w $a2, $sp, 0
+; LA32-NEXT:    st.w $a1, $sp, 12
+; LA32-NEXT:    st.w $a0, $sp, 8
+; LA32-NEXT:    addi.w $a0, $sp, 32
+; LA32-NEXT:    addi.w $a1, $sp, 16
+; LA32-NEXT:    addi.w $a2, $sp, 0
+; LA32-NEXT:    st.w $a3, $sp, 4
+; LA32-NEXT:    bl __addtf3
+; LA32-NEXT:    ld.w $a0, $sp, 32
+; LA32-NEXT:    ld.w $a1, $sp, 36
+; LA32-NEXT:    ld.w $a2, $sp, 40
+; LA32-NEXT:    ld.w $a3, $sp, 44
+; LA32-NEXT:    st.w $a0, $fp, 0
+; LA32-NEXT:    st.w $a1, $fp, 4
+; LA32-NEXT:    st.w $a2, $fp, 8
+; LA32-NEXT:    st.w $a3, $fp, 12
+; LA32-NEXT:    move $a0, $s0
+; LA32-NEXT:    move $a1, $fp
+; LA32-NEXT:    ld.w $s0, $sp, 52 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $fp, $sp, 56 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $ra, $sp, 60 # 4-byte Folded Reload
+; LA32-NEXT:    addi.w $sp, $sp, 64
+; LA32-NEXT:    b callee_musttail_two_indirect
+;
+; LA64-LABEL: caller_musttail_forwarded_and_computed:
+; LA64:       # %bb.0:
+; LA64-NEXT:    addi.d $sp, $sp, -32
+; LA64-NEXT:    st.d $ra, $sp, 24 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $fp, $sp, 16 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s0, $sp, 8 # 8-byte Folded Spill
+; LA64-NEXT:    move $fp, $a1
+; LA64-NEXT:    move $s0, $a0
+; LA64-NEXT:    pcaddu18i $ra, %call36(__addtf3)
+; LA64-NEXT:    jirl $ra, $ra, 0
+; LA64-NEXT:    move $a2, $a0
+; LA64-NEXT:    move $a3, $a1
+; LA64-NEXT:    move $a0, $s0
+; LA64-NEXT:    move $a1, $fp
+; LA64-NEXT:    ld.d $s0, $sp, 8 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $fp, $sp, 16 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $ra, $sp, 24 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 32
+; LA64-NEXT:    pcaddu18i $t8, %call36(callee_musttail_two_indirect)
+; LA64-NEXT:    jr $t8
+  %sum = fadd fp128 %a, %b
+  %r = musttail call i32 @callee_musttail_two_indirect(fp128 %a, fp128 %sum)
+  ret i32 %r
+}
+
+; Test musttail with both args computed. Neither can be zero-copy forwarded.
+define i32 @caller_musttail_both_computed(fp128 %a, fp128 %b) nounwind {
+; LA32-LABEL: caller_musttail_both_computed:
+; LA32:       # %bb.0:
+; LA32-NEXT:    addi.w $sp, $sp, -160
+; LA32-NEXT:    st.w $ra, $sp, 156 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $fp, $sp, 152 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s0, $sp, 148 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s1, $sp, 144 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s2, $sp, 140 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s3, $sp, 136 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s4, $sp, 132 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s5, $sp, 128 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s6, $sp, 124 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s7, $sp, 120 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s8, $sp, 116 # 4-byte Folded Spill
+; LA32-NEXT:    move $fp, $a1
+; LA32-NEXT:    move $s0, $a0
+; LA32-NEXT:    ld.w $s1, $a1, 4
+; LA32-NEXT:    ld.w $s2, $a1, 8
+; LA32-NEXT:    ld.w $s3, $a1, 12
+; LA32-NEXT:    ld.w $s4, $a1, 0
+; LA32-NEXT:    ld.w $s5, $a0, 4
+; LA32-NEXT:    ld.w $s6, $a0, 8
+; LA32-NEXT:    ld.w $s7, $a0, 12
+; LA32-NEXT:    ld.w $s8, $a0, 0
+; LA32-NEXT:    st.w $s8, $sp, 80
+; LA32-NEXT:    st.w $s7, $sp, 92
+; LA32-NEXT:    st.w $s6, $sp, 88
+; LA32-NEXT:    st.w $s5, $sp, 84
+; LA32-NEXT:    st.w $s4, $sp, 64
+; LA32-NEXT:    st.w $s3, $sp, 76
+; LA32-NEXT:    st.w $s2, $sp, 72
+; LA32-NEXT:    addi.w $a0, $sp, 96
+; LA32-NEXT:    addi.w $a1, $sp, 80
+; LA32-NEXT:    addi.w $a2, $sp, 64
+; LA32-NEXT:    st.w $s1, $sp, 68
+; LA32-NEXT:    bl __addtf3
+; LA32-NEXT:    ld.w $a0, $sp, 108
+; LA32-NEXT:    st.w $a0, $sp, 12 # 4-byte Folded Spill
+; LA32-NEXT:    ld.w $a0, $sp, 104
+; LA32-NEXT:    st.w $a0, $sp, 8 # 4-byte Folded Spill
+; LA32-NEXT:    ld.w $a0, $sp, 100
+; LA32-NEXT:    st.w $a0, $sp, 4 # 4-byte Folded Spill
+; LA32-NEXT:    ld.w $a0, $sp, 96
+; LA32-NEXT:    st.w $a0, $sp, 0 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s8, $sp, 32
+; LA32-NEXT:    st.w $s7, $sp, 44
+; LA32-NEXT:    st.w $s6, $sp, 40
+; LA32-NEXT:    st.w $s5, $sp, 36
+; LA32-NEXT:    st.w $s4, $sp, 16
+; LA32-NEXT:    st.w $s3, $sp, 28
+; LA32-NEXT:    st.w $s2, $sp, 24
+; LA32-NEXT:    addi.w $a0, $sp, 48
+; LA32-NEXT:    addi.w $a1, $sp, 32
+; LA32-NEXT:    addi.w $a2, $sp, 16
+; LA32-NEXT:    st.w $s1, $sp, 20
+; LA32-NEXT:    bl __subtf3
+; LA32-NEXT:    ld.w $a0, $sp, 60
+; LA32-NEXT:    ld.w $a1, $sp, 56
+; LA32-NEXT:    ld.w $a2, $sp, 52
+; LA32-NEXT:    ld.w $a3, $sp, 48
+; LA32-NEXT:    ld.w $a4, $sp, 0 # 4-byte Folded Reload
+; LA32-NEXT:    st.w $a4, $s0, 0
+; LA32-NEXT:    ld.w $a4, $sp, 4 # 4-byte Folded Reload
+; LA32-NEXT:    st.w $a4, $s0, 4
+; LA32-NEXT:    ld.w $a4, $sp, 8 # 4-byte Folded Reload
+; LA32-NEXT:    st.w $a4, $s0, 8
+; LA32-NEXT:    ld.w $a4, $sp, 12 # 4-byte Folded Reload
+; LA32-NEXT:    st.w $a4, $s0, 12
+; LA32-NEXT:    st.w $a3, $fp, 0
+; LA32-NEXT:    st.w $a2, $fp, 4
+; LA32-NEXT:    st.w $a1, $fp, 8
+; LA32-NEXT:    st.w $a0, $fp, 12
+; LA32-NEXT:    move $a0, $s0
+; LA32-NEXT:    move $a1, $fp
+; LA32-NEXT:    ld.w $s8, $sp, 116 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s7, $sp, 120 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s6, $sp, 124 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s5, $sp, 128 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s4, $sp, 132 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s3, $sp, 136 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s2, $sp, 140 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s1, $sp, 144 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s0, $sp, 148 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $fp, $sp, 152 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $ra, $sp, 156 # 4-byte Folded Reload
+; LA32-NEXT:    addi.w $sp, $sp, 160
+; LA32-NEXT:    b callee_musttail_two_indirect
+;
+; LA64-LABEL: caller_musttail_both_computed:
+; LA64:       # %bb.0:
+; LA64-NEXT:    addi.d $sp, $sp, -64
+; LA64-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $fp, $sp, 48 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s0, $sp, 40 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s1, $sp, 32 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s2, $sp, 24 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s3, $sp, 16 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s4, $sp, 8 # 8-byte Folded Spill
+; LA64-NEXT:    move $fp, $a3
+; LA64-NEXT:    move $s0, $a2
+; LA64-NEXT:    move $s1, $a1
+; LA64-NEXT:    move $s2, $a0
+; LA64-NEXT:    pcaddu18i $ra, %call36(__addtf3)
+; LA64-NEXT:    jirl $ra, $ra, 0
+; LA64-NEXT:    move $s3, $a0
+; LA64-NEXT:    move $s4, $a1
+; LA64-NEXT:    move $a0, $s2
+; LA64-NEXT:    move $a1, $s1
+; LA64-NEXT:    move $a2, $s0
+; LA64-NEXT:    move $a3, $fp
+; LA64-NEXT:    pcaddu18i $ra, %call36(__subtf3)
+; LA64-NEXT:    jirl $ra, $ra, 0
+; LA64-NEXT:    move $a2, $a0
+; LA64-NEXT:    move $a3, $a1
+; LA64-NEXT:    move $a0, $s3
+; LA64-NEXT:    move $a1, $s4
+; LA64-NEXT:    ld.d $s4, $sp, 8 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $s3, $sp, 16 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $s2, $sp, 24 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $s1, $sp, 32 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $s0, $sp, 40 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $fp, $sp, 48 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 64
+; LA64-NEXT:    pcaddu18i $t8, %call36(callee_musttail_two_indirect)
+; LA64-NEXT:    jr $t8
+  %sum = fadd fp128 %a, %b
+  %diff = fsub fp128 %a, %b
+  %r = musttail call i32 @callee_musttail_two_indirect(fp128 %sum, fp128 %diff)
+  ret i32 %r
+}
+
+; Test musttail in a non-entry basic block. The indirect pointer must survive
+; across basic blocks (the SelectionDAG is cleared between BBs, so the pointer
+; must be preserved in a virtual register, not as a raw SDValue).
+declare i32 @callee_musttail_cross_bb(fp128 %a, i1 %c)
+
+define i32 @caller_musttail_cross_bb(fp128 %a, i1 %cond) nounwind {
+; LA32-LABEL: caller_musttail_cross_bb:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    andi $a2, $a1, 1
+; LA32-NEXT:    beq $a2, $zero, .LBB19_2
+; LA32-NEXT:  # %bb.1: # %then
+; LA32-NEXT:    b callee_musttail_cross_bb
+; LA32-NEXT:  .LBB19_2: # %else
+; LA32-NEXT:    move $a0, $zero
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: caller_musttail_cross_bb:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    andi $a3, $a2, 1
+; LA64-NEXT:    beqz $a3, .LBB19_2
+; LA64-NEXT:  # %bb.1: # %then
+; LA64-NEXT:    pcaddu18i $t8, %call36(callee_musttail_cross_bb)
+; LA64-NEXT:    jr $t8
+; LA64-NEXT:  .LBB19_2: # %else
+; LA64-NEXT:    move $a0, $zero
+; LA64-NEXT:    ret
+entry:
+  br i1 %cond, label %then, label %else
+then:
+  %r = musttail call i32 @callee_musttail_cross_bb(fp128 %a, i1 %cond)
+  ret i32 %r
+else:
+  ret i32 0
+}
+
+; Test musttail with control flow and a computed indirect arg in a non-entry BB.
+declare i32 @callee_musttail_cross_bb_computed(fp128 %a, i1 %c)
+
+define i32 @caller_musttail_cross_bb_computed(fp128 %a, i1 %cond) nounwind {
+; LA32-LABEL: caller_musttail_cross_bb_computed:
+; LA32:       # %bb.0: # %entry
+; LA32-NEXT:    addi.w $sp, $sp, -64
+; LA32-NEXT:    st.w $ra, $sp, 60 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $fp, $sp, 56 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s0, $sp, 52 # 4-byte Folded Spill
+; LA32-NEXT:    st.w $s1, $sp, 48 # 4-byte Folded Spill
+; LA32-NEXT:    move $fp, $a0
+; LA32-NEXT:    ld.w $a3, $a0, 4
+; LA32-NEXT:    ld.w $a0, $a0, 8
+; LA32-NEXT:    ld.w $a2, $fp, 12
+; LA32-NEXT:    ld.w $a4, $fp, 0
+; LA32-NEXT:    move $s0, $a1
+; LA32-NEXT:    andi $s1, $a1, 1
+; LA32-NEXT:    st.w $a4, $sp, 0
+; LA32-NEXT:    st.w $a4, $sp, 16
+; LA32-NEXT:    st.w $a2, $sp, 12
+; LA32-NEXT:    st.w $a0, $sp, 8
+; LA32-NEXT:    st.w $a3, $sp, 4
+; LA32-NEXT:    st.w $a2, $sp, 28
+; LA32-NEXT:    st.w $a0, $sp, 24
+; LA32-NEXT:    addi.w $a0, $sp, 32
+; LA32-NEXT:    addi.w $a1, $sp, 16
+; LA32-NEXT:    addi.w $a2, $sp, 0
+; LA32-NEXT:    st.w $a3, $sp, 20
+; LA32-NEXT:    bl __addtf3
+; LA32-NEXT:    beq $s1, $zero, .LBB20_2
+; LA32-NEXT:  # %bb.1: # %then
+; LA32-NEXT:    ld.w $a0, $sp, 32
+; LA32-NEXT:    ld.w $a1, $sp, 36
+; LA32-NEXT:    ld.w $a2, $sp, 40
+; LA32-NEXT:    ld.w $a3, $sp, 44
+; LA32-NEXT:    st.w $a0, $fp, 0
+; LA32-NEXT:    st.w $a1, $fp, 4
+; LA32-NEXT:    st.w $a2, $fp, 8
+; LA32-NEXT:    st.w $a3, $fp, 12
+; LA32-NEXT:    move $a0, $fp
+; LA32-NEXT:    move $a1, $s0
+; LA32-NEXT:    ld.w $s1, $sp, 48 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s0, $sp, 52 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $fp, $sp, 56 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $ra, $sp, 60 # 4-byte Folded Reload
+; LA32-NEXT:    addi.w $sp, $sp, 64
+; LA32-NEXT:    b callee_musttail_cross_bb_computed
+; LA32-NEXT:  .LBB20_2: # %else
+; LA32-NEXT:    move $a0, $zero
+; LA32-NEXT:    ld.w $s1, $sp, 48 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $s0, $sp, 52 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $fp, $sp, 56 # 4-byte Folded Reload
+; LA32-NEXT:    ld.w $ra, $sp, 60 # 4-byte Folded Reload
+; LA32-NEXT:    addi.w $sp, $sp, 64
+; LA32-NEXT:    ret
+;
+; LA64-LABEL: caller_musttail_cross_bb_computed:
+; LA64:       # %bb.0: # %entry
+; LA64-NEXT:    addi.d $sp, $sp, -32
+; LA64-NEXT:    st.d $ra, $sp, 24 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $fp, $sp, 16 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s0, $sp, 8 # 8-byte Folded Spill
+; LA64-NEXT:    move $fp, $a2
+; LA64-NEXT:    andi $s0, $a2, 1
+; LA64-NEXT:    move $a2, $a0
+; LA64-NEXT:    move $a3, $a1
+; LA64-NEXT:    pcaddu18i $ra, %call36(__addtf3)
+; LA64-NEXT:    jirl $ra, $ra, 0
+; LA64-NEXT:    beqz $s0, .LBB20_2
+; LA64-NEXT:  # %bb.1: # %then
+; LA64-NEXT:    move $a2, $fp
+; LA64-NEXT:    ld.d $s0, $sp, 8 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $fp, $sp, 16 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $ra, $sp, 24 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 32
+; LA64-NEXT:    pcaddu18i $t8, %call36(callee_musttail_cross_bb_computed)
+; LA64-NEXT:    jr $t8
+; LA64-NEXT:  .LBB20_2: # %else
+; LA64-NEXT:    move $a0, $zero
+; LA64-NEXT:    ld.d $s0, $sp, 8 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $fp, $sp, 16 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $ra, $sp, 24 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 32
+; LA64-NEXT:    ret
+entry:
+  %sum = fadd fp128 %a, %a
+  br i1 %cond, label %then, label %else
+then:
+  %r = musttail call i32 @callee_musttail_cross_bb_computed(fp128 %sum, i1 %cond)
+  ret i32 %r
+else:
+  ret i32 0
+}
+
+; Non-indirect args that spill to the stack (exercises the
+; isEligibleForTailCallOptimization stack-size bypass for musttail). Both
+; LA32 and LA64 use a0..a7 for the first 8 args and spill from the 9th. The
+; spilled args live in the caller's incoming stack slots, which musttail can
+; re-use because matching prototypes imply a matching layout.
+declare void @callee_musttail_stack_spill(i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+
+define void @caller_musttail_stack_spill(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, i32 %a9) nounwind {
+; LA32-LABEL: caller_musttail_stack_spill:
+; LA32:       # %bb.0:
+; LA32-NEXT:    ld.w $t0, $sp, 4
+; LA32-NEXT:    ld.w $t1, $sp, 0
+; LA32-NEXT:    st.w $t0, $sp, 4
+; LA32-NEXT:    st.w $t1, $sp, 0
+; LA32-NEXT:    b callee_musttail_stack_spill
+;
+; LA64-LABEL: caller_musttail_stack_spill:
+; LA64:       # %bb.0:
+; LA64-NEXT:    vld $vr0, $sp, 0
+; LA64-NEXT:    vst $vr0, $sp, 0
+; LA64-NEXT:    pcaddu18i $t8, %call36(callee_musttail_stack_spill)
+; LA64-NEXT:    jr $t8
+  musttail call void @callee_musttail_stack_spill(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, i32 %a9)
+  ret void
+}
+
+; sret + musttail: the sret pointer is just a regular pointer arg in a0.
+; Tail call forwards it unchanged.
+%struct.Large = type { i64, i64, i64, i64 }
+declare void @callee_musttail_sret(ptr sret(%struct.Large), i32)
+
+define void @caller_musttail_sret(ptr sret(%struct.Large) %out, i32 %x) nounwind {
+; LA32-LABEL: caller_musttail_sret:
+; LA32:       # %bb.0:
+; LA32-NEXT:    b callee_musttail_sret
+;
+; LA64-LABEL: caller_musttail_sret:
+; LA64:       # %bb.0:
+; LA64-NEXT:    pcaddu18i $t8, %call36(callee_musttail_sret)
+; LA64-NEXT:    jr $t8
+  musttail call void @callee_musttail_sret(ptr sret(%struct.Large) %out, i32 %x)
+  ret void
+}
+
+; Mix of indirect (fp128) and many i32 args spilled to the stack.
+declare void @callee_musttail_indirect_and_spill(fp128, i32, i32, i32, i32, i32, i32, i32, i32, i32)
+
+define void @caller_musttail_indirect_and_spill(fp128 %a, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8) nounwind {
+; LA32-LABEL: caller_musttail_indirect_and_spill:
+; LA32:       # %bb.0:
+; LA32-NEXT:    ld.w $t0, $sp, 4
+; LA32-NEXT:    ld.w $t1, $sp, 0
+; LA32-NEXT:    st.w $t0, $sp, 4
+; LA32-NEXT:    st.w $t1, $sp, 0
+; LA32-NEXT:    b callee_musttail_indirect_and_spill
+;
+; LA64-LABEL: caller_musttail_indirect_and_spill:
+; LA64:       # %bb.0:
+; LA64-NEXT:    ld.d $t0, $sp, 16
+; LA64-NEXT:    vld $vr0, $sp, 0
+; LA64-NEXT:    st.d $t0, $sp, 16
+; LA64-NEXT:    vst $vr0, $sp, 0
+; LA64-NEXT:    pcaddu18i $t8, %call36(callee_musttail_indirect_and_spill)
+; LA64-NEXT:    jr $t8
+  musttail call void @callee_musttail_indirect_and_spill(fp128 %a, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7, i32 %i8)
+  ret void
+}
+
+; Note: byval + musttail is intentionally NOT tested here.
+; isEligibleForTailCallOptimization rejects byval outright, which causes the
+; musttail site to hit reportFatalInternalError. Tail-call support for byval
+; was reverted in 501417baa60f (RISC-V/LoongArch) pending a vreg-based
+; re-implementation; once that lands, musttail + byval can be tested as
+; well.

_______________________________________________
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

Reply via email to