llvmorg-github-actions[bot] wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-backend-loongarch Author: llvmbot <details> <summary>Changes</summary> Backport 19e915fc5c91645ccc4050180e9daabec30358c4 Requested by: @<!-- -->heiher --- Patch is 51.69 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/199637.diff 4 Files Affected: - (modified) llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp (+183-44) - (modified) llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h (+17) - (added) llvm/test/CodeGen/LoongArch/musttail-call.ll (+20) - (added) llvm/test/CodeGen/LoongArch/musttail-indirect-args.ll (+907) ``````````diff diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 2cfe3b2bc1a99..7d3d333efe046 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -8172,9 +8172,22 @@ SDValue LoongArchTargetLowering::LowerFormalArguments( "GHC calling convention requires the F and D extensions"); } + const Function &Func = MF.getFunction(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); MVT GRLenVT = Subtarget.getGRLenVT(); unsigned GRLenInBytes = Subtarget.getGRLen() / 8; + + // Check if this function has any musttail calls. If so, incoming indirect + // arg pointers must be saved in virtual registers so they survive across + // basic blocks (the SelectionDAG is cleared between BBs). Only do this + // when needed to avoid adding register pressure to non-musttail functions. + bool HasMusttail = llvm::any_of(Func, [](const BasicBlock &BB) { + return llvm::any_of(BB, [](const Instruction &I) { + if (const auto *CI = dyn_cast<CallInst>(&I)) + return CI->isMustTailCall(); + return false; + }); + }); // Used with varargs to acumulate store chains. 
std::vector<SDValue> OutChains; @@ -8205,6 +8218,14 @@ SDValue LoongArchTargetLowering::LowerFormalArguments( InVals.push_back(DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue, MachinePointerInfo())); unsigned ArgIndex = Ins[InsIdx].OrigArgIndex; + if (HasMusttail) { + LoongArchMachineFunctionInfo *LAFI = + MF.getInfo<LoongArchMachineFunctionInfo>(); + Register VReg = + MF.getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass); + Chain = DAG.getCopyToReg(Chain, DL, VReg, ArgValue); + LAFI->setIncomingIndirectArg(ArgIndex, VReg); + } unsigned ArgPartOffset = Ins[InsIdx].PartOffset; assert(ArgPartOffset == 0); while (i + 1 != e && Ins[InsIdx + 1].OrigArgIndex == ArgIndex) { @@ -8335,6 +8356,27 @@ bool LoongArchTargetLowering::isEligibleForTailCallOptimization( auto &Caller = MF.getFunction(); auto CallerCC = Caller.getCallingConv(); + bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall(); + + // Byval parameters hand the function a pointer directly into the stack area + // we want to reuse during a tail call. Working around this *is* possible + // but less efficient and uglier in LowerCall. For musttail, there is no + // workaround today: a byval arg requires a local copy that becomes invalid + // after the tail call deallocates the caller's frame, so rejecting here + // (and triggering reportFatalInternalError in LowerCall) is safer than + // miscompiling. + for (auto &Arg : Outs) + if (Arg.Flags.isByVal()) + return false; + + // musttail bypasses the remaining checks: the checks either reject cases + // we handle specially (indirect args are forwarded via incoming pointers, + // stack-passed args reuse the matching incoming layout, sret is forwarded + // like any other pointer arg) or are optimizations not applicable to + // mandatory tail calls. + if (IsMustTail) + return true; + // Do not tail call opt if the stack is used to pass parameters. 
if (CCInfo.getStackSize() != 0) return false; @@ -8351,11 +8393,6 @@ bool LoongArchTargetLowering::isEligibleForTailCallOptimization( if (IsCallerStructRet || IsCalleeStructRet) return false; - // Do not tail call opt if either the callee or caller has a byval argument. - for (auto &Arg : Outs) - if (Arg.Flags.isByVal()) - return false; - // The callee has to preserve all registers the caller needs to preserve. const LoongArchRegisterInfo *TRI = Subtarget.getRegisterInfo(); const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); @@ -8488,47 +8525,149 @@ LoongArchTargetLowering::LowerCall(CallLoweringInfo &CLI, // Promote the value if needed. // For now, only handle fully promoted and indirect arguments. if (VA.getLocInfo() == CCValAssign::Indirect) { - // Store the argument in a stack slot and pass its address. - Align StackAlign = - std::max(getPrefTypeAlign(Outs[OutIdx].ArgVT, DAG), - getPrefTypeAlign(ArgValue.getValueType(), DAG)); - TypeSize StoredSize = ArgValue.getValueType().getStoreSize(); - // If the original argument was split and passed by reference, we need to - // store the required parts of it here (and pass just one address). - unsigned ArgIndex = Outs[OutIdx].OrigArgIndex; - unsigned ArgPartOffset = Outs[OutIdx].PartOffset; - assert(ArgPartOffset == 0); - // Calculate the total size to store. We don't have access to what we're - // actually storing other than performing the loop and collecting the - // info. - SmallVector<std::pair<SDValue, SDValue>> Parts; - while (i + 1 != e && Outs[OutIdx + 1].OrigArgIndex == ArgIndex) { - SDValue PartValue = OutVals[OutIdx + 1]; - unsigned PartOffset = Outs[OutIdx + 1].PartOffset - ArgPartOffset; - SDValue Offset = DAG.getIntPtrConstant(PartOffset, DL); - EVT PartVT = PartValue.getValueType(); + // For musttail calls, reuse incoming indirect pointers instead of + // creating new stack temporaries. 
The incoming pointers point to the + // caller's caller's frame, which remains valid after a tail call. + if (IsTailCall && CLI.CB && CLI.CB->isMustTailCall()) { + LoongArchMachineFunctionInfo *LAFI = + MF.getInfo<LoongArchMachineFunctionInfo>(); + unsigned CallArgIdx = Outs[OutIdx].OrigArgIndex; + + // Resolve which formal parameter is being passed at this call + // position. + // + // FIXME: Ins[].OrigArgIndex is Argument::getArgNo() (unfiltered), + // but Outs[].OrigArgIndex is an index into a filtered arg list + // (empty types removed, via CallLoweringInfo in the target- + // independent layer). IncomingIndirectArgs is keyed by the + // caller's unfiltered Argument::getArgNo(), so we have to walk + // the caller's formals (same filter) to translate the index. + // This target-independent asymmetry should be normalized so + // backends do not need to re-derive the mapping. + // + // Steps: + // 1. Find the call operand at filtered position CallArgIdx. + // 2. If it is an Argument, use getArgNo() directly (same filter + // for caller formals and call operands). + // 3. Otherwise (computed value), walk the caller's formals and + // skip empty types to map the filtered index to getArgNo(). 
+ const Argument *FormalArg = nullptr; + unsigned FilteredIdx = 0; + for (const auto &CallArg : CLI.CB->args()) { + if (CallArg->getType()->isEmptyTy()) + continue; + if (FilteredIdx == CallArgIdx) { + FormalArg = dyn_cast<Argument>(CallArg); + break; + } + ++FilteredIdx; + } - StoredSize += PartVT.getStoreSize(); - StackAlign = std::max(StackAlign, getPrefTypeAlign(PartVT, DAG)); - Parts.push_back(std::make_pair(PartValue, Offset)); - ++i; - ++OutIdx; - } - SDValue SpillSlot = DAG.CreateStackTemporary(StoredSize, StackAlign); - int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); - MemOpChains.push_back( - DAG.getStore(Chain, DL, ArgValue, SpillSlot, - MachinePointerInfo::getFixedStack(MF, FI))); - for (const auto &Part : Parts) { - SDValue PartValue = Part.first; - SDValue PartOffset = Part.second; - SDValue Address = - DAG.getNode(ISD::ADD, DL, PtrVT, SpillSlot, PartOffset); + // For forwarded args, getArgNo() gives the unfiltered index directly. + // For computed args, walk the caller's formals to resolve it. + unsigned FormalArgIdx = CallArgIdx; + if (FormalArg) { + FormalArgIdx = FormalArg->getArgNo(); + } else { + FilteredIdx = 0; + for (const auto &Arg : MF.getFunction().args()) { + if (Arg.getType()->isEmptyTy()) + continue; + if (FilteredIdx == CallArgIdx) { + FormalArgIdx = Arg.getArgNo(); + break; + } + ++FilteredIdx; + } + } + + Register VReg = LAFI->getIncomingIndirectArg(FormalArgIdx); + SDValue CopyOp = DAG.getCopyFromReg(Chain, DL, VReg, PtrVT); + // Thread the CopyFromReg output chain through MemOpChains so the + // TokenFactor below sequences the copy with any stores we emit + // for this argument. + MemOpChains.push_back(CopyOp.getValue(1)); + SDValue IncomingPtr = CopyOp; + + if (!FormalArg) { + // Computed value: store into the incoming indirect pointer for the + // same-position formal parameter (musttail guarantees matching + // prototypes, so types match). 
The pointer survives the tail call + // since it points to the caller's caller's frame. + // + // The data-flow edge through IncomingPtr already prevents the + // store from being scheduled before the CopyFromReg. Threading + // CopyOp.getValue(1) (the copy's output chain) into the store + // makes that ordering explicit on the chain edge as well, which + // is the convention for memory ops chaining off their producers. + MemOpChains.push_back( + DAG.getStore(CopyOp.getValue(1), DL, ArgValue, IncomingPtr, + MachinePointerInfo::getUnknownStack(MF))); + // Store any split parts at their respective offsets. + unsigned ArgPartOffset = Outs[OutIdx].PartOffset; + while (i + 1 != e && Outs[OutIdx + 1].OrigArgIndex == CallArgIdx) { + SDValue PartValue = OutVals[OutIdx + 1]; + unsigned PartOffset = Outs[OutIdx + 1].PartOffset - ArgPartOffset; + SDValue Offset = DAG.getIntPtrConstant(PartOffset, DL); + SDValue Addr = + DAG.getNode(ISD::ADD, DL, PtrVT, IncomingPtr, Offset); + MemOpChains.push_back( + DAG.getStore(CopyOp.getValue(1), DL, PartValue, Addr, + MachinePointerInfo::getUnknownStack(MF))); + ++i; + ++OutIdx; + } + } + ArgValue = IncomingPtr; + + // Skip any remaining split parts (for forwarded args, they are + // covered by the forwarded pointer). + while (i + 1 != e && Outs[OutIdx + 1].OrigArgIndex == CallArgIdx) { + ++i; + ++OutIdx; + } + } else { + // Store the argument in a stack slot and pass its address. + Align StackAlign = + std::max(getPrefTypeAlign(Outs[OutIdx].ArgVT, DAG), + getPrefTypeAlign(ArgValue.getValueType(), DAG)); + TypeSize StoredSize = ArgValue.getValueType().getStoreSize(); + // If the original argument was split and passed by reference, we need + // to store the required parts of it here (and pass just one address). + unsigned ArgIndex = Outs[OutIdx].OrigArgIndex; + unsigned ArgPartOffset = Outs[OutIdx].PartOffset; + assert(ArgPartOffset == 0); + // Calculate the total size to store. 
We don't have access to what we're + // actually storing other than performing the loop and collecting the + // info. + SmallVector<std::pair<SDValue, SDValue>> Parts; + while (i + 1 != e && Outs[OutIdx + 1].OrigArgIndex == ArgIndex) { + SDValue PartValue = OutVals[OutIdx + 1]; + unsigned PartOffset = Outs[OutIdx + 1].PartOffset - ArgPartOffset; + SDValue Offset = DAG.getIntPtrConstant(PartOffset, DL); + EVT PartVT = PartValue.getValueType(); + StoredSize += PartVT.getStoreSize(); + StackAlign = std::max(StackAlign, getPrefTypeAlign(PartVT, DAG)); + Parts.push_back(std::make_pair(PartValue, Offset)); + ++i; + ++OutIdx; + } + SDValue SpillSlot = DAG.CreateStackTemporary(StoredSize, StackAlign); + int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); MemOpChains.push_back( - DAG.getStore(Chain, DL, PartValue, Address, + DAG.getStore(Chain, DL, ArgValue, SpillSlot, MachinePointerInfo::getFixedStack(MF, FI))); + for (const auto &Part : Parts) { + SDValue PartValue = Part.first; + SDValue PartOffset = Part.second; + SDValue Address = + DAG.getNode(ISD::ADD, DL, PtrVT, SpillSlot, PartOffset); + MemOpChains.push_back( + DAG.getStore(Chain, DL, PartValue, Address, + MachinePointerInfo::getFixedStack(MF, FI))); + } + ArgValue = SpillSlot; } - ArgValue = SpillSlot; } else { ArgValue = convertValVTToLocVT(DAG, ArgValue, VA, DL); } @@ -8542,8 +8681,8 @@ LoongArchTargetLowering::LowerCall(CallLoweringInfo &CLI, RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgValue)); } else { assert(VA.isMemLoc() && "Argument not register or memory"); - assert(!IsTailCall && "Tail call not allowed if stack is used " - "for passing parameters"); + assert((!IsTailCall || (CLI.CB && CLI.CB->isMustTailCall())) && + "Tail call not allowed if stack is used for passing parameters"); // Work out the address of the stack slot. 
if (!StackPtr.getNode()) diff --git a/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h b/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h index 904985c189dba..7bf7171198e8a 100644 --- a/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h +++ b/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h @@ -14,6 +14,7 @@ #define LLVM_LIB_TARGET_LOONGARCH_LOONGARCHMACHINEFUNCTIONINFO_H #include "LoongArchSubtarget.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -32,6 +33,13 @@ class LoongArchMachineFunctionInfo : public MachineFunctionInfo { /// Size of stack frame to save callee saved registers unsigned CalleeSavedStackSize = 0; + /// Incoming indirect argument pointers saved as virtual registers, keyed by + /// formal parameter index. Used for musttail forwarding of indirect args. + /// Virtual registers (not SDValues) are used because the SelectionDAG is + /// cleared between basic blocks, and musttail calls may be in non-entry + /// blocks. + DenseMap<unsigned, Register> IncomingIndirectArgs; + /// FrameIndex of the spill slot when there is no scavenged register in /// insertIndirectBranch. 
int BranchRelaxationSpillFrameIndex = -1; @@ -63,6 +71,15 @@ class LoongArchMachineFunctionInfo : public MachineFunctionInfo { unsigned getCalleeSavedStackSize() const { return CalleeSavedStackSize; } void setCalleeSavedStackSize(unsigned Size) { CalleeSavedStackSize = Size; } + void setIncomingIndirectArg(unsigned ArgIndex, Register Reg) { + IncomingIndirectArgs[ArgIndex] = Reg; + } + Register getIncomingIndirectArg(unsigned ArgIndex) const { + auto It = IncomingIndirectArgs.find(ArgIndex); + assert(It != IncomingIndirectArgs.end() && "No incoming indirect arg"); + return It->second; + } + int getBranchRelaxationSpillFrameIndex() { return BranchRelaxationSpillFrameIndex; } diff --git a/llvm/test/CodeGen/LoongArch/musttail-call.ll b/llvm/test/CodeGen/LoongArch/musttail-call.ll new file mode 100644 index 0000000000000..0fe77ed802b7a --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/musttail-call.ll @@ -0,0 +1,20 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=loongarch32 %s -o - | FileCheck %s --check-prefix=LA32 +; RUN: llc -mtriple=loongarch64 %s -o - | FileCheck %s --check-prefix=LA64 + +%struct.A = type { i32 } + +declare void @callee_musttail(ptr sret(%struct.A) %a) +define void @caller_musttail(ptr sret(%struct.A) %a) { +; LA32-LABEL: caller_musttail: +; LA32: # %bb.0: # %entry +; LA32-NEXT: b callee_musttail +; +; LA64-LABEL: caller_musttail: +; LA64: # %bb.0: # %entry +; LA64-NEXT: pcaddu18i $t8, %call36(callee_musttail) +; LA64-NEXT: jr $t8 +entry: + musttail call void @callee_musttail(ptr sret(%struct.A) %a) + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/musttail-indirect-args.ll b/llvm/test/CodeGen/LoongArch/musttail-indirect-args.ll new file mode 100644 index 0000000000000..d088d6065aa07 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/musttail-indirect-args.ll @@ -0,0 +1,907 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: 
--version 5 +; RUN: llc -mtriple=loongarch32 %s -o - | FileCheck %s --check-prefix=LA32 +; RUN: llc -mtriple=loongarch64 %s -o - | FileCheck %s --check-prefix=LA64 + +; Test that musttail with indirect args (fp128 on LA32) forwards the incoming +; pointer instead of creating a new stack temporary. Without this fix, the +; pointer would dangle after the tail call deallocates the caller's frame. + +declare i32 @callee_musttail_indirect(fp128 %a) + +; fp128 is indirect on LA32 (too large for registers), direct on LA64. +; On LA32, musttail must forward the incoming indirect pointer (a0) directly. +define i32 @caller_musttail_indirect(fp128 %a) nounwind { +; LA32-LABEL: caller_musttail_indirect: +; LA32: # %bb.0: +; LA32-NEXT: b callee_musttail_indirect +; +; LA64-LABEL: caller_musttail_indirect: +; LA64: # %bb.0: +; LA64-NEXT: pcaddu18i $t8, %call36(callee_musttail_indirect) +; LA64-NEXT: jr $t8 + %call = musttail call i32 @callee_musttail_indirect(fp128 %a) + ret i32 %call +} + +; Verify that non-musttail tail call with indirect args does NOT tail call +; (this is the PR #184972 fix - indirect args are unsafe for regular tail calls). 
+define void @caller_no_musttail_indirect() nounwind { +; LA32-LABEL: caller_no_musttail_indirect: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $sp, $sp, -32 +; LA32-NEXT: st.w $ra, $sp, 28 # 4-byte Folded Spill +; LA32-NEXT: lu12i.w $a0, 262128 +; LA32-NEXT: st.w $a0, $sp, 12 +; LA32-NEXT: st.w $zero, $sp, 8 +; LA32-NEXT: st.w $zero, $sp, 4 +; LA32-NEXT: addi.w $a0, $sp, 0 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: bl callee_musttail_indirect +; LA32-NEXT: ld.w $ra, $sp, 28 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 32 +; LA32-NEXT: ret +; +; LA64-LABEL: caller_no_musttail_indirect: +; LA64: # %bb.0: +; LA64-NEXT: ori $a0, $zero, 0 +; LA64-NEXT: lu32i.d $a0, -65536 +; LA64-NEXT: lu52i.d $a1, $a0, 1023 +; LA64-NEXT: move $a0, $zero +; LA64-NEXT: pcaddu18i $t8, %call36(callee_musttail_indirect) +; LA64-NEXT: jr $t8 + %call = tail call i32 @callee_musttail_indirect(fp128 0xL00000000000000003FFF000000000000) + ret void +} + +; Verify that non-musttail tail call forwarding an indirect arg from the +; caller's own parameters also does NOT tail call (the arg lives on the +; caller's frame, which would be deallocated). 
+define i32 @caller_no_musttail_forward_indirect(fp128 %a) nounwind { +; LA32-LABEL: caller_no_musttail_forward_indirect: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $sp, $sp, -32 +; LA32-NEXT: st.w $ra, $sp, 28 # 4-byte Folded Spill +; LA32-NEXT: ld.w $a1, $a0, 0 +; LA32-NEXT: ld.w $a2, $a0, 4 +; LA32-NEXT: ld.w $a3, $a0, 8 +; LA32-NEXT: ld.w $a0, $a0, 12 +; LA32-NEXT: st.w $a0, $sp, 12 +; LA32-NEXT: st.w $a3, $sp, 8 +; LA32-NEXT: st.w $a2, $sp, 4 +; LA32-NEXT: addi.w $a0, $sp, 0 +; LA32-NEXT: st.w $a1, $sp, 0 +; LA32-NEXT: bl callee_musttail_indirect +; LA32-NEXT: ld.w $ra, $sp, 28 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 32 +; LA32-NEXT: ret +; +; LA64-LABEL: caller_no_musttail_forward_indirect: +; LA64: # %bb.0: +; LA64-NEXT: pcaddu18i $t8, %call36(callee_musttail_indirect) +; LA64-NEXT: jr $t8 + %call = tail call i32 @callee_musttail_indirect(fp128 %a) + ret i32 %call +} + +; Test musttail with two indirect fp128 args on LA32. Both pointers must be +; forwarded. Exercises the D... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/199637 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
