https://github.com/CSharperMantle updated https://github.com/llvm/llvm-project/pull/195595
>From bf75c64f8d43fec2cbd073e8e6b7f8417eeafbdf Mon Sep 17 00:00:00 2001 From: Rong Bao <[email protected]> Date: Sat, 2 May 2026 15:19:10 +0800 Subject: [PATCH 1/3] [LoongArch] Implement stack allocation probing This implementation largely follows the pattern used in RISCV backend with support of both constant and dynamic allocations. --- .../LoongArch/LoongArchFrameLowering.cpp | 247 ++++++++++++++++-- .../Target/LoongArch/LoongArchFrameLowering.h | 8 + .../LoongArch/LoongArchISelLowering.cpp | 124 ++++++++- .../Target/LoongArch/LoongArchISelLowering.h | 9 + .../Target/LoongArch/LoongArchInstrInfo.td | 22 ++ .../LoongArch/LoongArchMachineFunctionInfo.h | 5 + 6 files changed, 393 insertions(+), 22 deletions(-) diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp index 690b0639484d0..0aba94383f546 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp @@ -15,6 +15,8 @@ #include "LoongArchSubtarget.h" #include "MCTargetDesc/LoongArchBaseInfo.h" #include "MCTargetDesc/LoongArchMCTargetDesc.h" +#include "llvm/CodeGen/CFIInstBuilder.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -182,6 +184,118 @@ void LoongArchFrameLowering::processFunctionBeforeFrameFinalized( } } +// Allocate stack space and probe it if necessary. 
+void LoongArchFrameLowering::allocateStack(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineFunction &MF, uint64_t Offset, + uint64_t RealStackSize, bool EmitCFI, + bool NeedProbe, uint64_t ProbeSize, + bool DynAllocation, + MachineInstr::MIFlag Flag) const { + DebugLoc DL; + const LoongArchInstrInfo *TII = STI.getInstrInfo(); + const bool IsLA64 = STI.is64Bit(); + const Register SPReg = LoongArch::R3; + CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup); + + // Simply allocate the stack if it's not big enough to require a probe. + if (!NeedProbe || Offset <= ProbeSize) { + adjustReg(MBB, MBBI, DL, SPReg, SPReg, -Offset, Flag); + if (EmitCFI) + CFIBuilder.buildDefCFAOffset(RealStackSize); + + if (NeedProbe && DynAllocation) { + // st.{w/d} $zero, $sp, 0 + BuildMI(MBB, MBBI, DL, + TII->get(IsLA64 ? LoongArch::ST_D : LoongArch::ST_W)) + .addReg(LoongArch::R0) + .addReg(SPReg) + .addImm(0) + .setMIFlag(Flag); + } + + return; + } + + // Unroll the probe loop depending on the number of iterations. + if (Offset < ProbeSize * 5) { + const uint64_t CFAAdjust = RealStackSize - Offset; + + uint64_t CurrentOffset = 0; + while (CurrentOffset + ProbeSize <= Offset) { + adjustReg(MBB, MBBI, DL, SPReg, SPReg, -ProbeSize, Flag); + // st.{w/d} $zero, $sp, 0 + BuildMI(MBB, MBBI, DL, + TII->get(IsLA64 ? LoongArch::ST_D : LoongArch::ST_W)) + .addReg(LoongArch::R0) + .addReg(SPReg) + .addImm(0) + .setMIFlag(Flag); + + CurrentOffset += ProbeSize; + if (EmitCFI) + CFIBuilder.buildDefCFAOffset(CurrentOffset + CFAAdjust); + } + + const uint64_t Residual = Offset - CurrentOffset; + if (Residual) { + adjustReg(MBB, MBBI, DL, SPReg, SPReg, -Residual, Flag); + if (EmitCFI) + CFIBuilder.buildDefCFAOffset(RealStackSize); + + if (DynAllocation) { + // st.{w/d} $zero, $sp, 0 + BuildMI(MBB, MBBI, DL, + TII->get(IsLA64 ? 
LoongArch::ST_D : LoongArch::ST_W)) + .addReg(LoongArch::R0) + .addReg(SPReg) + .addImm(0) + .setMIFlag(Flag); + } + } + return; + } + + // Emit a variable-length allocation probing loop. + const uint64_t RoundedSize = alignDown(Offset, ProbeSize); + const uint64_t Residual = Offset - RoundedSize; + const uint64_t CFAAdjust = RealStackSize - Offset; + + const Register TargetReg = LoongArch::R13; + // SUB TargetReg, $sp, RoundedSize + adjustReg(MBB, MBBI, DL, TargetReg, SPReg, -RoundedSize, Flag); + + if (EmitCFI) { + // Set the CFA register to TargetReg. + CFIBuilder.buildDefCFA(TargetReg, RoundedSize + CFAAdjust); + } + + // It will be expanded to a probe loop in inlineStackProbe(). + BuildMI(MBB, MBBI, DL, TII->get(LoongArch::PROBED_STACKALLOC)) + .addReg(TargetReg); + + if (EmitCFI) { + // Set the CFA register back to SP. + CFIBuilder.buildDefCFARegister(SPReg); + } + + if (Residual) { + adjustReg(MBB, MBBI, DL, SPReg, SPReg, -Residual, Flag); + if (DynAllocation) { + // st.{w/d} $zero, $sp, 0 + BuildMI(MBB, MBBI, DL, + TII->get(IsLA64 ? LoongArch::ST_D : LoongArch::ST_W)) + .addReg(LoongArch::R0) + .addReg(SPReg) + .addImm(0) + .setMIFlag(Flag); + } + } + + if (EmitCFI) + CFIBuilder.buildDefCFAOffset(RealStackSize); +} + void LoongArchFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { MachineFrameInfo &MFI = MF.getFrameInfo(); @@ -218,13 +332,15 @@ void LoongArchFrameLowering::emitPrologue(MachineFunction &MF, StackSize = FirstSPAdjustAmount; // Adjust stack. - adjustReg(MBB, MBBI, DL, SPReg, SPReg, -StackSize, MachineInstr::FrameSetup); - // Emit ".cfi_def_cfa_offset StackSize". 
- unsigned CFIIndex = - MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, StackSize)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlag(MachineInstr::FrameSetup); + const LoongArchTargetLowering *TLI = STI.getTargetLowering(); + const bool NeedProbe = TLI->hasInlineStackProbe(MF); + const uint64_t ProbeSize = TLI->getStackProbeSize(MF, getStackAlign()); + const bool DynAllocation = + MF.getInfo<LoongArchMachineFunctionInfo>()->hasDynamicAllocation(); + if (StackSize != 0) + allocateStack(MBB, MBBI, MF, StackSize, StackSize, + /*EmitCFI=*/true, NeedProbe, ProbeSize, DynAllocation, + MachineInstr::FrameSetup); const auto &CSI = MFI.getCalleeSavedInfo(); @@ -265,19 +381,9 @@ void LoongArchFrameLowering::emitPrologue(MachineFunction &MF, uint64_t SecondSPAdjustAmount = RealStackSize - FirstSPAdjustAmount; assert(SecondSPAdjustAmount > 0 && "SecondSPAdjustAmount should be greater than zero"); - adjustReg(MBB, MBBI, DL, SPReg, SPReg, -SecondSPAdjustAmount, - MachineInstr::FrameSetup); - - if (!hasFP(MF)) { - // If we are using a frame-pointer, and thus emitted ".cfi_def_cfa fp, 0", - // don't emit an sp-based .cfi_def_cfa_offset - // Emit ".cfi_def_cfa_offset RealStackSize" - unsigned CFIIndex = MF.addFrameInst( - MCCFIInstruction::cfiDefCfaOffset(nullptr, RealStackSize)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlag(MachineInstr::FrameSetup); - } + allocateStack(MBB, MBBI, MF, SecondSPAdjustAmount, RealStackSize, + !hasFP(MF), NeedProbe, ProbeSize, DynAllocation, + MachineInstr::FrameSetup); } if (hasFP(MF)) { @@ -353,6 +459,89 @@ void LoongArchFrameLowering::emitEpilogue(MachineFunction &MF, adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackSize, MachineInstr::FrameDestroy); } +// Synthesize the probe loop. 
+static void emitStackProbeInline(MachineBasicBlock::iterator MBBI, DebugLoc DL, + Register TargetReg) { + assert(TargetReg != LoongArch::R3 && + "New top of stack cannot already be in $sp"); + + MachineBasicBlock &MBB = *MBBI->getParent(); + MachineFunction &MF = *MBB.getParent(); + + const LoongArchSubtarget &STI = MF.getSubtarget<LoongArchSubtarget>(); + const LoongArchInstrInfo *TII = STI.getInstrInfo(); + const bool IsLA64 = STI.is64Bit(); + const Align StackAlign = STI.getFrameLowering()->getStackAlign(); + const LoongArchTargetLowering *TLI = STI.getTargetLowering(); + const uint64_t ProbeSize = TLI->getStackProbeSize(MF, StackAlign); + + MachineFunction::iterator MBBInsertPoint = std::next(MBB.getIterator()); + MachineBasicBlock *LoopTestMBB = + MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + MF.insert(MBBInsertPoint, LoopTestMBB); + MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(MBB.getBasicBlock()); + MF.insert(MBBInsertPoint, ExitMBB); + const Register SPReg = LoongArch::R3; + const Register ScratchReg = LoongArch::R14; + const MachineInstr::MIFlag Flags = MachineInstr::FrameSetup; + + // ScratchReg = ProbeSize + TII->movImm(MBB, MBBI, DL, ScratchReg, ProbeSize, Flags); + + // LoopTest: + // sub.{w/d} $sp, $sp, ScratchReg + BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, + TII->get(IsLA64 ? LoongArch::SUB_D : LoongArch::SUB_W), SPReg) + .addReg(SPReg) + .addReg(ScratchReg) + .setMIFlag(Flags); + + // st.{w/d} $zero, $sp, 0 + BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, + TII->get(IsLA64 ? 
LoongArch::ST_D : LoongArch::ST_W)) + .addReg(LoongArch::R0) + .addReg(SPReg) + .addImm(0) + .setMIFlag(Flags); + + // bne $sp, TargetReg, LoopTest + BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(LoongArch::BNE)) + .addReg(SPReg) + .addReg(TargetReg) + .addMBB(LoopTestMBB) + .setMIFlag(Flags); + + ExitMBB->splice(ExitMBB->end(), &MBB, std::next(MBBI), MBB.end()); + ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB); + + LoopTestMBB->addSuccessor(ExitMBB); + LoopTestMBB->addSuccessor(LoopTestMBB); + MBB.addSuccessor(LoopTestMBB); + // Update liveins. + fullyRecomputeLiveIns({ExitMBB, LoopTestMBB}); +} + +void LoongArchFrameLowering::inlineStackProbe(MachineFunction &MF, + MachineBasicBlock &MBB) const { + // Get the instructions that need to be replaced. We emit at most two of + // these. Remember them in order to avoid complications coming from the need + // to traverse the block while potentially creating more blocks. + SmallVector<MachineInstr *, 2> ToReplace; + for (MachineInstr &MI : MBB) { + if (MI.getOpcode() == LoongArch::PROBED_STACKALLOC) { + ToReplace.push_back(&MI); + } + } + + for (MachineInstr *MI : ToReplace) { + MachineBasicBlock::iterator MBBI = MI->getIterator(); + DebugLoc DL = MBB.findDebugLoc(MBBI); + Register TargetReg = MI->getOperand(0).getReg(); + emitStackProbeInline(MBBI, DL, TargetReg); + MBBI->eraseFromParent(); + } +} + // We would like to split the SP adjustment to reduce prologue/epilogue // as following instructions. In this way, the offset of the callee saved // register could fit in a single store. 
@@ -425,7 +614,23 @@ LoongArchFrameLowering::eliminateCallFramePseudoInstr( if (MI->getOpcode() == LoongArch::ADJCALLSTACKDOWN) Amount = -Amount; - adjustReg(MBB, MI, DL, SPReg, SPReg, Amount, MachineInstr::NoFlags); + const LoongArchTargetLowering *TLI = + MF.getSubtarget<LoongArchSubtarget>().getTargetLowering(); + const int64_t ProbeSize = TLI->getStackProbeSize(MF, getStackAlign()); + if (TLI->hasInlineStackProbe(MF) && -Amount >= ProbeSize) { + // When stack probing is enabled, the decrement of SP may need to be + // probed. We can handle both the decrement and the probing in + // allocateStack. + const bool DynAllocation = + MF.getInfo<LoongArchMachineFunctionInfo>()->hasDynamicAllocation(); + allocateStack(MBB, MI, MF, -Amount, -Amount, + MF.needsFrameMoves() && !hasFP(MF), + /*NeedProbe=*/true, ProbeSize, DynAllocation, + MachineInstr::NoFlags); + inlineStackProbe(MF, MBB); + } else { + adjustReg(MBB, MI, DL, SPReg, SPReg, Amount, MachineInstr::NoFlags); + } } } diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h index 6cbfcf665f6a9..8a540986e9d70 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h @@ -55,11 +55,19 @@ class LoongArchFrameLowering : public TargetFrameLowering { bool enableShrinkWrapping(const MachineFunction &MF) const override; + void inlineStackProbe(MachineFunction &MF, + MachineBasicBlock &PrologueMBB) const override; + protected: bool hasFPImpl(const MachineFunction &MF) const override; private: void determineFrameLayout(MachineFunction &MF) const; + void allocateStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineFunction &MF, uint64_t Offset, + uint64_t RealStackSize, bool EmitCFI, bool NeedProbe, + uint64_t ProbeSize, bool DynAllocation, + MachineInstr::MIFlag Flag) const; void adjustReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register DestReg, 
Register SrcReg, int64_t Val, MachineInstr::MIFlag Flag) const; diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index e3bdf2b993036..0d18c9c86a18a 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -120,7 +120,7 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setOperationAction(ISD::EH_DWARF_CFA, GRLenVT, Custom); - setOperationAction(ISD::DYNAMIC_STACKALLOC, GRLenVT, Expand); + setOperationAction(ISD::DYNAMIC_STACKALLOC, GRLenVT, Custom); setOperationAction({ISD::STACKSAVE, ISD::STACKRESTORE}, MVT::Other, Expand); setOperationAction(ISD::VASTART, MVT::Other, Custom); setOperationAction({ISD::VAARG, ISD::VACOPY, ISD::VAEND}, MVT::Other, Expand); @@ -659,6 +659,8 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op, return lowerFP_EXTEND(Op, DAG); case ISD::SIGN_EXTEND_VECTOR_INREG: return lowerSIGN_EXTEND_VECTOR_INREG(Op, DAG); + case ISD::DYNAMIC_STACKALLOC: + return lowerDYNAMIC_STACKALLOC(Op, DAG); } return SDValue(); } @@ -8795,6 +8797,8 @@ MachineBasicBlock *LoongArchTargetLowering::EmitInstrWithCustomInserter( if (!Subtarget.is64Bit()) report_fatal_error("STATEPOINT is only supported on 64-bit targets"); return emitPatchPoint(MI, BB); + case LoongArch::PROBED_STACKALLOC_DYN: + return emitDynamicProbedAlloc(MI, BB); } } @@ -11014,3 +11018,121 @@ bool LoongArchTargetLowering::isExtractVecEltCheap(EVT VT, // Extract a scalar FP value from index 0 of a vector is free. return (EltVT == MVT::f32 || EltVT == MVT::f64) && Index == 0; } + +bool LoongArchTargetLowering::hasInlineStackProbe( + const MachineFunction &MF) const { + + // If the function specifically requests inline stack probes, emit them. 
+ if (MF.getFunction().hasFnAttribute("probe-stack")) + return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() == + "inline-asm"; + + return false; +} + +unsigned LoongArchTargetLowering::getStackProbeSize(const MachineFunction &MF, + Align StackAlign) const { + // The default stack probe size is 4096 if the function has no + // stack-probe-size attribute. + const Function &Fn = MF.getFunction(); + unsigned StackProbeSize = + Fn.getFnAttributeAsParsedInteger("stack-probe-size", 4096); + // Round down to the stack alignment. + StackProbeSize = alignDown(StackProbeSize, StackAlign.value()); + return StackProbeSize ? StackProbeSize : StackAlign.value(); +} + +SDValue +LoongArchTargetLowering::lowerDYNAMIC_STACKALLOC(SDValue Op, + SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + if (!hasInlineStackProbe(MF)) + return SDValue(); + + const MVT GRLenVT = Subtarget.getGRLenVT(); + // Get the inputs. + SDValue Chain = Op.getOperand(0); + SDValue Size = Op.getOperand(1); + + const MaybeAlign Align = + cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue(); + const SDLoc dl(Op); + const EVT VT = Op.getValueType(); + + // Construct the new SP value in a GPR. + SDValue SP = DAG.getCopyFromReg(Chain, dl, LoongArch::R3, GRLenVT); + Chain = SP.getValue(1); + SP = DAG.getNode(ISD::SUB, dl, GRLenVT, SP, Size); + if (Align) + SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), + DAG.getSignedConstant(-Align->value(), dl, VT)); + + // Set the real SP to the new value with a probing loop. 
+ Chain = DAG.getNode(LoongArchISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP); + return DAG.getMergeValues({SP, Chain}, dl); +} + +MachineBasicBlock * +LoongArchTargetLowering::emitDynamicProbedAlloc(MachineInstr &MI, + MachineBasicBlock *MBB) const { + MachineFunction &MF = *MBB->getParent(); + MachineBasicBlock::iterator MBBI = MI.getIterator(); + DebugLoc DL = MBB->findDebugLoc(MBBI); + const Register TargetReg = MI.getOperand(0).getReg(); + + const LoongArchInstrInfo *TII = Subtarget.getInstrInfo(); + const bool IsLA64 = Subtarget.is64Bit(); + const Align StackAlign = Subtarget.getFrameLowering()->getStackAlign(); + const LoongArchTargetLowering *TLI = Subtarget.getTargetLowering(); + const uint64_t ProbeSize = TLI->getStackProbeSize(MF, StackAlign); + + MachineFunction::iterator MBBInsertPoint = std::next(MBB->getIterator()); + MachineBasicBlock *const LoopTestMBB = + MF.CreateMachineBasicBlock(MBB->getBasicBlock()); + MF.insert(MBBInsertPoint, LoopTestMBB); + MachineBasicBlock *const ExitMBB = + MF.CreateMachineBasicBlock(MBB->getBasicBlock()); + MF.insert(MBBInsertPoint, ExitMBB); + const Register SPReg = LoongArch::R3; + const Register ScratchReg = + MF.getRegInfo().createVirtualRegister(&LoongArch::GPRRegClass); + + // ScratchReg = ProbeSize + TII->movImm(*MBB, MBBI, DL, ScratchReg, ProbeSize, MachineInstr::NoFlags); + + // LoopTest: + // sub.{w/d} $sp, $sp, ScratchReg + BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, + TII->get(IsLA64 ? LoongArch::SUB_D : LoongArch::SUB_W), SPReg) + .addReg(SPReg) + .addReg(ScratchReg); + + // st.{w/d} $zero, $sp, 0 + BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, + TII->get(IsLA64 ? 
LoongArch::ST_D : LoongArch::ST_W)) + .addReg(LoongArch::R0) + .addReg(SPReg) + .addImm(0); + + // bltu TargetReg, $sp, LoopTest + BuildMI(*LoopTestMBB, LoopTestMBB->end(), DL, TII->get(LoongArch::BLTU)) + .addReg(TargetReg) + .addReg(SPReg) + .addMBB(LoopTestMBB); + + // move $sp, TargetReg + BuildMI(*ExitMBB, ExitMBB->end(), DL, TII->get(LoongArch::OR), SPReg) + .addReg(TargetReg) + .addReg(LoongArch::R0); + + ExitMBB->splice(ExitMBB->end(), MBB, std::next(MBBI), MBB->end()); + ExitMBB->transferSuccessorsAndUpdatePHIs(MBB); + + LoopTestMBB->addSuccessor(ExitMBB); + LoopTestMBB->addSuccessor(LoopTestMBB); + MBB->addSuccessor(LoopTestMBB); + + MI.eraseFromParent(); + MF.getInfo<LoongArchMachineFunctionInfo>()->setDynamicAllocation(); + return ExitMBB->begin()->getParent(); +} diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h index 4cc1aa2261ecc..189ecbe4820d2 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h @@ -179,6 +179,14 @@ class LoongArchTargetLowering : public TargetLowering { isImmVLDILegalForMode1(const APInt &SplatValue, const unsigned SplatBitSize) const; + /// True if stack clash protection is enabled for this function. + bool hasInlineStackProbe(const MachineFunction &MF) const override; + + unsigned getStackProbeSize(const MachineFunction &MF, Align StackAlign) const; + + MachineBasicBlock *emitDynamicProbedAlloc(MachineInstr &MI, + MachineBasicBlock *MBB) const; + private: /// Target-specific function used to lower LoongArch calling conventions. 
typedef bool LoongArchCCAssignFn(const DataLayout &DL, LoongArchABI::ABI ABI, @@ -250,6 +258,7 @@ class LoongArchTargetLowering : public TargetLowering { SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue lowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override; diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td index 54d6e51d4d6db..6f78a6d8be41b 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td @@ -76,6 +76,12 @@ def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_CallSeqStart, def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_CallSeqEnd, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; +// To avoid stack clash, allocation is performed by block and each block is +// probed. +def loongarch_probed_alloca : SDNode<"LoongArchISD::PROBED_ALLOCA", + SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>, + [SDNPHasChain, SDNPMayStore]>; + // Target-dependent nodes. def loongarch_call : SDNode<"LoongArchISD::CALL", SDT_LoongArchCall, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, @@ -2456,6 +2462,22 @@ def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), [(callseq_end timm:$amt1, timm:$amt2)]>; } // Defs = [R3], Uses = [R3] +// Stack probing +let hasSideEffects = 1, mayLoad = 1, mayStore = 1, isCodeGenOnly = 1, + Defs = [R3], Uses = [R3] in { +// Probed stack allocation of a constant size, used in function prologues when +// stack-clash protection is enabled. 
+def PROBED_STACKALLOC : Pseudo<(outs), + (ins GPR:$target), + []>, + Sched<[]>; +let usesCustomInserter = 1 in +def PROBED_STACKALLOC_DYN : Pseudo<(outs), + (ins GPR:$target), + [(loongarch_probed_alloca GPR:$target)]>, + Sched<[]>; +} + //===----------------------------------------------------------------------===// // Assembler Pseudo Instructions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h b/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h index 904985c189dba..8e0a7f052961e 100644 --- a/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h +++ b/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h @@ -43,6 +43,8 @@ class LoongArchMachineFunctionInfo : public MachineFunctionInfo { /// `annotate-tablejump` option. SmallVector<std::pair<MachineInstr *, int>, 4> JumpInfos; + bool HasDynamicAllocation = false; + public: LoongArchMachineFunctionInfo(const Function &F, const TargetSubtargetInfo *STI) {} @@ -82,6 +84,9 @@ class LoongArchMachineFunctionInfo : public MachineFunctionInfo { unsigned getJumpInfoSize() { return JumpInfos.size(); } MachineInstr *getJumpInfoJrMI(unsigned Idx) { return JumpInfos[Idx].first; } int getJumpInfoJTIIndex(unsigned Idx) { return JumpInfos[Idx].second; } + + bool hasDynamicAllocation() const { return HasDynamicAllocation; } + void setDynamicAllocation() { HasDynamicAllocation = true; } }; } // end namespace llvm >From 35ec5d07cf65df3744e15f711d2cfce796c34b4d Mon Sep 17 00:00:00 2001 From: Rong Bao <[email protected]> Date: Sat, 2 May 2026 15:15:52 +0800 Subject: [PATCH 2/3] [clang][LoongArch] Render stack-clash-protection flag on LoongArch --- clang/lib/Driver/ToolChains/Clang.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 92b3045dceff2..1777228d91c94 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ 
b/clang/lib/Driver/ToolChains/Clang.cpp @@ -3646,7 +3646,7 @@ static void RenderSCPOptions(const ToolChain &TC, const ArgList &Args, if (!EffectiveTriple.isX86() && !EffectiveTriple.isSystemZ() && !EffectiveTriple.isPPC64() && !EffectiveTriple.isAArch64() && - !EffectiveTriple.isRISCV()) + !EffectiveTriple.isRISCV() && !EffectiveTriple.isLoongArch()) return; Args.addOptInFlag(CmdArgs, options::OPT_fstack_clash_protection, >From 2e2752d462acbe6d8c72a83e5a68428e274653a8 Mon Sep 17 00:00:00 2001 From: Rong Bao <[email protected]> Date: Sat, 2 May 2026 20:29:15 +0800 Subject: [PATCH 3/3] [test][LoongArch] Add and update tests for stack clash protection --- clang/test/CodeGen/stack-clash-protection.c | 1 + .../LoongArch/inline-asm-constraint-f.ll | 2 - .../stack-clash-prologue-nounwind.ll | 351 +++++++++ .../CodeGen/LoongArch/stack-clash-prologue.ll | 714 ++++++++++++++++++ .../stack-probing-dynamic-nonentry.ll | 109 +++ .../LoongArch/stack-probing-dynamic.ll | 479 ++++++++++++ .../LoongArch/stack-probing-frame-setup.mir | 185 +++++ 7 files changed, 1839 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/LoongArch/stack-clash-prologue-nounwind.ll create mode 100644 llvm/test/CodeGen/LoongArch/stack-clash-prologue.ll create mode 100644 llvm/test/CodeGen/LoongArch/stack-probing-dynamic-nonentry.ll create mode 100644 llvm/test/CodeGen/LoongArch/stack-probing-dynamic.ll create mode 100644 llvm/test/CodeGen/LoongArch/stack-probing-frame-setup.mir diff --git a/clang/test/CodeGen/stack-clash-protection.c b/clang/test/CodeGen/stack-clash-protection.c index b07e4c4ce9084..b00cd46f8d24b 100644 --- a/clang/test/CodeGen/stack-clash-protection.c +++ b/clang/test/CodeGen/stack-clash-protection.c @@ -4,6 +4,7 @@ // RUN: %clang_cc1 -triple powerpc64le-linux-gnu -O0 -emit-llvm -o- %s -fstack-clash-protection -mstack-probe-size=8192 | FileCheck %s // RUN: %clang_cc1 -triple powerpc64-linux-gnu -O0 -emit-llvm -o- %s -fstack-clash-protection -mstack-probe-size=8192 | 
FileCheck %s // RUN: %clang_cc1 -triple aarch64-linux-gnu -O0 -emit-llvm -o- %s -fstack-clash-protection -mstack-probe-size=8192 | FileCheck %s +// RUN: %clang_cc1 -triple loongarch64-linux-gnu -O0 -emit-llvm -o- %s -fstack-clash-protection -mstack-probe-size=8192 | FileCheck %s // CHECK: define{{.*}} void @large_stack() #[[A:.*]] { void large_stack(void) { diff --git a/llvm/test/CodeGen/LoongArch/inline-asm-constraint-f.ll b/llvm/test/CodeGen/LoongArch/inline-asm-constraint-f.ll index b5f1c23a95207..9d66957037938 100644 --- a/llvm/test/CodeGen/LoongArch/inline-asm-constraint-f.ll +++ b/llvm/test/CodeGen/LoongArch/inline-asm-constraint-f.ll @@ -33,7 +33,6 @@ define double @constraint_f_double(double %a) nounwind { define double @constraint_gpr(double %a) { ; LA32-LABEL: constraint_gpr: ; LA32: # %bb.0: -; LA32-NEXT: .cfi_def_cfa_offset 0 ; LA32-NEXT: movfr2gr.s $a7, $fa0 ; LA32-NEXT: movfrh2gr.s $t0, $fa0 ; LA32-NEXT: #APP @@ -45,7 +44,6 @@ define double @constraint_gpr(double %a) { ; ; LA64-LABEL: constraint_gpr: ; LA64: # %bb.0: -; LA64-NEXT: .cfi_def_cfa_offset 0 ; LA64-NEXT: movfr2gr.d $a7, $fa0 ; LA64-NEXT: #APP ; LA64-NEXT: move $a6, $a7 diff --git a/llvm/test/CodeGen/LoongArch/stack-clash-prologue-nounwind.ll b/llvm/test/CodeGen/LoongArch/stack-clash-prologue-nounwind.ll new file mode 100644 index 0000000000000..4e1b745a2e041 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/stack-clash-prologue-nounwind.ll @@ -0,0 +1,351 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=loongarch64-unknown-linux-gnu -O2 < %s -verify-machineinstrs | FileCheck %s -check-prefix=LA64 +; RUN: llc -mtriple=loongarch32-unknown-linux-gnu -O2 < %s -verify-machineinstrs | FileCheck %s -check-prefix=LA32 + +; Tests copied from PowerPC. 
+ +; Free probe +define i8 @f0() #0 nounwind { +; +; LA64-LABEL: f0: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -80 +; LA64-NEXT: ori $a0, $zero, 3 +; LA64-NEXT: st.b $a0, $sp, 16 +; LA64-NEXT: ld.b $a0, $sp, 16 +; LA64-NEXT: addi.d $sp, $sp, 80 +; LA64-NEXT: ret +; +; LA32-LABEL: f0: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -64 +; LA32-NEXT: ori $a0, $zero, 3 +; LA32-NEXT: st.b $a0, $sp, 0 +; LA32-NEXT: ld.b $a0, $sp, 0 +; LA32-NEXT: addi.w $sp, $sp, 64 +; LA32-NEXT: ret +entry: + %a = alloca i8, i64 64 + %b = getelementptr inbounds i8, ptr %a, i64 63 + store volatile i8 3, ptr %a + %c = load volatile i8, ptr %a + ret i8 %c +} + +define i8 @f1() #0 nounwind { +; +; LA64-LABEL: f1: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, 1 +; LA64-NEXT: sub.d $sp, $sp, $a0 +; LA64-NEXT: st.d $zero, $sp, 0 +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: ori $a0, $zero, 3 +; LA64-NEXT: st.b $a0, $sp, 16 +; LA64-NEXT: ld.b $a0, $sp, 16 +; LA64-NEXT: lu12i.w $a1, 1 +; LA64-NEXT: ori $a1, $a1, 16 +; LA64-NEXT: add.d $sp, $sp, $a1 +; LA64-NEXT: ret +; +; LA32-LABEL: f1: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, 1 +; LA32-NEXT: sub.w $sp, $sp, $a0 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: ori $a0, $zero, 3 +; LA32-NEXT: st.b $a0, $sp, 16 +; LA32-NEXT: ld.b $a0, $sp, 16 +; LA32-NEXT: lu12i.w $a1, 1 +; LA32-NEXT: ori $a1, $a1, 16 +; LA32-NEXT: add.w $sp, $sp, $a1 +; LA32-NEXT: ret +entry: + %a = alloca i8, i64 4096 + %b = getelementptr inbounds i8, ptr %a, i64 63 + store volatile i8 3, ptr %a + %c = load volatile i8, ptr %a + ret i8 %c +} + +define i8 @f2() #0 nounwind { +; +; LA64-LABEL: f2: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, 16 +; LA64-NEXT: sub.d $t1, $sp, $a0 +; LA64-NEXT: lu12i.w $t2, 1 +; LA64-NEXT: .LBB2_1: # %entry +; LA64-NEXT: # =>This Inner Loop Header: Depth=1 +; LA64-NEXT: sub.d $sp, $sp, $t2 +; LA64-NEXT: st.d $zero, $sp, 0 +; LA64-NEXT: bne $sp, $t1, .LBB2_1 
+; LA64-NEXT: # %bb.2: # %entry +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: ori $a0, $zero, 3 +; LA64-NEXT: st.b $a0, $sp, 16 +; LA64-NEXT: ld.b $a0, $sp, 16 +; LA64-NEXT: lu12i.w $a1, 16 +; LA64-NEXT: ori $a1, $a1, 16 +; LA64-NEXT: add.d $sp, $sp, $a1 +; LA64-NEXT: ret +; +; LA32-LABEL: f2: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, 16 +; LA32-NEXT: sub.w $t1, $sp, $a0 +; LA32-NEXT: lu12i.w $t2, 1 +; LA32-NEXT: .LBB2_1: # %entry +; LA32-NEXT: # =>This Inner Loop Header: Depth=1 +; LA32-NEXT: sub.w $sp, $sp, $t2 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: bne $sp, $t1, .LBB2_1 +; LA32-NEXT: # %bb.2: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: ori $a0, $zero, 3 +; LA32-NEXT: st.b $a0, $sp, 16 +; LA32-NEXT: ld.b $a0, $sp, 16 +; LA32-NEXT: lu12i.w $a1, 16 +; LA32-NEXT: ori $a1, $a1, 16 +; LA32-NEXT: add.w $sp, $sp, $a1 +; LA32-NEXT: ret +entry: + %a = alloca i8, i64 65536 + %b = getelementptr inbounds i8, ptr %a, i64 63 + store volatile i8 3, ptr %a + %c = load volatile i8, ptr %a + ret i8 %c +} + +define i8 @f3() #0 "stack-probe-size"="32768" nounwind { +; +; LA64-LABEL: f3: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, 8 +; LA64-NEXT: sub.d $sp, $sp, $a0 +; LA64-NEXT: st.d $zero, $sp, 0 +; LA64-NEXT: lu12i.w $a0, 8 +; LA64-NEXT: sub.d $sp, $sp, $a0 +; LA64-NEXT: st.d $zero, $sp, 0 +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: ori $a0, $zero, 3 +; LA64-NEXT: st.b $a0, $sp, 16 +; LA64-NEXT: ld.b $a0, $sp, 16 +; LA64-NEXT: lu12i.w $a1, 16 +; LA64-NEXT: ori $a1, $a1, 16 +; LA64-NEXT: add.d $sp, $sp, $a1 +; LA64-NEXT: ret +; +; LA32-LABEL: f3: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, 8 +; LA32-NEXT: sub.w $sp, $sp, $a0 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: lu12i.w $a0, 8 +; LA32-NEXT: sub.w $sp, $sp, $a0 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: ori $a0, $zero, 3 +; LA32-NEXT: st.b $a0, $sp, 16 +; LA32-NEXT: ld.b $a0, $sp, 16 +; LA32-NEXT: lu12i.w $a1, 16 +; 
LA32-NEXT: ori $a1, $a1, 16 +; LA32-NEXT: add.w $sp, $sp, $a1 +; LA32-NEXT: ret +entry: + %a = alloca i8, i64 65536 + %b = getelementptr inbounds i8, ptr %a, i64 63 + store volatile i8 3, ptr %a + %c = load volatile i8, ptr %a + ret i8 %c +} + +; Same as f2, but without protection. +define i8 @f4() nounwind { +; +; LA64-LABEL: f4: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, 16 +; LA64-NEXT: ori $a0, $a0, 16 +; LA64-NEXT: sub.d $sp, $sp, $a0 +; LA64-NEXT: ori $a0, $zero, 3 +; LA64-NEXT: st.b $a0, $sp, 16 +; LA64-NEXT: ld.b $a0, $sp, 16 +; LA64-NEXT: lu12i.w $a1, 16 +; LA64-NEXT: ori $a1, $a1, 16 +; LA64-NEXT: add.d $sp, $sp, $a1 +; LA64-NEXT: ret +; +; LA32-LABEL: f4: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, 16 +; LA32-NEXT: ori $a0, $a0, 16 +; LA32-NEXT: sub.w $sp, $sp, $a0 +; LA32-NEXT: ori $a0, $zero, 3 +; LA32-NEXT: st.b $a0, $sp, 16 +; LA32-NEXT: ld.b $a0, $sp, 16 +; LA32-NEXT: lu12i.w $a1, 16 +; LA32-NEXT: ori $a1, $a1, 16 +; LA32-NEXT: add.w $sp, $sp, $a1 +; LA32-NEXT: ret +entry: + %a = alloca i8, i64 65536 + %b = getelementptr inbounds i8, ptr %a, i64 63 + store volatile i8 3, ptr %a + %c = load volatile i8, ptr %a + ret i8 %c +} + +define i8 @f5() #0 "stack-probe-size"="65536" nounwind { +; +; LA64-LABEL: f5: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, 256 +; LA64-NEXT: sub.d $t1, $sp, $a0 +; LA64-NEXT: lu12i.w $t2, 16 +; LA64-NEXT: .LBB5_1: # %entry +; LA64-NEXT: # =>This Inner Loop Header: Depth=1 +; LA64-NEXT: sub.d $sp, $sp, $t2 +; LA64-NEXT: st.d $zero, $sp, 0 +; LA64-NEXT: bne $sp, $t1, .LBB5_1 +; LA64-NEXT: # %bb.2: # %entry +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: ori $a0, $zero, 3 +; LA64-NEXT: st.b $a0, $sp, 16 +; LA64-NEXT: ld.b $a0, $sp, 16 +; LA64-NEXT: lu12i.w $a1, 256 +; LA64-NEXT: ori $a1, $a1, 16 +; LA64-NEXT: add.d $sp, $sp, $a1 +; LA64-NEXT: ret +; +; LA32-LABEL: f5: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, 256 +; LA32-NEXT: sub.w $t1, $sp, $a0 +; LA32-NEXT: lu12i.w $t2, 16 +; 
LA32-NEXT: .LBB5_1: # %entry +; LA32-NEXT: # =>This Inner Loop Header: Depth=1 +; LA32-NEXT: sub.w $sp, $sp, $t2 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: bne $sp, $t1, .LBB5_1 +; LA32-NEXT: # %bb.2: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: ori $a0, $zero, 3 +; LA32-NEXT: st.b $a0, $sp, 16 +; LA32-NEXT: ld.b $a0, $sp, 16 +; LA32-NEXT: lu12i.w $a1, 256 +; LA32-NEXT: ori $a1, $a1, 16 +; LA32-NEXT: add.w $sp, $sp, $a1 +; LA32-NEXT: ret +entry: + %a = alloca i8, i64 1048576 + %b = getelementptr inbounds i8, ptr %a, i64 63 + store volatile i8 3, ptr %a + %c = load volatile i8, ptr %a + ret i8 %c +} + +define i8 @f6() #0 nounwind { +; +; LA64-LABEL: f6: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, 262144 +; LA64-NEXT: sub.d $t1, $sp, $a0 +; LA64-NEXT: lu12i.w $t2, 1 +; LA64-NEXT: .LBB6_1: # %entry +; LA64-NEXT: # =>This Inner Loop Header: Depth=1 +; LA64-NEXT: sub.d $sp, $sp, $t2 +; LA64-NEXT: st.d $zero, $sp, 0 +; LA64-NEXT: bne $sp, $t1, .LBB6_1 +; LA64-NEXT: # %bb.2: # %entry +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: ori $a0, $zero, 3 +; LA64-NEXT: st.b $a0, $sp, 16 +; LA64-NEXT: ld.b $a0, $sp, 16 +; LA64-NEXT: lu12i.w $a1, 262144 +; LA64-NEXT: ori $a1, $a1, 16 +; LA64-NEXT: add.d $sp, $sp, $a1 +; LA64-NEXT: ret +; +; LA32-LABEL: f6: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, 262144 +; LA32-NEXT: sub.w $t1, $sp, $a0 +; LA32-NEXT: lu12i.w $t2, 1 +; LA32-NEXT: .LBB6_1: # %entry +; LA32-NEXT: # =>This Inner Loop Header: Depth=1 +; LA32-NEXT: sub.w $sp, $sp, $t2 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: bne $sp, $t1, .LBB6_1 +; LA32-NEXT: # %bb.2: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: ori $a0, $zero, 3 +; LA32-NEXT: st.b $a0, $sp, 16 +; LA32-NEXT: ld.b $a0, $sp, 16 +; LA32-NEXT: lu12i.w $a1, 262144 +; LA32-NEXT: ori $a1, $a1, 16 +; LA32-NEXT: add.w $sp, $sp, $a1 +; LA32-NEXT: ret +entry: + %a = alloca i8, i64 1073741824 + %b = getelementptr inbounds i8, ptr %a, i64 63 + store volatile i8 3, ptr %a + 
%c = load volatile i8, ptr %a + ret i8 %c +} + +define i8 @f7() #0 "stack-probe-size"="65536" nounwind { +; +; LA64-LABEL: f7: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, 244128 +; LA64-NEXT: sub.d $t1, $sp, $a0 +; LA64-NEXT: lu12i.w $t2, 16 +; LA64-NEXT: .LBB7_1: # %entry +; LA64-NEXT: # =>This Inner Loop Header: Depth=1 +; LA64-NEXT: sub.d $sp, $sp, $t2 +; LA64-NEXT: st.d $zero, $sp, 0 +; LA64-NEXT: bne $sp, $t1, .LBB7_1 +; LA64-NEXT: # %bb.2: # %entry +; LA64-NEXT: lu12i.w $a0, 12 +; LA64-NEXT: ori $a0, $a0, 2576 +; LA64-NEXT: sub.d $sp, $sp, $a0 +; LA64-NEXT: ori $a0, $zero, 3 +; LA64-NEXT: st.b $a0, $sp, 9 +; LA64-NEXT: ld.b $a0, $sp, 9 +; LA64-NEXT: lu12i.w $a1, 244140 +; LA64-NEXT: ori $a1, $a1, 2576 +; LA64-NEXT: add.d $sp, $sp, $a1 +; LA64-NEXT: ret +; +; LA32-LABEL: f7: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, 244128 +; LA32-NEXT: sub.w $t1, $sp, $a0 +; LA32-NEXT: lu12i.w $t2, 16 +; LA32-NEXT: .LBB7_1: # %entry +; LA32-NEXT: # =>This Inner Loop Header: Depth=1 +; LA32-NEXT: sub.w $sp, $sp, $t2 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: bne $sp, $t1, .LBB7_1 +; LA32-NEXT: # %bb.2: # %entry +; LA32-NEXT: lu12i.w $a0, 12 +; LA32-NEXT: ori $a0, $a0, 2576 +; LA32-NEXT: sub.w $sp, $sp, $a0 +; LA32-NEXT: ori $a0, $zero, 3 +; LA32-NEXT: st.b $a0, $sp, 9 +; LA32-NEXT: ld.b $a0, $sp, 9 +; LA32-NEXT: lu12i.w $a1, 244140 +; LA32-NEXT: ori $a1, $a1, 2576 +; LA32-NEXT: add.w $sp, $sp, $a1 +; LA32-NEXT: ret +entry: + %a = alloca i8, i64 1000000007 + %b = getelementptr inbounds i8, ptr %a, i64 101 + store volatile i8 3, ptr %a + %c = load volatile i8, ptr %a + ret i8 %c +} + +attributes #0 = { "probe-stack"="inline-asm" } diff --git a/llvm/test/CodeGen/LoongArch/stack-clash-prologue.ll b/llvm/test/CodeGen/LoongArch/stack-clash-prologue.ll new file mode 100644 index 0000000000000..d42f3c861ec7b --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/stack-clash-prologue.ll @@ -0,0 +1,714 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py +; RUN: llc -mtriple=loongarch64 -O2 < %s | FileCheck %s -check-prefix=LA64 +; RUN: llc -mtriple=loongarch32 -O2 < %s | FileCheck %s -check-prefix=LA32 + +; Tests copied from PowerPC. + +; Free probe +define i8 @f0() #0 { +; +; LA64-LABEL: f0: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -80 +; LA64-NEXT: .cfi_def_cfa_offset 80 +; LA64-NEXT: ori $a0, $zero, 3 +; LA64-NEXT: st.b $a0, $sp, 16 +; LA64-NEXT: ld.b $a0, $sp, 16 +; LA64-NEXT: addi.d $sp, $sp, 80 +; LA64-NEXT: ret +; +; LA32-LABEL: f0: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -64 +; LA32-NEXT: .cfi_def_cfa_offset 64 +; LA32-NEXT: ori $a0, $zero, 3 +; LA32-NEXT: st.b $a0, $sp, 0 +; LA32-NEXT: ld.b $a0, $sp, 0 +; LA32-NEXT: addi.w $sp, $sp, 64 +; LA32-NEXT: ret +entry: + %a = alloca i8, i64 64 + %b = getelementptr inbounds i8, ptr %a, i64 63 + store volatile i8 3, ptr %a + %c = load volatile i8, ptr %a + ret i8 %c +} + +define i8 @f1() #0 { +; +; LA64-LABEL: f1: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, 1 +; LA64-NEXT: sub.d $sp, $sp, $a0 +; LA64-NEXT: st.d $zero, $sp, 0 +; LA64-NEXT: .cfi_def_cfa_offset 4096 +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: .cfi_def_cfa_offset 4112 +; LA64-NEXT: ori $a0, $zero, 3 +; LA64-NEXT: st.b $a0, $sp, 16 +; LA64-NEXT: ld.b $a0, $sp, 16 +; LA64-NEXT: lu12i.w $a1, 1 +; LA64-NEXT: ori $a1, $a1, 16 +; LA64-NEXT: add.d $sp, $sp, $a1 +; LA64-NEXT: ret +; +; LA32-LABEL: f1: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, 1 +; LA32-NEXT: sub.w $sp, $sp, $a0 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: .cfi_def_cfa_offset 4096 +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: .cfi_def_cfa_offset 4112 +; LA32-NEXT: ori $a0, $zero, 3 +; LA32-NEXT: st.b $a0, $sp, 16 +; LA32-NEXT: ld.b $a0, $sp, 16 +; LA32-NEXT: lu12i.w $a1, 1 +; LA32-NEXT: ori $a1, $a1, 16 +; LA32-NEXT: add.w $sp, $sp, $a1 +; LA32-NEXT: ret +entry: + %a = alloca i8, i64 4096 + %b = getelementptr inbounds i8, ptr %a, i64 63 + store 
volatile i8 3, ptr %a + %c = load volatile i8, ptr %a + ret i8 %c +} + +define i8 @f2() #0 { +; +; LA64-LABEL: f2: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, 16 +; LA64-NEXT: sub.d $t1, $sp, $a0 +; LA64-NEXT: .cfi_def_cfa 13, 65536 +; LA64-NEXT: lu12i.w $t2, 1 +; LA64-NEXT: .LBB2_1: # %entry +; LA64-NEXT: # =>This Inner Loop Header: Depth=1 +; LA64-NEXT: sub.d $sp, $sp, $t2 +; LA64-NEXT: st.d $zero, $sp, 0 +; LA64-NEXT: bne $sp, $t1, .LBB2_1 +; LA64-NEXT: # %bb.2: # %entry +; LA64-NEXT: .cfi_def_cfa_register 3 +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: .cfi_def_cfa_offset 65552 +; LA64-NEXT: ori $a0, $zero, 3 +; LA64-NEXT: st.b $a0, $sp, 16 +; LA64-NEXT: ld.b $a0, $sp, 16 +; LA64-NEXT: lu12i.w $a1, 16 +; LA64-NEXT: ori $a1, $a1, 16 +; LA64-NEXT: add.d $sp, $sp, $a1 +; LA64-NEXT: ret +; +; LA32-LABEL: f2: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, 16 +; LA32-NEXT: sub.w $t1, $sp, $a0 +; LA32-NEXT: .cfi_def_cfa 13, 65536 +; LA32-NEXT: lu12i.w $t2, 1 +; LA32-NEXT: .LBB2_1: # %entry +; LA32-NEXT: # =>This Inner Loop Header: Depth=1 +; LA32-NEXT: sub.w $sp, $sp, $t2 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: bne $sp, $t1, .LBB2_1 +; LA32-NEXT: # %bb.2: # %entry +; LA32-NEXT: .cfi_def_cfa_register 3 +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: .cfi_def_cfa_offset 65552 +; LA32-NEXT: ori $a0, $zero, 3 +; LA32-NEXT: st.b $a0, $sp, 16 +; LA32-NEXT: ld.b $a0, $sp, 16 +; LA32-NEXT: lu12i.w $a1, 16 +; LA32-NEXT: ori $a1, $a1, 16 +; LA32-NEXT: add.w $sp, $sp, $a1 +; LA32-NEXT: ret +entry: + %a = alloca i8, i64 65536 + %b = getelementptr inbounds i8, ptr %a, i64 63 + store volatile i8 3, ptr %a + %c = load volatile i8, ptr %a + ret i8 %c +} + +define i8 @f3() #0 "stack-probe-size"="32768" { +; +; LA64-LABEL: f3: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, 8 +; LA64-NEXT: sub.d $sp, $sp, $a0 +; LA64-NEXT: st.d $zero, $sp, 0 +; LA64-NEXT: .cfi_def_cfa_offset 32768 +; LA64-NEXT: lu12i.w $a0, 8 +; LA64-NEXT: sub.d $sp, $sp, $a0 +; 
LA64-NEXT: st.d $zero, $sp, 0 +; LA64-NEXT: .cfi_def_cfa_offset 65536 +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: .cfi_def_cfa_offset 65552 +; LA64-NEXT: ori $a0, $zero, 3 +; LA64-NEXT: st.b $a0, $sp, 16 +; LA64-NEXT: ld.b $a0, $sp, 16 +; LA64-NEXT: lu12i.w $a1, 16 +; LA64-NEXT: ori $a1, $a1, 16 +; LA64-NEXT: add.d $sp, $sp, $a1 +; LA64-NEXT: ret +; +; LA32-LABEL: f3: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, 8 +; LA32-NEXT: sub.w $sp, $sp, $a0 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: .cfi_def_cfa_offset 32768 +; LA32-NEXT: lu12i.w $a0, 8 +; LA32-NEXT: sub.w $sp, $sp, $a0 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: .cfi_def_cfa_offset 65536 +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: .cfi_def_cfa_offset 65552 +; LA32-NEXT: ori $a0, $zero, 3 +; LA32-NEXT: st.b $a0, $sp, 16 +; LA32-NEXT: ld.b $a0, $sp, 16 +; LA32-NEXT: lu12i.w $a1, 16 +; LA32-NEXT: ori $a1, $a1, 16 +; LA32-NEXT: add.w $sp, $sp, $a1 +; LA32-NEXT: ret +entry: + %a = alloca i8, i64 65536 + %b = getelementptr inbounds i8, ptr %a, i64 63 + store volatile i8 3, ptr %a + %c = load volatile i8, ptr %a + ret i8 %c +} + +; Same as f2, but without protection. 
+define i8 @f4() { +; +; LA64-LABEL: f4: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, 16 +; LA64-NEXT: ori $a0, $a0, 16 +; LA64-NEXT: sub.d $sp, $sp, $a0 +; LA64-NEXT: .cfi_def_cfa_offset 65552 +; LA64-NEXT: ori $a0, $zero, 3 +; LA64-NEXT: st.b $a0, $sp, 16 +; LA64-NEXT: ld.b $a0, $sp, 16 +; LA64-NEXT: lu12i.w $a1, 16 +; LA64-NEXT: ori $a1, $a1, 16 +; LA64-NEXT: add.d $sp, $sp, $a1 +; LA64-NEXT: ret +; +; LA32-LABEL: f4: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, 16 +; LA32-NEXT: ori $a0, $a0, 16 +; LA32-NEXT: sub.w $sp, $sp, $a0 +; LA32-NEXT: .cfi_def_cfa_offset 65552 +; LA32-NEXT: ori $a0, $zero, 3 +; LA32-NEXT: st.b $a0, $sp, 16 +; LA32-NEXT: ld.b $a0, $sp, 16 +; LA32-NEXT: lu12i.w $a1, 16 +; LA32-NEXT: ori $a1, $a1, 16 +; LA32-NEXT: add.w $sp, $sp, $a1 +; LA32-NEXT: ret +entry: + %a = alloca i8, i64 65536 + %b = getelementptr inbounds i8, ptr %a, i64 63 + store volatile i8 3, ptr %a + %c = load volatile i8, ptr %a + ret i8 %c +} + +define i8 @f5() #0 "stack-probe-size"="65536" { +; +; LA64-LABEL: f5: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, 256 +; LA64-NEXT: sub.d $t1, $sp, $a0 +; LA64-NEXT: .cfi_def_cfa 13, 1048576 +; LA64-NEXT: lu12i.w $t2, 16 +; LA64-NEXT: .LBB5_1: # %entry +; LA64-NEXT: # =>This Inner Loop Header: Depth=1 +; LA64-NEXT: sub.d $sp, $sp, $t2 +; LA64-NEXT: st.d $zero, $sp, 0 +; LA64-NEXT: bne $sp, $t1, .LBB5_1 +; LA64-NEXT: # %bb.2: # %entry +; LA64-NEXT: .cfi_def_cfa_register 3 +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: .cfi_def_cfa_offset 1048592 +; LA64-NEXT: ori $a0, $zero, 3 +; LA64-NEXT: st.b $a0, $sp, 16 +; LA64-NEXT: ld.b $a0, $sp, 16 +; LA64-NEXT: lu12i.w $a1, 256 +; LA64-NEXT: ori $a1, $a1, 16 +; LA64-NEXT: add.d $sp, $sp, $a1 +; LA64-NEXT: ret +; +; LA32-LABEL: f5: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, 256 +; LA32-NEXT: sub.w $t1, $sp, $a0 +; LA32-NEXT: .cfi_def_cfa 13, 1048576 +; LA32-NEXT: lu12i.w $t2, 16 +; LA32-NEXT: .LBB5_1: # %entry +; LA32-NEXT: # =>This Inner Loop Header: 
Depth=1 +; LA32-NEXT: sub.w $sp, $sp, $t2 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: bne $sp, $t1, .LBB5_1 +; LA32-NEXT: # %bb.2: # %entry +; LA32-NEXT: .cfi_def_cfa_register 3 +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: .cfi_def_cfa_offset 1048592 +; LA32-NEXT: ori $a0, $zero, 3 +; LA32-NEXT: st.b $a0, $sp, 16 +; LA32-NEXT: ld.b $a0, $sp, 16 +; LA32-NEXT: lu12i.w $a1, 256 +; LA32-NEXT: ori $a1, $a1, 16 +; LA32-NEXT: add.w $sp, $sp, $a1 +; LA32-NEXT: ret +entry: + %a = alloca i8, i64 1048576 + %b = getelementptr inbounds i8, ptr %a, i64 63 + store volatile i8 3, ptr %a + %c = load volatile i8, ptr %a + ret i8 %c +} + +define i8 @f6() #0 { +; +; LA64-LABEL: f6: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, 262144 +; LA64-NEXT: sub.d $t1, $sp, $a0 +; LA64-NEXT: .cfi_def_cfa 13, 1073741824 +; LA64-NEXT: lu12i.w $t2, 1 +; LA64-NEXT: .LBB6_1: # %entry +; LA64-NEXT: # =>This Inner Loop Header: Depth=1 +; LA64-NEXT: sub.d $sp, $sp, $t2 +; LA64-NEXT: st.d $zero, $sp, 0 +; LA64-NEXT: bne $sp, $t1, .LBB6_1 +; LA64-NEXT: # %bb.2: # %entry +; LA64-NEXT: .cfi_def_cfa_register 3 +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: .cfi_def_cfa_offset 1073741840 +; LA64-NEXT: ori $a0, $zero, 3 +; LA64-NEXT: st.b $a0, $sp, 16 +; LA64-NEXT: ld.b $a0, $sp, 16 +; LA64-NEXT: lu12i.w $a1, 262144 +; LA64-NEXT: ori $a1, $a1, 16 +; LA64-NEXT: add.d $sp, $sp, $a1 +; LA64-NEXT: ret +; +; LA32-LABEL: f6: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, 262144 +; LA32-NEXT: sub.w $t1, $sp, $a0 +; LA32-NEXT: .cfi_def_cfa 13, 1073741824 +; LA32-NEXT: lu12i.w $t2, 1 +; LA32-NEXT: .LBB6_1: # %entry +; LA32-NEXT: # =>This Inner Loop Header: Depth=1 +; LA32-NEXT: sub.w $sp, $sp, $t2 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: bne $sp, $t1, .LBB6_1 +; LA32-NEXT: # %bb.2: # %entry +; LA32-NEXT: .cfi_def_cfa_register 3 +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: .cfi_def_cfa_offset 1073741840 +; LA32-NEXT: ori $a0, $zero, 3 +; LA32-NEXT: st.b $a0, $sp, 16 +; LA32-NEXT: ld.b 
$a0, $sp, 16 +; LA32-NEXT: lu12i.w $a1, 262144 +; LA32-NEXT: ori $a1, $a1, 16 +; LA32-NEXT: add.w $sp, $sp, $a1 +; LA32-NEXT: ret +entry: + %a = alloca i8, i64 1073741824 + %b = getelementptr inbounds i8, ptr %a, i64 63 + store volatile i8 3, ptr %a + %c = load volatile i8, ptr %a + ret i8 %c +} + +define i8 @f7() #0 "stack-probe-size"="65536" { +; +; LA64-LABEL: f7: +; LA64: # %bb.0: # %entry +; LA64-NEXT: lu12i.w $a0, 244128 +; LA64-NEXT: sub.d $t1, $sp, $a0 +; LA64-NEXT: .cfi_def_cfa 13, 999948288 +; LA64-NEXT: lu12i.w $t2, 16 +; LA64-NEXT: .LBB7_1: # %entry +; LA64-NEXT: # =>This Inner Loop Header: Depth=1 +; LA64-NEXT: sub.d $sp, $sp, $t2 +; LA64-NEXT: st.d $zero, $sp, 0 +; LA64-NEXT: bne $sp, $t1, .LBB7_1 +; LA64-NEXT: # %bb.2: # %entry +; LA64-NEXT: .cfi_def_cfa_register 3 +; LA64-NEXT: lu12i.w $a0, 12 +; LA64-NEXT: ori $a0, $a0, 2576 +; LA64-NEXT: sub.d $sp, $sp, $a0 +; LA64-NEXT: .cfi_def_cfa_offset 1000000016 +; LA64-NEXT: ori $a0, $zero, 3 +; LA64-NEXT: st.b $a0, $sp, 9 +; LA64-NEXT: ld.b $a0, $sp, 9 +; LA64-NEXT: lu12i.w $a1, 244140 +; LA64-NEXT: ori $a1, $a1, 2576 +; LA64-NEXT: add.d $sp, $sp, $a1 +; LA64-NEXT: ret +; +; LA32-LABEL: f7: +; LA32: # %bb.0: # %entry +; LA32-NEXT: lu12i.w $a0, 244128 +; LA32-NEXT: sub.w $t1, $sp, $a0 +; LA32-NEXT: .cfi_def_cfa 13, 999948288 +; LA32-NEXT: lu12i.w $t2, 16 +; LA32-NEXT: .LBB7_1: # %entry +; LA32-NEXT: # =>This Inner Loop Header: Depth=1 +; LA32-NEXT: sub.w $sp, $sp, $t2 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: bne $sp, $t1, .LBB7_1 +; LA32-NEXT: # %bb.2: # %entry +; LA32-NEXT: .cfi_def_cfa_register 3 +; LA32-NEXT: lu12i.w $a0, 12 +; LA32-NEXT: ori $a0, $a0, 2576 +; LA32-NEXT: sub.w $sp, $sp, $a0 +; LA32-NEXT: .cfi_def_cfa_offset 1000000016 +; LA32-NEXT: ori $a0, $zero, 3 +; LA32-NEXT: st.b $a0, $sp, 9 +; LA32-NEXT: ld.b $a0, $sp, 9 +; LA32-NEXT: lu12i.w $a1, 244140 +; LA32-NEXT: ori $a1, $a1, 2576 +; LA32-NEXT: add.w $sp, $sp, $a1 +; LA32-NEXT: ret +entry: + %a = alloca i8, i64 1000000007 + %b = 
getelementptr inbounds i8, ptr %a, i64 101 + store volatile i8 3, ptr %a + %c = load volatile i8, ptr %a + ret i8 %c +} + +; alloca + align < probe_size +define i32 @f8(i64 %i) local_unnamed_addr #0 { +; +; LA64-LABEL: f8: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -896 +; LA64-NEXT: .cfi_def_cfa_offset 896 +; LA64-NEXT: st.d $ra, $sp, 888 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 880 # 8-byte Folded Spill +; LA64-NEXT: .cfi_offset 1, -8 +; LA64-NEXT: .cfi_offset 22, -16 +; LA64-NEXT: addi.d $fp, $sp, 896 +; LA64-NEXT: .cfi_def_cfa 22, 0 +; LA64-NEXT: bstrins.d $sp, $zero, 5, 0 +; LA64-NEXT: slli.d $a0, $a0, 2 +; LA64-NEXT: addi.d $a1, $sp, 64 +; LA64-NEXT: ori $a2, $zero, 1 +; LA64-NEXT: stx.w $a2, $a0, $a1 +; LA64-NEXT: ld.w $a0, $sp, 64 +; LA64-NEXT: addi.d $sp, $fp, -896 +; LA64-NEXT: ld.d $fp, $sp, 880 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 888 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 896 +; LA64-NEXT: ret +; +; LA32-LABEL: f8: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $sp, $sp, -832 +; LA32-NEXT: .cfi_def_cfa_offset 832 +; LA32-NEXT: st.w $ra, $sp, 828 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 824 # 4-byte Folded Spill +; LA32-NEXT: .cfi_offset 1, -4 +; LA32-NEXT: .cfi_offset 22, -8 +; LA32-NEXT: addi.w $fp, $sp, 832 +; LA32-NEXT: .cfi_def_cfa 22, 0 +; LA32-NEXT: bstrins.w $sp, $zero, 5, 0 +; LA32-NEXT: slli.w $a0, $a0, 2 +; LA32-NEXT: addi.w $a1, $sp, 0 +; LA32-NEXT: add.w $a0, $a1, $a0 +; LA32-NEXT: ori $a1, $zero, 1 +; LA32-NEXT: st.w $a1, $a0, 0 +; LA32-NEXT: ld.w $a0, $sp, 0 +; LA32-NEXT: addi.w $sp, $fp, -832 +; LA32-NEXT: ld.w $fp, $sp, 824 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 828 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 832 +; LA32-NEXT: ret + %a = alloca i32, i32 200, align 64 + %b = getelementptr inbounds i32, ptr %a, i64 %i + store volatile i32 1, ptr %b + %c = load volatile i32, ptr %a + ret i32 %c +} + +; alloca > probe_size, align < probe_size +define i32 @f9(i64 %i) 
local_unnamed_addr #0 { +; +; LA64-LABEL: f9: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -2032 +; LA64-NEXT: .cfi_def_cfa_offset 2032 +; LA64-NEXT: st.d $ra, $sp, 2024 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 2016 # 8-byte Folded Spill +; LA64-NEXT: .cfi_offset 1, -8 +; LA64-NEXT: .cfi_offset 22, -16 +; LA64-NEXT: addi.d $fp, $sp, 2032 +; LA64-NEXT: .cfi_def_cfa 22, 0 +; LA64-NEXT: lu12i.w $a1, 1 +; LA64-NEXT: sub.d $sp, $sp, $a1 +; LA64-NEXT: st.d $zero, $sp, 0 +; LA64-NEXT: sub.d $sp, $sp, $a1 +; LA64-NEXT: st.d $zero, $sp, 0 +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: bstrins.d $sp, $zero, 10, 0 +; LA64-NEXT: slli.d $a0, $a0, 2 +; LA64-NEXT: ori $a1, $zero, 2048 +; LA64-NEXT: add.d $a1, $sp, $a1 +; LA64-NEXT: ori $a2, $zero, 1 +; LA64-NEXT: stx.w $a2, $a0, $a1 +; LA64-NEXT: ori $a0, $zero, 2048 +; LA64-NEXT: add.d $a0, $sp, $a0 +; LA64-NEXT: ld.w $a0, $a0, 0 +; LA64-NEXT: lu12i.w $a1, 2 +; LA64-NEXT: ori $a1, $a1, 2048 +; LA64-NEXT: sub.d $sp, $fp, $a1 +; LA64-NEXT: lu12i.w $a1, 2 +; LA64-NEXT: ori $a1, $a1, 16 +; LA64-NEXT: add.d $sp, $sp, $a1 +; LA64-NEXT: ld.d $fp, $sp, 2016 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 2024 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 2032 +; LA64-NEXT: ret +; +; LA32-LABEL: f9: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $sp, $sp, -2032 +; LA32-NEXT: .cfi_def_cfa_offset 2032 +; LA32-NEXT: st.w $ra, $sp, 2028 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 2024 # 4-byte Folded Spill +; LA32-NEXT: .cfi_offset 1, -4 +; LA32-NEXT: .cfi_offset 22, -8 +; LA32-NEXT: addi.w $fp, $sp, 2032 +; LA32-NEXT: .cfi_def_cfa 22, 0 +; LA32-NEXT: lu12i.w $a1, 1 +; LA32-NEXT: sub.w $sp, $sp, $a1 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: sub.w $sp, $sp, $a1 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: bstrins.w $sp, $zero, 10, 0 +; LA32-NEXT: slli.w $a0, $a0, 2 +; LA32-NEXT: ori $a1, $zero, 2048 +; LA32-NEXT: add.w $a1, $sp, $a1 +; LA32-NEXT: add.w $a0, $a1, $a0 +; 
LA32-NEXT: ori $a1, $zero, 1 +; LA32-NEXT: st.w $a1, $a0, 0 +; LA32-NEXT: ori $a0, $zero, 2048 +; LA32-NEXT: add.w $a0, $sp, $a0 +; LA32-NEXT: ld.w $a0, $a0, 0 +; LA32-NEXT: lu12i.w $a1, 2 +; LA32-NEXT: ori $a1, $a1, 2048 +; LA32-NEXT: sub.w $sp, $fp, $a1 +; LA32-NEXT: lu12i.w $a1, 2 +; LA32-NEXT: ori $a1, $a1, 16 +; LA32-NEXT: add.w $sp, $sp, $a1 +; LA32-NEXT: ld.w $fp, $sp, 2024 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 2028 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 2032 +; LA32-NEXT: ret + %a = alloca i32, i32 2000, align 2048 + %b = getelementptr inbounds i32, ptr %a, i64 %i + store volatile i32 1, ptr %b + %c = load volatile i32, ptr %a + ret i32 %c +} + +; alloca < probe_size, align < probe_size, alloca + align > probe_size +define i32 @f10(i64 %i) local_unnamed_addr #0 { +; +; LA64-LABEL: f10: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -2032 +; LA64-NEXT: .cfi_def_cfa_offset 2032 +; LA64-NEXT: st.d $ra, $sp, 2024 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 2016 # 8-byte Folded Spill +; LA64-NEXT: .cfi_offset 1, -8 +; LA64-NEXT: .cfi_offset 22, -16 +; LA64-NEXT: addi.d $fp, $sp, 2032 +; LA64-NEXT: .cfi_def_cfa 22, 0 +; LA64-NEXT: addi.d $sp, $sp, -2048 +; LA64-NEXT: addi.d $sp, $sp, -1040 +; LA64-NEXT: bstrins.d $sp, $zero, 9, 0 +; LA64-NEXT: slli.d $a0, $a0, 2 +; LA64-NEXT: addi.d $a1, $sp, 1024 +; LA64-NEXT: ori $a2, $zero, 1 +; LA64-NEXT: stx.w $a2, $a0, $a1 +; LA64-NEXT: ld.w $a0, $sp, 1024 +; LA64-NEXT: lu12i.w $a1, 1 +; LA64-NEXT: ori $a1, $a1, 1024 +; LA64-NEXT: sub.d $sp, $fp, $a1 +; LA64-NEXT: addi.d $sp, $sp, 2032 +; LA64-NEXT: addi.d $sp, $sp, 1056 +; LA64-NEXT: ld.d $fp, $sp, 2016 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 2024 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 2032 +; LA64-NEXT: ret +; +; LA32-LABEL: f10: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $sp, $sp, -2032 +; LA32-NEXT: .cfi_def_cfa_offset 2032 +; LA32-NEXT: st.w $ra, $sp, 2028 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 
2024 # 4-byte Folded Spill +; LA32-NEXT: .cfi_offset 1, -4 +; LA32-NEXT: .cfi_offset 22, -8 +; LA32-NEXT: addi.w $fp, $sp, 2032 +; LA32-NEXT: .cfi_def_cfa 22, 0 +; LA32-NEXT: addi.w $sp, $sp, -2048 +; LA32-NEXT: addi.w $sp, $sp, -1040 +; LA32-NEXT: bstrins.w $sp, $zero, 9, 0 +; LA32-NEXT: slli.w $a0, $a0, 2 +; LA32-NEXT: addi.w $a1, $sp, 1024 +; LA32-NEXT: add.w $a0, $a1, $a0 +; LA32-NEXT: ori $a1, $zero, 1 +; LA32-NEXT: st.w $a1, $a0, 0 +; LA32-NEXT: ld.w $a0, $sp, 1024 +; LA32-NEXT: lu12i.w $a1, 1 +; LA32-NEXT: ori $a1, $a1, 1024 +; LA32-NEXT: sub.w $sp, $fp, $a1 +; LA32-NEXT: addi.w $sp, $sp, 2032 +; LA32-NEXT: addi.w $sp, $sp, 1056 +; LA32-NEXT: ld.w $fp, $sp, 2024 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 2028 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 2032 +; LA32-NEXT: ret + %a = alloca i32, i32 1000, align 1024 + %b = getelementptr inbounds i32, ptr %a, i64 %i + store volatile i32 1, ptr %b + %c = load volatile i32, ptr %a + ret i32 %c +} + +define void @f11(i32 %vla_size, i64 %i) #0 { +; +; LA64-LABEL: f11: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -2032 +; LA64-NEXT: .cfi_def_cfa_offset 2032 +; LA64-NEXT: st.d $zero, $sp, 0 +; LA64-NEXT: st.d $ra, $sp, 2024 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 2016 # 8-byte Folded Spill +; LA64-NEXT: st.d $s8, $sp, 2008 # 8-byte Folded Spill +; LA64-NEXT: .cfi_offset 1, -8 +; LA64-NEXT: .cfi_offset 22, -16 +; LA64-NEXT: .cfi_offset 31, -24 +; LA64-NEXT: addi.d $fp, $sp, 2032 +; LA64-NEXT: .cfi_def_cfa 22, 0 +; LA64-NEXT: lu12i.w $a2, 15 +; LA64-NEXT: sub.d $t1, $sp, $a2 +; LA64-NEXT: lu12i.w $t2, 1 +; LA64-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1 +; LA64-NEXT: sub.d $sp, $sp, $t2 +; LA64-NEXT: st.d $zero, $sp, 0 +; LA64-NEXT: bne $sp, $t1, .LBB11_1 +; LA64-NEXT: # %bb.2: +; LA64-NEXT: addi.d $sp, $sp, -2048 +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $zero, $sp, 0 +; LA64-NEXT: bstrins.d $sp, $zero, 14, 0 +; LA64-NEXT: move $s8, $sp +; LA64-NEXT: slli.d $a1, 
$a1, 2 +; LA64-NEXT: lu12i.w $a2, 8 +; LA64-NEXT: add.d $a2, $s8, $a2 +; LA64-NEXT: ori $a3, $zero, 1 +; LA64-NEXT: stx.w $a3, $a1, $a2 +; LA64-NEXT: bstrpick.d $a0, $a0, 31, 0 +; LA64-NEXT: addi.d $a0, $a0, 15 +; LA64-NEXT: bstrpick.d $a0, $a0, 32, 4 +; LA64-NEXT: slli.d $a0, $a0, 4 +; LA64-NEXT: sub.d $a0, $sp, $a0 +; LA64-NEXT: bstrins.d $a0, $zero, 10, 0 +; LA64-NEXT: lu12i.w $a1, 1 +; LA64-NEXT: .LBB11_3: # =>This Inner Loop Header: Depth=1 +; LA64-NEXT: sub.d $sp, $sp, $a1 +; LA64-NEXT: st.d $zero, $sp, 0 +; LA64-NEXT: bltu $a0, $sp, .LBB11_3 +; LA64-NEXT: # %bb.4: +; LA64-NEXT: move $sp, $a0 +; LA64-NEXT: ld.b $zero, $a0, 0 +; LA64-NEXT: lu12i.w $a0, 16 +; LA64-NEXT: sub.d $sp, $fp, $a0 +; LA64-NEXT: lu12i.w $a0, 15 +; LA64-NEXT: ori $a0, $a0, 2064 +; LA64-NEXT: add.d $sp, $sp, $a0 +; LA64-NEXT: ld.d $s8, $sp, 2008 # 8-byte Folded Reload +; LA64-NEXT: ld.d $fp, $sp, 2016 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 2024 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 2032 +; LA64-NEXT: ret +; +; LA32-LABEL: f11: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $sp, $sp, -2032 +; LA32-NEXT: .cfi_def_cfa_offset 2032 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: st.w $ra, $sp, 2028 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 2024 # 4-byte Folded Spill +; LA32-NEXT: st.w $s8, $sp, 2020 # 4-byte Folded Spill +; LA32-NEXT: .cfi_offset 1, -4 +; LA32-NEXT: .cfi_offset 22, -8 +; LA32-NEXT: .cfi_offset 31, -12 +; LA32-NEXT: addi.w $fp, $sp, 2032 +; LA32-NEXT: .cfi_def_cfa 22, 0 +; LA32-NEXT: lu12i.w $a2, 15 +; LA32-NEXT: sub.w $t1, $sp, $a2 +; LA32-NEXT: lu12i.w $t2, 1 +; LA32-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1 +; LA32-NEXT: sub.w $sp, $sp, $t2 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: bne $sp, $t1, .LBB11_1 +; LA32-NEXT: # %bb.2: +; LA32-NEXT: addi.w $sp, $sp, -2048 +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: bstrins.w $sp, $zero, 14, 0 +; LA32-NEXT: move $s8, $sp +; LA32-NEXT: slli.w $a1, $a1, 2 
+; LA32-NEXT: lu12i.w $a2, 8 +; LA32-NEXT: add.w $a2, $s8, $a2 +; LA32-NEXT: add.w $a1, $a2, $a1 +; LA32-NEXT: ori $a2, $zero, 1 +; LA32-NEXT: st.w $a2, $a1, 0 +; LA32-NEXT: addi.w $a0, $a0, 15 +; LA32-NEXT: addi.w $a1, $zero, -16 +; LA32-NEXT: and $a0, $a0, $a1 +; LA32-NEXT: sub.w $a0, $sp, $a0 +; LA32-NEXT: addi.w $a1, $zero, -2048 +; LA32-NEXT: and $a0, $a0, $a1 +; LA32-NEXT: lu12i.w $a1, 1 +; LA32-NEXT: .LBB11_3: # =>This Inner Loop Header: Depth=1 +; LA32-NEXT: sub.w $sp, $sp, $a1 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: bltu $a0, $sp, .LBB11_3 +; LA32-NEXT: # %bb.4: +; LA32-NEXT: move $sp, $a0 +; LA32-NEXT: ld.b $zero, $a0, 0 +; LA32-NEXT: lu12i.w $a0, 16 +; LA32-NEXT: sub.w $sp, $fp, $a0 +; LA32-NEXT: lu12i.w $a0, 15 +; LA32-NEXT: ori $a0, $a0, 2064 +; LA32-NEXT: add.w $sp, $sp, $a0 +; LA32-NEXT: ld.w $s8, $sp, 2020 # 4-byte Folded Reload +; LA32-NEXT: ld.w $fp, $sp, 2024 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 2028 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 2032 +; LA32-NEXT: ret + %a = alloca i32, i32 4096, align 32768 + %b = getelementptr inbounds i32, ptr %a, i64 %i + store volatile i32 1, ptr %b + %1 = zext i32 %vla_size to i64 + %vla = alloca i8, i64 %1, align 2048 + %2 = load volatile i8, ptr %vla, align 2048 + ret void +} + +attributes #0 = { "probe-stack"="inline-asm" } diff --git a/llvm/test/CodeGen/LoongArch/stack-probing-dynamic-nonentry.ll b/llvm/test/CodeGen/LoongArch/stack-probing-dynamic-nonentry.ll new file mode 100644 index 0000000000000..9f6f0d5a53295 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/stack-probing-dynamic-nonentry.ll @@ -0,0 +1,109 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=loongarch64 -O2 < %s -verify-machineinstrs | FileCheck %s -check-prefix=LA64 +; RUN: llc -mtriple=loongarch32 -O2 < %s -verify-machineinstrs | FileCheck %s -check-prefix=LA32 + +; Test that very large outgoing call frames in functions with 
variable-sized +; objects get proper stack probing. The outgoing args are large enough to force +; the PROBED_STACKALLOC path, which must be expanded in a non-entry block. + +define void @f(i64 %n) #0 { +; LA64-LABEL: f: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: .cfi_def_cfa_offset 16 +; LA64-NEXT: st.d $zero, $sp, 0 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill +; LA64-NEXT: .cfi_offset 1, -8 +; LA64-NEXT: .cfi_offset 22, -16 +; LA64-NEXT: addi.d $fp, $sp, 16 +; LA64-NEXT: .cfi_def_cfa 22, 0 +; LA64-NEXT: slli.d $a0, $a0, 2 +; LA64-NEXT: addi.d $a0, $a0, 15 +; LA64-NEXT: bstrins.d $a0, $zero, 3, 0 +; LA64-NEXT: sub.d $a0, $sp, $a0 +; LA64-NEXT: lu12i.w $a1, 1 +; LA64-NEXT: .LBB0_1: # %entry +; LA64-NEXT: # =>This Inner Loop Header: Depth=1 +; LA64-NEXT: sub.d $sp, $sp, $a1 +; LA64-NEXT: st.d $zero, $sp, 0 +; LA64-NEXT: bltu $a0, $sp, .LBB0_1 +; LA64-NEXT: # %bb.2: # %entry +; LA64-NEXT: move $sp, $a0 +; LA64-NEXT: lu12i.w $a1, 5 +; LA64-NEXT: sub.d $t1, $sp, $a1 +; LA64-NEXT: lu12i.w $t2, 1 +; LA64-NEXT: .LBB0_3: # %entry +; LA64-NEXT: # =>This Inner Loop Header: Depth=1 +; LA64-NEXT: sub.d $sp, $sp, $t2 +; LA64-NEXT: st.d $zero, $sp, 0 +; LA64-NEXT: bne $sp, $t1, .LBB0_3 +; LA64-NEXT: # %bb.4: # %entry +; LA64-NEXT: addi.d $sp, $sp, -2048 +; LA64-NEXT: addi.d $sp, $sp, -1424 +; LA64-NEXT: st.d $zero, $sp, 0 +; LA64-NEXT: pcaddu18i $ra, %call36(g) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: lu12i.w $a0, 5 +; LA64-NEXT: ori $a0, $a0, 3472 +; LA64-NEXT: add.d $sp, $sp, $a0 +; LA64-NEXT: addi.d $sp, $fp, -16 +; LA64-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +; +; LA32-LABEL: f: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: .cfi_def_cfa_offset 16 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; 
LA32-NEXT: st.w $fp, $sp, 8 # 4-byte Folded Spill +; LA32-NEXT: .cfi_offset 1, -4 +; LA32-NEXT: .cfi_offset 22, -8 +; LA32-NEXT: addi.w $fp, $sp, 16 +; LA32-NEXT: .cfi_def_cfa 22, 0 +; LA32-NEXT: slli.w $a0, $a0, 2 +; LA32-NEXT: addi.w $a0, $a0, 15 +; LA32-NEXT: addi.w $a1, $zero, -16 +; LA32-NEXT: and $a0, $a0, $a1 +; LA32-NEXT: sub.w $a0, $sp, $a0 +; LA32-NEXT: lu12i.w $a1, 1 +; LA32-NEXT: .LBB0_1: # %entry +; LA32-NEXT: # =>This Inner Loop Header: Depth=1 +; LA32-NEXT: sub.w $sp, $sp, $a1 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: bltu $a0, $sp, .LBB0_1 +; LA32-NEXT: # %bb.2: # %entry +; LA32-NEXT: move $sp, $a0 +; LA32-NEXT: lu12i.w $a1, 5 +; LA32-NEXT: sub.w $t1, $sp, $a1 +; LA32-NEXT: lu12i.w $t2, 1 +; LA32-NEXT: .LBB0_3: # %entry +; LA32-NEXT: # =>This Inner Loop Header: Depth=1 +; LA32-NEXT: sub.w $sp, $sp, $t2 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: bne $sp, $t1, .LBB0_3 +; LA32-NEXT: # %bb.4: # %entry +; LA32-NEXT: addi.w $sp, $sp, -2048 +; LA32-NEXT: addi.w $sp, $sp, -1456 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: bl g +; LA32-NEXT: lu12i.w $a0, 5 +; LA32-NEXT: ori $a0, $a0, 3504 +; LA32-NEXT: add.w $sp, $sp, $a0 +; LA32-NEXT: addi.w $sp, $fp, -16 +; LA32-NEXT: ld.w $fp, $sp, 8 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +entry: + %v = alloca i32, i64 %n + call void @g(ptr %v, [3000 x i64] poison) + ret void +} + +declare void @g(ptr, [3000 x i64]) + +attributes #0 = { "probe-stack"="inline-asm" } diff --git a/llvm/test/CodeGen/LoongArch/stack-probing-dynamic.ll b/llvm/test/CodeGen/LoongArch/stack-probing-dynamic.ll new file mode 100644 index 0000000000000..fd928fe826340 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/stack-probing-dynamic.ll @@ -0,0 +1,479 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=loongarch64 -O2 < %s | FileCheck %s -check-prefix=LA64 +; RUN: llc -mtriple=loongarch32 -O2 < %s 
| FileCheck %s -check-prefix=LA32 + +; From llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll + +; Dynamically-sized allocation, needs a loop which can handle any size at +; runtime. The final iteration of the loop will temporarily put SP below the +; target address, but this doesn't break any of the ABI constraints on the +; stack. Note that the loop stores before it compares, so the last probe may +; land below the target SP value (by less than one probe size). +define void @dynamic(i64 %size, ptr %out) #0 { +; +; LA64-LABEL: dynamic: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: .cfi_def_cfa_offset 16 +; LA64-NEXT: st.d $zero, $sp, 0 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill +; LA64-NEXT: .cfi_offset 1, -8 +; LA64-NEXT: .cfi_offset 22, -16 +; LA64-NEXT: addi.d $fp, $sp, 16 +; LA64-NEXT: .cfi_def_cfa 22, 0 +; LA64-NEXT: addi.d $a0, $a0, 15 +; LA64-NEXT: bstrins.d $a0, $zero, 3, 0 +; LA64-NEXT: sub.d $a0, $sp, $a0 +; LA64-NEXT: lu12i.w $a2, 1 +; LA64-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 +; LA64-NEXT: sub.d $sp, $sp, $a2 +; LA64-NEXT: st.d $zero, $sp, 0 +; LA64-NEXT: bltu $a0, $sp, .LBB0_1 +; LA64-NEXT: # %bb.2: +; LA64-NEXT: move $sp, $a0 +; LA64-NEXT: st.d $a0, $a1, 0 +; LA64-NEXT: addi.d $sp, $fp, -16 +; LA64-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +; +; LA32-LABEL: dynamic: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: .cfi_def_cfa_offset 16 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 8 # 4-byte Folded Spill +; LA32-NEXT: .cfi_offset 1, -4 +; LA32-NEXT: .cfi_offset 22, -8 +; LA32-NEXT: addi.w $fp, $sp, 16 +; LA32-NEXT: .cfi_def_cfa 22, 0 +; LA32-NEXT: addi.w $a0, $a0, 15 +; LA32-NEXT: addi.w $a1, $zero, -16 +; LA32-NEXT: and $a0, $a0, $a1 +; LA32-NEXT: sub.w $a0, $sp, $a0 +; LA32-NEXT: lu12i.w $a1, 1 +; LA32-NEXT: .LBB0_1: # =>This Inner Loop 
Header: Depth=1 +; LA32-NEXT: sub.w $sp, $sp, $a1 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: bltu $a0, $sp, .LBB0_1 +; LA32-NEXT: # %bb.2: +; LA32-NEXT: move $sp, $a0 +; LA32-NEXT: st.w $a0, $a2, 0 +; LA32-NEXT: addi.w $sp, $fp, -16 +; LA32-NEXT: ld.w $fp, $sp, 8 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret + %v = alloca i8, i64 %size, align 1 + store ptr %v, ptr %out, align 8 + ret void +} + +; This function has a fixed-size stack slot and a dynamic one. The fixed size +; slot isn't large enough that we would normally probe it, but we need to do so +; here otherwise the gap between the CSR save and the first probe of the +; dynamic allocation could be too far apart when the size of the dynamic +; allocation is close to the guard size. +define void @dynamic_fixed(i64 %size, ptr %out1, ptr %out2) #0 { +; +; LA64-LABEL: dynamic_fixed: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -96 +; LA64-NEXT: .cfi_def_cfa_offset 96 +; LA64-NEXT: st.d $zero, $sp, 0 +; LA64-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill +; LA64-NEXT: .cfi_offset 1, -8 +; LA64-NEXT: .cfi_offset 22, -16 +; LA64-NEXT: addi.d $fp, $sp, 96 +; LA64-NEXT: .cfi_def_cfa 22, 0 +; LA64-NEXT: addi.d $a3, $fp, -88 +; LA64-NEXT: st.d $a3, $a1, 0 +; LA64-NEXT: addi.d $a0, $a0, 15 +; LA64-NEXT: bstrins.d $a0, $zero, 3, 0 +; LA64-NEXT: sub.d $a0, $sp, $a0 +; LA64-NEXT: lu12i.w $a1, 1 +; LA64-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1 +; LA64-NEXT: sub.d $sp, $sp, $a1 +; LA64-NEXT: st.d $zero, $sp, 0 +; LA64-NEXT: bltu $a0, $sp, .LBB1_1 +; LA64-NEXT: # %bb.2: +; LA64-NEXT: move $sp, $a0 +; LA64-NEXT: st.d $a0, $a2, 0 +; LA64-NEXT: addi.d $sp, $fp, -96 +; LA64-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 96 +; LA64-NEXT: ret +; +; LA32-LABEL: dynamic_fixed: +; LA32: # %bb.0: +; 
LA32-NEXT: addi.w $sp, $sp, -80 +; LA32-NEXT: .cfi_def_cfa_offset 80 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: st.w $ra, $sp, 76 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 72 # 4-byte Folded Spill +; LA32-NEXT: .cfi_offset 1, -4 +; LA32-NEXT: .cfi_offset 22, -8 +; LA32-NEXT: addi.w $fp, $sp, 80 +; LA32-NEXT: .cfi_def_cfa 22, 0 +; LA32-NEXT: addi.w $a1, $fp, -72 +; LA32-NEXT: st.w $a1, $a2, 0 +; LA32-NEXT: addi.w $a0, $a0, 15 +; LA32-NEXT: addi.w $a1, $zero, -16 +; LA32-NEXT: and $a0, $a0, $a1 +; LA32-NEXT: sub.w $a0, $sp, $a0 +; LA32-NEXT: lu12i.w $a1, 1 +; LA32-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1 +; LA32-NEXT: sub.w $sp, $sp, $a1 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: bltu $a0, $sp, .LBB1_1 +; LA32-NEXT: # %bb.2: +; LA32-NEXT: move $sp, $a0 +; LA32-NEXT: st.w $a0, $a3, 0 +; LA32-NEXT: addi.w $sp, $fp, -80 +; LA32-NEXT: ld.w $fp, $sp, 72 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 76 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 80 +; LA32-NEXT: ret + %v1 = alloca i8, i64 64, align 1 + store ptr %v1, ptr %out1, align 8 + %v2 = alloca i8, i64 %size, align 1 + store ptr %v2, ptr %out2, align 8 + ret void +} + +; Dynamic allocation, with an alignment requirement greater than the alignment +; of SP. Done by ANDing the target SP with a constant to align it down, then +; doing the loop as normal. Note that we also re-align the stack in the prolog, +; which isn't actually needed because the only aligned allocations are dynamic, +; this is done even without stack probing. 
+define void @dynamic_align_64(i64 %size, ptr %out) #0 { +; +; LA64-LABEL: dynamic_align_64: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -128 +; LA64-NEXT: .cfi_def_cfa_offset 128 +; LA64-NEXT: st.d $zero, $sp, 0 +; LA64-NEXT: st.d $ra, $sp, 120 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 112 # 8-byte Folded Spill +; LA64-NEXT: st.d $s8, $sp, 104 # 8-byte Folded Spill +; LA64-NEXT: .cfi_offset 1, -8 +; LA64-NEXT: .cfi_offset 22, -16 +; LA64-NEXT: .cfi_offset 31, -24 +; LA64-NEXT: addi.d $fp, $sp, 128 +; LA64-NEXT: .cfi_def_cfa 22, 0 +; LA64-NEXT: bstrins.d $sp, $zero, 5, 0 +; LA64-NEXT: move $s8, $sp +; LA64-NEXT: addi.d $a0, $a0, 15 +; LA64-NEXT: bstrins.d $a0, $zero, 3, 0 +; LA64-NEXT: sub.d $a0, $sp, $a0 +; LA64-NEXT: bstrins.d $a0, $zero, 5, 0 +; LA64-NEXT: lu12i.w $a2, 1 +; LA64-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 +; LA64-NEXT: sub.d $sp, $sp, $a2 +; LA64-NEXT: st.d $zero, $sp, 0 +; LA64-NEXT: bltu $a0, $sp, .LBB2_1 +; LA64-NEXT: # %bb.2: +; LA64-NEXT: move $sp, $a0 +; LA64-NEXT: st.d $a0, $a1, 0 +; LA64-NEXT: addi.d $sp, $fp, -128 +; LA64-NEXT: ld.d $s8, $sp, 104 # 8-byte Folded Reload +; LA64-NEXT: ld.d $fp, $sp, 112 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 120 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 128 +; LA64-NEXT: ret +; +; LA32-LABEL: dynamic_align_64: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $sp, $sp, -64 +; LA32-NEXT: .cfi_def_cfa_offset 64 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: st.w $ra, $sp, 60 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 56 # 4-byte Folded Spill +; LA32-NEXT: st.w $s8, $sp, 52 # 4-byte Folded Spill +; LA32-NEXT: .cfi_offset 1, -4 +; LA32-NEXT: .cfi_offset 22, -8 +; LA32-NEXT: .cfi_offset 31, -12 +; LA32-NEXT: addi.w $fp, $sp, 64 +; LA32-NEXT: .cfi_def_cfa 22, 0 +; LA32-NEXT: bstrins.w $sp, $zero, 5, 0 +; LA32-NEXT: move $s8, $sp +; LA32-NEXT: addi.w $a0, $a0, 15 +; LA32-NEXT: addi.w $a1, $zero, -16 +; LA32-NEXT: and $a0, $a0, $a1 +; LA32-NEXT: sub.w $a0, $sp, $a0 +; 
LA32-NEXT: addi.w $a1, $zero, -64 +; LA32-NEXT: and $a0, $a0, $a1 +; LA32-NEXT: lu12i.w $a1, 1 +; LA32-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 +; LA32-NEXT: sub.w $sp, $sp, $a1 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: bltu $a0, $sp, .LBB2_1 +; LA32-NEXT: # %bb.2: +; LA32-NEXT: move $sp, $a0 +; LA32-NEXT: st.w $a0, $a2, 0 +; LA32-NEXT: addi.w $sp, $fp, -64 +; LA32-NEXT: ld.w $s8, $sp, 52 # 4-byte Folded Reload +; LA32-NEXT: ld.w $fp, $sp, 56 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 60 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 64 +; LA32-NEXT: ret + %v = alloca i8, i64 %size, align 64 + store ptr %v, ptr %out, align 8 + ret void +} + +; Dynamic allocation, with an alignment greater than the stack guard size. The +; only difference to the dynamic allocation is the constant used for aligning +; the target SP, the loop will probe the whole allocation without needing to +; know about the alignment padding. +define void @dynamic_align_8192(i64 %size, ptr %out) #0 { +; +; LA64-LABEL: dynamic_align_8192: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -2032 +; LA64-NEXT: .cfi_def_cfa_offset 2032 +; LA64-NEXT: st.d $zero, $sp, 0 +; LA64-NEXT: st.d $ra, $sp, 2024 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 2016 # 8-byte Folded Spill +; LA64-NEXT: st.d $s8, $sp, 2008 # 8-byte Folded Spill +; LA64-NEXT: .cfi_offset 1, -8 +; LA64-NEXT: .cfi_offset 22, -16 +; LA64-NEXT: .cfi_offset 31, -24 +; LA64-NEXT: addi.d $fp, $sp, 2032 +; LA64-NEXT: .cfi_def_cfa 22, 0 +; LA64-NEXT: lu12i.w $a2, 1 +; LA64-NEXT: sub.d $sp, $sp, $a2 +; LA64-NEXT: st.d $zero, $sp, 0 +; LA64-NEXT: sub.d $sp, $sp, $a2 +; LA64-NEXT: st.d $zero, $sp, 0 +; LA64-NEXT: sub.d $sp, $sp, $a2 +; LA64-NEXT: st.d $zero, $sp, 0 +; LA64-NEXT: addi.d $sp, $sp, -2048 +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $zero, $sp, 0 +; LA64-NEXT: bstrins.d $sp, $zero, 12, 0 +; LA64-NEXT: move $s8, $sp +; LA64-NEXT: addi.d $a0, $a0, 15 +; LA64-NEXT: bstrins.d $a0, $zero, 3, 0 +; 
LA64-NEXT: sub.d $a0, $sp, $a0 +; LA64-NEXT: bstrins.d $a0, $zero, 12, 0 +; LA64-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 +; LA64-NEXT: sub.d $sp, $sp, $a2 +; LA64-NEXT: st.d $zero, $sp, 0 +; LA64-NEXT: bltu $a0, $sp, .LBB3_1 +; LA64-NEXT: # %bb.2: +; LA64-NEXT: move $sp, $a0 +; LA64-NEXT: st.d $a0, $a1, 0 +; LA64-NEXT: lu12i.w $a0, 4 +; LA64-NEXT: sub.d $sp, $fp, $a0 +; LA64-NEXT: lu12i.w $a0, 3 +; LA64-NEXT: ori $a0, $a0, 2064 +; LA64-NEXT: add.d $sp, $sp, $a0 +; LA64-NEXT: ld.d $s8, $sp, 2008 # 8-byte Folded Reload +; LA64-NEXT: ld.d $fp, $sp, 2016 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 2024 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 2032 +; LA64-NEXT: ret +; +; LA32-LABEL: dynamic_align_8192: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $sp, $sp, -2032 +; LA32-NEXT: .cfi_def_cfa_offset 2032 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: st.w $ra, $sp, 2028 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 2024 # 4-byte Folded Spill +; LA32-NEXT: st.w $s8, $sp, 2020 # 4-byte Folded Spill +; LA32-NEXT: .cfi_offset 1, -4 +; LA32-NEXT: .cfi_offset 22, -8 +; LA32-NEXT: .cfi_offset 31, -12 +; LA32-NEXT: addi.w $fp, $sp, 2032 +; LA32-NEXT: .cfi_def_cfa 22, 0 +; LA32-NEXT: lu12i.w $a1, 1 +; LA32-NEXT: sub.w $sp, $sp, $a1 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: sub.w $sp, $sp, $a1 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: sub.w $sp, $sp, $a1 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: addi.w $sp, $sp, -2048 +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: bstrins.w $sp, $zero, 12, 0 +; LA32-NEXT: move $s8, $sp +; LA32-NEXT: addi.w $a0, $a0, 15 +; LA32-NEXT: addi.w $a1, $zero, -16 +; LA32-NEXT: and $a0, $a0, $a1 +; LA32-NEXT: sub.w $a0, $sp, $a0 +; LA32-NEXT: lu12i.w $a1, -2 +; LA32-NEXT: and $a0, $a0, $a1 +; LA32-NEXT: lu12i.w $a1, 1 +; LA32-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 +; LA32-NEXT: sub.w $sp, $sp, $a1 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: bltu $a0, $sp, .LBB3_1 
+; LA32-NEXT: # %bb.2: +; LA32-NEXT: move $sp, $a0 +; LA32-NEXT: st.w $a0, $a2, 0 +; LA32-NEXT: lu12i.w $a0, 4 +; LA32-NEXT: sub.w $sp, $fp, $a0 +; LA32-NEXT: lu12i.w $a0, 3 +; LA32-NEXT: ori $a0, $a0, 2064 +; LA32-NEXT: add.w $sp, $sp, $a0 +; LA32-NEXT: ld.w $s8, $sp, 2020 # 4-byte Folded Reload +; LA32-NEXT: ld.w $fp, $sp, 2024 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 2028 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 2032 +; LA32-NEXT: ret + %v = alloca i8, i64 %size, align 8192 + store ptr %v, ptr %out, align 8 + ret void +} + +; If a function has variable-sized stack objects, then any function calls which +; need to pass arguments on the stack must allocate the stack space for them +; dynamically, to ensure they are at the bottom of the frame. +define void @no_reserved_call_frame(i64 %n) #0 { +; +; LA64-LABEL: no_reserved_call_frame: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: .cfi_def_cfa_offset 16 +; LA64-NEXT: st.d $zero, $sp, 0 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill +; LA64-NEXT: .cfi_offset 1, -8 +; LA64-NEXT: .cfi_offset 22, -16 +; LA64-NEXT: addi.d $fp, $sp, 16 +; LA64-NEXT: .cfi_def_cfa 22, 0 +; LA64-NEXT: slli.d $a0, $a0, 2 +; LA64-NEXT: addi.d $a0, $a0, 15 +; LA64-NEXT: bstrins.d $a0, $zero, 3, 0 +; LA64-NEXT: sub.d $a0, $sp, $a0 +; LA64-NEXT: lu12i.w $a1, 1 +; LA64-NEXT: .LBB4_1: # %entry +; LA64-NEXT: # =>This Inner Loop Header: Depth=1 +; LA64-NEXT: sub.d $sp, $sp, $a1 +; LA64-NEXT: st.d $zero, $sp, 0 +; LA64-NEXT: bltu $a0, $sp, .LBB4_1 +; LA64-NEXT: # %bb.2: # %entry +; LA64-NEXT: move $sp, $a0 +; LA64-NEXT: lu12i.w $a1, 1 +; LA64-NEXT: sub.d $sp, $sp, $a1 +; LA64-NEXT: st.d $zero, $sp, 0 +; LA64-NEXT: pcaddu18i $ra, %call36(callee_stack_args) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: lu12i.w $a0, 1 +; LA64-NEXT: add.d $sp, $sp, $a0 +; LA64-NEXT: addi.d $sp, $fp, -16 +; LA64-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload +; 
LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret +; +; LA32-LABEL: no_reserved_call_frame: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: .cfi_def_cfa_offset 16 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 8 # 4-byte Folded Spill +; LA32-NEXT: .cfi_offset 1, -4 +; LA32-NEXT: .cfi_offset 22, -8 +; LA32-NEXT: addi.w $fp, $sp, 16 +; LA32-NEXT: .cfi_def_cfa 22, 0 +; LA32-NEXT: slli.w $a0, $a0, 2 +; LA32-NEXT: addi.w $a0, $a0, 15 +; LA32-NEXT: addi.w $a1, $zero, -16 +; LA32-NEXT: and $a0, $a0, $a1 +; LA32-NEXT: sub.w $a0, $sp, $a0 +; LA32-NEXT: lu12i.w $a1, 1 +; LA32-NEXT: .LBB4_1: # %entry +; LA32-NEXT: # =>This Inner Loop Header: Depth=1 +; LA32-NEXT: sub.w $sp, $sp, $a1 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: bltu $a0, $sp, .LBB4_1 +; LA32-NEXT: # %bb.2: # %entry +; LA32-NEXT: move $sp, $a0 +; LA32-NEXT: lu12i.w $a1, 1 +; LA32-NEXT: sub.w $sp, $sp, $a1 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: addi.w $sp, $sp, -32 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: bl callee_stack_args +; LA32-NEXT: lu12i.w $a0, 1 +; LA32-NEXT: ori $a0, $a0, 32 +; LA32-NEXT: add.w $sp, $sp, $a0 +; LA32-NEXT: addi.w $sp, $fp, -16 +; LA32-NEXT: ld.w $fp, $sp, 8 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +entry: + %v = alloca i32, i64 %n + call void @callee_stack_args(ptr %v, [518 x i64] poison) + ret void +} + +; Same as above but without a variable-sized allocation, so the reserved call +; frame can be folded into the fixed-size allocation in the prologue. 
+define void @reserved_call_frame(i64 %n) #0 { +; +; LA64-LABEL: reserved_call_frame: +; LA64: # %bb.0: # %entry +; LA64-NEXT: addi.d $sp, $sp, -2032 +; LA64-NEXT: .cfi_def_cfa_offset 2032 +; LA64-NEXT: st.d $ra, $sp, 2024 # 8-byte Folded Spill +; LA64-NEXT: .cfi_offset 1, -8 +; LA64-NEXT: lu12i.w $a0, 1 +; LA64-NEXT: sub.d $sp, $sp, $a0 +; LA64-NEXT: st.d $zero, $sp, 0 +; LA64-NEXT: .cfi_def_cfa_offset 6128 +; LA64-NEXT: addi.d $sp, $sp, -48 +; LA64-NEXT: .cfi_def_cfa_offset 6176 +; LA64-NEXT: lu12i.w $a0, 1 +; LA64-NEXT: add.d $a0, $sp, $a0 +; LA64-NEXT: pcaddu18i $ra, %call36(callee_stack_args) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: lu12i.w $a0, 1 +; LA64-NEXT: ori $a0, $a0, 48 +; LA64-NEXT: add.d $sp, $sp, $a0 +; LA64-NEXT: ld.d $ra, $sp, 2024 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 2032 +; LA64-NEXT: ret +; +; LA32-LABEL: reserved_call_frame: +; LA32: # %bb.0: # %entry +; LA32-NEXT: addi.w $sp, $sp, -2032 +; LA32-NEXT: .cfi_def_cfa_offset 2032 +; LA32-NEXT: st.w $ra, $sp, 2028 # 4-byte Folded Spill +; LA32-NEXT: .cfi_offset 1, -4 +; LA32-NEXT: lu12i.w $a0, 1 +; LA32-NEXT: sub.w $sp, $sp, $a0 +; LA32-NEXT: st.w $zero, $sp, 0 +; LA32-NEXT: .cfi_def_cfa_offset 6128 +; LA32-NEXT: addi.w $sp, $sp, -80 +; LA32-NEXT: .cfi_def_cfa_offset 6208 +; LA32-NEXT: lu12i.w $a0, 1 +; LA32-NEXT: ori $a0, $a0, 36 +; LA32-NEXT: add.w $a0, $sp, $a0 +; LA32-NEXT: bl callee_stack_args +; LA32-NEXT: lu12i.w $a0, 1 +; LA32-NEXT: ori $a0, $a0, 80 +; LA32-NEXT: add.w $sp, $sp, $a0 +; LA32-NEXT: ld.w $ra, $sp, 2028 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 2032 +; LA32-NEXT: ret +entry: + %v = alloca i32, i64 518 + call void @callee_stack_args(ptr %v, [518 x i64] poison) + ret void +} + +declare void @callee_stack_args(ptr, [518 x i64]) + + +attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" } diff --git a/llvm/test/CodeGen/LoongArch/stack-probing-frame-setup.mir b/llvm/test/CodeGen/LoongArch/stack-probing-frame-setup.mir new 
file mode 100644 index 0000000000000..465ddd012b134 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/stack-probing-frame-setup.mir @@ -0,0 +1,185 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=loongarch64 -x mir -run-pass=prologepilog -verify-machineinstrs < %s | FileCheck %s --check-prefix=LA64 +# RUN: llc -mtriple=loongarch32 -x mir -run-pass=prologepilog -verify-machineinstrs < %s | FileCheck %s --check-prefix=LA32 + +--- | + ; Function Attrs: uwtable + define void @no_reserved_call_frame(i64 %n) #0 { + entry: + %v = alloca i32, i64 %n, align 4 + call void @callee_stack_args(ptr %v, [518 x i64] poison) + ret void + } + + declare void @callee_stack_args(ptr, [518 x i64]) + + attributes #0 = { uwtable "frame-pointer"="none" "probe-stack"="inline-asm" } +... +--- +name: no_reserved_call_frame +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +noPhis: true +isSSA: false +noVRegs: true +hasFakeUses: false +callsEHReturn: false +callsUnwindInit: false +hasEHContTarget: false +hasEHScopes: false +hasEHFunclets: false +isOutlined: false +debugInstrRef: false +failsVerification: false +tracksDebugUserValues: true +registers: [] +liveins: + - { reg: '$r4', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 8 + adjustsStack: true + hasCalls: true + framePointerPolicy: none + stackProtector: '' + functionContext: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + isCalleeSavedInfoValid: false + localFrameSize: 0 +fixedStack: [] +stack: + - { id: 0, name: v, type: variable-sized, offset: 0, alignment: 1, stack-id: default, + callee-saved-register: '', 
callee-saved-restored: true, debug-info-variable: '', + debug-info-expression: '', debug-info-location: '' } +entry_values: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: {} +body: | + ; LA64-LABEL: name: no_reserved_call_frame + ; LA64: bb.0.entry: + ; LA64-NEXT: successors: %bb.1(0x80000000) + ; LA64-NEXT: liveins: $r4, $r1 + ; LA64-NEXT: {{ $}} + ; LA64-NEXT: $r3 = frame-setup ADDI_D $r3, -16 + ; LA64-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 + ; LA64-NEXT: ST_D killed $r1, $r3, 8 :: (store (s64) into %stack.1) + ; LA64-NEXT: ST_D killed $r22, $r3, 0 :: (store (s64) into %stack.2) + ; LA64-NEXT: frame-setup CFI_INSTRUCTION offset $r1, -8 + ; LA64-NEXT: frame-setup CFI_INSTRUCTION offset $r22, -16 + ; LA64-NEXT: $r22 = frame-setup ADDI_D $r3, 16 + ; LA64-NEXT: frame-setup CFI_INSTRUCTION def_cfa $r22, 0 + ; LA64-NEXT: renamable $r4 = SLLI_D killed renamable $r4, 2 + ; LA64-NEXT: renamable $r4 = nuw ADDI_D killed renamable $r4, 15 + ; LA64-NEXT: renamable $r4 = BSTRINS_D killed renamable $r4, $r0, 3, 0 + ; LA64-NEXT: renamable $r4 = SUB_D $r3, killed renamable $r4 + ; LA64-NEXT: renamable $r5 = LU12I_W 1 + ; LA64-NEXT: {{ $}} + ; LA64-NEXT: bb.1.entry: + ; LA64-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; LA64-NEXT: liveins: $r4, $r5 + ; LA64-NEXT: {{ $}} + ; LA64-NEXT: $r3 = SUB_D $r3, renamable $r5 + ; LA64-NEXT: ST_D $r0, $r3, 0 + ; LA64-NEXT: BLTU renamable $r4, $r3, %bb.1 + ; LA64-NEXT: {{ $}} + ; LA64-NEXT: bb.2.entry: + ; LA64-NEXT: liveins: $r4 + ; LA64-NEXT: {{ $}} + ; LA64-NEXT: $r3 = OR renamable $r4, $r0 + ; LA64-NEXT: $r5 = LU12I_W 1 + ; LA64-NEXT: $r3 = SUB_D $r3, killed $r5 + ; LA64-NEXT: PseudoCALL_MEDIUM target-flags(loongarch-call-plt) @callee_stack_args, csr_ilp32d_lp64d, implicit-def dead $r1, implicit-def dead $r20, implicit $r4, implicit undef $r5, implicit undef $r6, implicit undef $r7, implicit undef $r8, implicit undef $r9, implicit undef $r10, implicit undef $r11, implicit-def 
$r3 + ; LA64-NEXT: $r4 = LU12I_W 1 + ; LA64-NEXT: $r3 = ADD_D $r3, killed $r4 + ; LA64-NEXT: $r3 = frame-destroy ADDI_D $r22, -16 + ; LA64-NEXT: $r22 = LD_D $r3, 0 :: (load (s64) from %stack.2) + ; LA64-NEXT: $r1 = LD_D $r3, 8 :: (load (s64) from %stack.1) + ; LA64-NEXT: $r3 = frame-destroy ADDI_D $r3, 16 + ; LA64-NEXT: PseudoRET + ; + ; LA32-LABEL: name: no_reserved_call_frame + ; LA32: bb.0.entry: + ; LA32-NEXT: successors: %bb.1(0x80000000) + ; LA32-NEXT: liveins: $r4, $r1 + ; LA32-NEXT: {{ $}} + ; LA32-NEXT: $r3 = frame-setup ADDI_W $r3, -16 + ; LA32-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 + ; LA32-NEXT: ST_W killed $r1, $r3, 12 :: (store (s32) into %stack.1) + ; LA32-NEXT: ST_W killed $r22, $r3, 8 :: (store (s32) into %stack.2) + ; LA32-NEXT: frame-setup CFI_INSTRUCTION offset $r1, -4 + ; LA32-NEXT: frame-setup CFI_INSTRUCTION offset $r22, -8 + ; LA32-NEXT: $r22 = frame-setup ADDI_W $r3, 16 + ; LA32-NEXT: frame-setup CFI_INSTRUCTION def_cfa $r22, 0 + ; LA32-NEXT: renamable $r4 = SLLI_D killed renamable $r4, 2 + ; LA32-NEXT: renamable $r4 = nuw ADDI_D killed renamable $r4, 15 + ; LA32-NEXT: renamable $r4 = BSTRINS_D killed renamable $r4, $r0, 3, 0 + ; LA32-NEXT: renamable $r4 = SUB_D $r3, killed renamable $r4 + ; LA32-NEXT: renamable $r5 = LU12I_W 1 + ; LA32-NEXT: {{ $}} + ; LA32-NEXT: bb.1.entry: + ; LA32-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; LA32-NEXT: liveins: $r4, $r5 + ; LA32-NEXT: {{ $}} + ; LA32-NEXT: $r3 = SUB_D $r3, renamable $r5 + ; LA32-NEXT: ST_D $r0, $r3, 0 + ; LA32-NEXT: BLTU renamable $r4, $r3, %bb.1 + ; LA32-NEXT: {{ $}} + ; LA32-NEXT: bb.2.entry: + ; LA32-NEXT: liveins: $r4 + ; LA32-NEXT: {{ $}} + ; LA32-NEXT: $r3 = OR renamable $r4, $r0 + ; LA32-NEXT: $r5 = LU12I_W 1 + ; LA32-NEXT: $r3 = SUB_W $r3, killed $r5 + ; LA32-NEXT: PseudoCALL_MEDIUM target-flags(loongarch-call-plt) @callee_stack_args, csr_ilp32d_lp64d, implicit-def dead $r1, implicit-def dead $r20, implicit $r4, implicit undef $r5, implicit undef $r6, 
implicit undef $r7, implicit undef $r8, implicit undef $r9, implicit undef $r10, implicit undef $r11, implicit-def $r3 + ; LA32-NEXT: $r4 = LU12I_W 1 + ; LA32-NEXT: $r3 = ADD_W $r3, killed $r4 + ; LA32-NEXT: $r3 = frame-destroy ADDI_W $r22, -16 + ; LA32-NEXT: $r22 = LD_W $r3, 8 :: (load (s32) from %stack.2) + ; LA32-NEXT: $r1 = LD_W $r3, 12 :: (load (s32) from %stack.1) + ; LA32-NEXT: $r3 = frame-destroy ADDI_W $r3, 16 + ; LA32-NEXT: PseudoRET + bb.0.entry: + successors: %bb.1(0x80000000) + liveins: $r4 + + renamable $r4 = SLLI_D killed renamable $r4, 2 + renamable $r4 = nuw ADDI_D killed renamable $r4, 15 + renamable $r4 = BSTRINS_D killed renamable $r4, $r0, 3, 0 + renamable $r4 = SUB_D $r3, killed renamable $r4 + renamable $r5 = LU12I_W 1 + + bb.1.entry: + successors: %bb.2(0x40000000), %bb.1(0x40000000) + liveins: $r4, $r5 + + $r3 = SUB_D $r3, renamable $r5 + ST_D $r0, $r3, 0 + BLTU renamable $r4, $r3, %bb.1 + + bb.2.entry: + liveins: $r4 + + $r3 = OR renamable $r4, $r0 + ADJCALLSTACKDOWN 4088, 0, implicit-def dead $r3, implicit $r3 + PseudoCALL_MEDIUM target-flags(loongarch-call-plt) @callee_stack_args, csr_ilp32d_lp64d, implicit-def dead $r1, implicit-def dead $r20, implicit $r4, implicit undef $r5, implicit undef $r6, implicit undef $r7, implicit undef $r8, implicit undef $r9, implicit undef $r10, implicit undef $r11, implicit-def $r3 + ADJCALLSTACKUP 4088, 0, implicit-def dead $r3, implicit $r3 + PseudoRET +... _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
