https://github.com/MacDue updated https://github.com/llvm/llvm-project/pull/179204
From 58afe59f34949a1903c0ece45cc45cea6d3278c8 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell <[email protected]> Date: Mon, 2 Feb 2026 11:31:16 +0000 Subject: [PATCH] [AArch64][SME] Simplify SME ABI pass (revert cross-block optimizations) These were enabled recently, but they're not quite ready for production use. For now, this patch removes these optimizations; we will look into adding them (or something similar) back at a later date. --- clang/test/CodeGen/AArch64/sme-remarks.c | 40 +-- llvm/lib/Target/AArch64/MachineSMEABIPass.cpp | 272 +----------------- llvm/test/CodeGen/AArch64/O0-pipeline.ll | 1 - llvm/test/CodeGen/AArch64/O3-pipeline.ll | 1 - .../machine-sme-abi-find-insert-pt.mir | 8 +- .../AArch64/sme-abi-save-call-remarks.ll | 86 ------ llvm/test/CodeGen/AArch64/sme-agnostic-za.ll | 82 ++---- .../CodeGen/AArch64/sme-za-control-flow.ll | 164 +++++------ .../test/CodeGen/AArch64/sme-za-exceptions.ll | 66 +++-- .../AArch64/sme-za-lazy-save-buffer.ll | 136 +++------ 10 files changed, 217 insertions(+), 639 deletions(-) diff --git a/clang/test/CodeGen/AArch64/sme-remarks.c b/clang/test/CodeGen/AArch64/sme-remarks.c index f7a1f33f3372d..4b926ac705522 100644 --- a/clang/test/CodeGen/AArch64/sme-remarks.c +++ b/clang/test/CodeGen/AArch64/sme-remarks.c @@ -3,39 +3,11 @@ // RUN: %clang_cc1 -triple aarch64 -target-feature +sme -mllvm -aarch64-new-sme-abi=false -Rpass-analysis=sme -verify=expected-sdag %s -S -o /dev/null // RUN: %clang_cc1 -triple aarch64 -target-feature +sme -Rpass-analysis=sme -verify %s -S -o /dev/null %s -void private_za_callee_a(); -void private_za_callee_b(); -void private_za_callee_c(); +void private_za_callee(); -void test_za_merge_paths(int a) __arm_inout("za") { - // expected-remark@+1 {{lazy save of ZA emitted in 'test_za_merge_paths'}} - if (a != 0) - // expected-sdag-remark@+2 {{call from 'test_za_merge_paths' to 'unknown callee' sets up a lazy save for ZA}} - // expected-remark@+1 {{call to 'private_za_callee_a' requires ZA save}} - private_za_callee_a(); - else - // expected-sdag-remark@+2 {{call from 'test_za_merge_paths' to 'unknown callee' sets up a lazy save for ZA}} - // expected-remark@+1 {{call to 'private_za_callee_b' requires ZA save}} - private_za_callee_b(); - // expected-sdag-remark@+3 {{call from 'test_za_merge_paths' to 'unknown callee' sets up a lazy save for ZA}} - /// The new lowering won't report this call as the save is already needed due - /// to the call to `private_za_callee_a/b()` calls on both paths to this call. - private_za_callee_c(); -} - -void test_lazy_save_multiple_paths(int a) __arm_inout("za") { - // expected-remark@+1 {{lazy save of ZA emitted in 'test_lazy_save_multiple_paths'}} - if (a != 0) - // expected-sdag-remark@+2 {{call from 'test_lazy_save_multiple_paths' to 'unknown callee' sets up a lazy save for ZA}} - // expected-remark@+1 {{call to 'private_za_callee_a' requires ZA save}} - private_za_callee_a(); - else { - // expected-sdag-remark@+2 {{call from 'test_lazy_save_multiple_paths' to 'unknown callee' sets up a lazy save for ZA}} - // expected-remark@+1 {{call to 'private_za_callee_b' requires ZA save}} - private_za_callee_b(); - // expected-sdag-remark@+3 {{call from 'test_lazy_save_multiple_paths' to 'unknown callee' sets up a lazy save for ZA}} - /// The new lowering won't report this call as the save is already needed - /// due to the call to `private_za_callee_b()`. 
- private_za_callee_c(); - } +void test_save_remarks(int a) __arm_inout("za") { + // expected-sdag-remark@+3 {{call from 'test_save_remarks' to 'unknown callee' sets up a lazy save for ZA}} + // expected-remark@+2 {{lazy save of ZA emitted in 'test_save_remarks'}} + // expected-remark@+1 {{call to 'private_za_callee' requires ZA save}} + private_za_callee(); } diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp index 39c2bff0ffca4..caf57a13acdf2 100644 --- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp +++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp @@ -10,48 +10,6 @@ // implementing the lazy (and agnostic) ZA state save schemes around calls. // //===----------------------------------------------------------------------===// -// -// This pass works by collecting instructions that require ZA to be in a -// specific state (e.g., "ACTIVE" or "SAVED") and inserting the necessary state -// transitions to ensure ZA is in the required state before instructions. State -// transitions represent actions such as setting up or restoring a lazy save. -// Certain points within a function may also have predefined states independent -// of any instructions, for example, a "shared_za" function is always entered -// and exited in the "ACTIVE" state. -// -// To handle ZA state across control flow, we make use of edge bundling. This -// assigns each block an "incoming" and "outgoing" edge bundle (representing -// incoming and outgoing edges). Initially, these are unique to each block; -// then, in the process of forming bundles, the outgoing bundle of a block is -// joined with the incoming bundle of all successors. The result is that each -// bundle can be assigned a single ZA state, which ensures the state required by -// all a blocks' successors is the same, and that each basic block will always -// be entered with the same ZA state. This eliminates the need for splitting -// edges to insert state transitions or "phi" nodes for ZA states. -// -// See below for a simple example of edge bundling. -// -// The following shows a conditionally executed basic block (BB1): -// -// if (cond) -// BB1 -// BB2 -// -// Initial Bundles Joined Bundles -// -// ┌──0──┐ ┌──0──┐ -// │ BB0 │ │ BB0 │ -// └──1──┘ └──1──┘ -// ├───────┐ ├───────┐ -// ▼ │ ▼ │ -// ┌──2──┐ │ ─────► ┌──1──┐ │ -// │ BB1 │ ▼ │ BB1 │ ▼ -// └──3──┘ ┌──4──┐ └──1──┘ ┌──1──┐ -// └───►4 BB2 │ └───►1 BB2 │ -// └──5──┘ └──2──┘ -// -// On the left are the initial per-block bundles, and on the right are the -// joined bundles (which are the result of the EdgeBundles analysis). #include "AArch64InstrInfo.h" #include "AArch64MachineFunctionInfo.h" @@ -59,7 +17,6 @@ #include "MCTargetDesc/AArch64AddressingModes.h" #include "llvm/ADT/BitmaskEnum.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/CodeGen/EdgeBundles.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -137,9 +94,6 @@ struct InstInfo { /// that do not require a ZA state are not recorded. struct BlockInfo { SmallVector<InstInfo> Insts; - ZAState FixedEntryState{ZAState::ANY}; - ZAState DesiredIncomingState{ZAState::ANY}; - ZAState DesiredOutgoingState{ZAState::ANY}; LiveRegs PhysLiveRegsAtEntry = LiveRegs::None; LiveRegs PhysLiveRegsAtExit = LiveRegs::None; }; @@ -201,23 +155,6 @@ class EmitContext { Register AgnosticZABufferPtr = AArch64::NoRegister; }; -/// Checks if \p State is a legal edge bundle state. 
For a state to be a legal -/// bundle state, it must be possible to transition from it to any other bundle -/// state without losing any ZA state. This is the case for ACTIVE/LOCAL_SAVED, -/// as you can transition between those states by saving/restoring ZA. The OFF -/// state would not be legal, as transitioning to it drops the content of ZA. -static bool isLegalEdgeBundleZAState(ZAState State) { - switch (State) { - case ZAState::ACTIVE: // ZA state within the accumulator/ZT0. - case ZAState::ACTIVE_ZT0_SAVED: // ZT0 is saved (ZA is active). - case ZAState::LOCAL_SAVED: // ZA state may be saved on the stack. - case ZAState::LOCAL_COMMITTED: // ZA state is saved on the stack. - return true; - default: - return false; - } -} - StringRef getZAStateString(ZAState State) { #define MAKE_CASE(V) \ case V: \ @@ -303,7 +240,6 @@ struct MachineSMEABI : public MachineFunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired<EdgeBundlesWrapperLegacy>(); AU.addRequired<MachineOptimizationRemarkEmitterPass>(); AU.addRequired<LibcallLoweringInfoWrapper>(); AU.addPreservedID(MachineLoopInfoID); @@ -315,21 +251,9 @@ struct MachineSMEABI : public MachineFunctionPass { /// within the machine function. FunctionInfo collectNeededZAStates(SMEAttrs SMEFnAttrs); - /// Assigns each edge bundle a ZA state based on the needed states of blocks - /// that have incoming or outgoing edges in that bundle. - SmallVector<ZAState> assignBundleZAStates(const EdgeBundles &Bundles, - const FunctionInfo &FnInfo); - /// Inserts code to handle changes between ZA states within the function. /// E.g., ACTIVE -> LOCAL_SAVED will insert code required to save ZA. - void insertStateChanges(EmitContext &, const FunctionInfo &FnInfo, - const EdgeBundles &Bundles, - ArrayRef<ZAState> BundleStates); - - /// Propagates desired states forwards (from predecessors -> successors) if - /// \p Forwards, otherwise, propagates backwards (from successors -> - /// predecessors). - void propagateDesiredStates(FunctionInfo &FnInfo, bool Forwards = true); + void insertStateChanges(EmitContext &, const FunctionInfo &FnInfo); void addSMELibCall(MachineInstrBuilder &MIB, RTLIB::Libcall LC, CallingConv::ID ExpectedCC); @@ -482,14 +406,6 @@ FunctionInfo MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) { for (MachineBasicBlock &MBB : *MF) { BlockInfo &Block = Blocks[MBB.getNumber()]; - if (MBB.isEntryBlock()) { - // Entry block: - Block.FixedEntryState = ZAState::ENTRY; - } else if (MBB.isEHPad()) { - // EH entry block: - Block.FixedEntryState = ZAState::LOCAL_COMMITTED; - } - LiveRegUnits LiveUnits(*TRI); LiveUnits.addLiveOuts(MBB); @@ -512,7 +428,6 @@ FunctionInfo MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) { auto [NeededState, InsertPt] = getInstNeededZAState(*TRI, MI, SMEFnAttrs); assert((InsertPt == MBBI || isCallStartOpcode(InsertPt->getOpcode())) && "Unexpected state change insertion point!"); - // TODO: Do something to avoid state changes where NZCV is live. if (MBBI == FirstTerminatorInsertPt) Block.PhysLiveRegsAtExit = PhysLiveRegs; if (MBBI == FirstNonPhiInsertPt) @@ -523,128 +438,12 @@ FunctionInfo MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) { // Reverse vector (as we had to iterate backwards for liveness). std::reverse(Block.Insts.begin(), Block.Insts.end()); - - // Record the desired states on entry/exit of this block. These are the - // states that would not incur a state transition. 
- if (!Block.Insts.empty()) { - Block.DesiredIncomingState = Block.Insts.front().NeededState; - Block.DesiredOutgoingState = Block.Insts.back().NeededState; - } } return FunctionInfo{std::move(Blocks), AfterSMEProloguePt, PhysLiveRegsAfterSMEPrologue}; } -void MachineSMEABI::propagateDesiredStates(FunctionInfo &FnInfo, - bool Forwards) { - // If `Forwards`, this propagates desired states from predecessors to - // successors, otherwise, this propagates states from successors to - // predecessors. - auto GetBlockState = [](BlockInfo &Block, bool Incoming) -> ZAState & { - return Incoming ? Block.DesiredIncomingState : Block.DesiredOutgoingState; - }; - - SmallVector<MachineBasicBlock *> Worklist; - for (auto [BlockID, BlockInfo] : enumerate(FnInfo.Blocks)) { - if (!isLegalEdgeBundleZAState(GetBlockState(BlockInfo, Forwards))) - Worklist.push_back(MF->getBlockNumbered(BlockID)); - } - - while (!Worklist.empty()) { - MachineBasicBlock *MBB = Worklist.pop_back_val(); - BlockInfo &Block = FnInfo.Blocks[MBB->getNumber()]; - - // Pick a legal edge bundle state that matches the majority of - // predecessors/successors. - int StateCounts[ZAState::NUM_ZA_STATE] = {0}; - for (MachineBasicBlock *PredOrSucc : - Forwards ? predecessors(MBB) : successors(MBB)) { - BlockInfo &PredOrSuccBlock = FnInfo.Blocks[PredOrSucc->getNumber()]; - ZAState ZAState = GetBlockState(PredOrSuccBlock, !Forwards); - if (isLegalEdgeBundleZAState(ZAState)) - StateCounts[ZAState]++; - } - - ZAState PropagatedState = ZAState(max_element(StateCounts) - StateCounts); - ZAState &CurrentState = GetBlockState(Block, Forwards); - if (PropagatedState != CurrentState) { - CurrentState = PropagatedState; - ZAState &OtherState = GetBlockState(Block, !Forwards); - // Propagate to the incoming/outgoing state if that is also "ANY". - if (OtherState == ZAState::ANY) - OtherState = PropagatedState; - // Push any successors/predecessors that may need updating to the - // worklist. - for (MachineBasicBlock *SuccOrPred : - Forwards ? successors(MBB) : predecessors(MBB)) { - BlockInfo &SuccOrPredBlock = FnInfo.Blocks[SuccOrPred->getNumber()]; - if (!isLegalEdgeBundleZAState(GetBlockState(SuccOrPredBlock, Forwards))) - Worklist.push_back(SuccOrPred); - } - } - } -} - -/// Assigns each edge bundle a ZA state based on the needed states of blocks -/// that have incoming or outgoing edges in that bundle. -SmallVector<ZAState> -MachineSMEABI::assignBundleZAStates(const EdgeBundles &Bundles, - const FunctionInfo &FnInfo) { - SmallVector<ZAState> BundleStates(Bundles.getNumBundles()); - for (unsigned I = 0, E = Bundles.getNumBundles(); I != E; ++I) { - LLVM_DEBUG(dbgs() << "Assigning ZA state for edge bundle: " << I << '\n'); - - // Attempt to assign a ZA state for this bundle that minimizes state - // transitions. Edges within loops are given a higher weight as we assume - // they will be executed more than once. - int EdgeStateCounts[ZAState::NUM_ZA_STATE] = {0}; - for (unsigned BlockID : Bundles.getBlocks(I)) { - LLVM_DEBUG(dbgs() << "- bb." 
<< BlockID); - - const BlockInfo &Block = FnInfo.Blocks[BlockID]; - bool InEdge = Bundles.getBundle(BlockID, /*Out=*/false) == I; - bool OutEdge = Bundles.getBundle(BlockID, /*Out=*/true) == I; - - bool LegalInEdge = - InEdge && isLegalEdgeBundleZAState(Block.DesiredIncomingState); - bool LegalOutEgde = - OutEdge && isLegalEdgeBundleZAState(Block.DesiredOutgoingState); - if (LegalInEdge) { - LLVM_DEBUG(dbgs() << " DesiredIncomingState: " - << getZAStateString(Block.DesiredIncomingState)); - EdgeStateCounts[Block.DesiredIncomingState]++; - } - if (LegalOutEgde) { - LLVM_DEBUG(dbgs() << " DesiredOutgoingState: " - << getZAStateString(Block.DesiredOutgoingState)); - EdgeStateCounts[Block.DesiredOutgoingState]++; - } - if (!LegalInEdge && !LegalOutEgde) - LLVM_DEBUG(dbgs() << " (no state preference)"); - LLVM_DEBUG(dbgs() << '\n'); - } - - ZAState BundleState = - ZAState(max_element(EdgeStateCounts) - EdgeStateCounts); - - if (BundleState == ZAState::ANY) - BundleState = ZAState::ACTIVE; - - LLVM_DEBUG({ - dbgs() << "Chosen ZA state: " << getZAStateString(BundleState) << '\n' - << "Edge counts:"; - for (auto [State, Count] : enumerate(EdgeStateCounts)) - dbgs() << " " << getZAStateString(ZAState(State)) << ": " << Count; - dbgs() << "\n\n"; - }); - - BundleStates[I] = BundleState; - } - - return BundleStates; -} - std::pair<MachineBasicBlock::iterator, LiveRegs> MachineSMEABI::findStateChangeInsertionPoint( MachineBasicBlock &MBB, const BlockInfo &Block, @@ -695,17 +494,15 @@ MachineSMEABI::findStateChangeInsertionPoint( } void MachineSMEABI::insertStateChanges(EmitContext &Context, - const FunctionInfo &FnInfo, - const EdgeBundles &Bundles, - ArrayRef<ZAState> BundleStates) { + const FunctionInfo &FnInfo) { + // TODO: Avoid redundant state transitions between blocks. for (MachineBasicBlock &MBB : *MF) { const BlockInfo &Block = FnInfo.Blocks[MBB.getNumber()]; - ZAState InState = BundleStates[Bundles.getBundle(MBB.getNumber(), - /*Out=*/false)]; - - ZAState CurrentState = Block.FixedEntryState; - if (CurrentState == ZAState::ANY) - CurrentState = InState; + ZAState CurrentState = ZAState::ACTIVE; + if (MBB.isEntryBlock()) + CurrentState = ZAState::ENTRY; + else if (MBB.isEHPad()) + CurrentState = ZAState::LOCAL_COMMITTED; for (auto &Inst : Block.Insts) { if (CurrentState != Inst.NeededState) { @@ -720,12 +517,10 @@ void MachineSMEABI::insertStateChanges(EmitContext &Context, if (MBB.succ_empty()) continue; - ZAState OutState = - BundleStates[Bundles.getBundle(MBB.getNumber(), /*Out=*/true)]; - if (CurrentState != OutState) { + if (CurrentState != ZAState::ACTIVE) { auto [InsertPt, PhysLiveRegs] = findStateChangeInsertionPoint(MBB, Block, Block.Insts.end()); - emitStateChange(Context, MBB, InsertPt, CurrentState, OutState, + emitStateChange(Context, MBB, InsertPt, CurrentState, ZAState::ACTIVE, PhysLiveRegs); } } @@ -1178,8 +973,7 @@ void MachineSMEABI::emitStateChange(EmitContext &Context, // TODO: Avoid setting up the save buffer if there's no transition to // LOCAL_SAVED. if (From == ZAState::ENTRY) { - assert(&MBB == &MBB.getParent()->front() && - "ENTRY state only valid in entry block"); + assert(MBB.isEntryBlock() && "ENTRY state only valid in entry block"); emitSMEPrologue(MBB, MBB.getFirstNonPHI()); if (To == ZAState::ACTIVE) return; // Nothing more to do (ZA is active after the prologue). 
@@ -1232,6 +1026,7 @@ void MachineSMEABI::emitStateChange(EmitContext &Context, case transitionFrom(ZAState::LOCAL_COMMITTED).to(ZAState::ACTIVE): case transitionFrom(ZAState::LOCAL_COMMITTED).to(ZAState::ACTIVE_ZT0_SAVED): case transitionFrom(ZAState::LOCAL_SAVED).to(ZAState::ACTIVE): + case transitionFrom(ZAState::LOCAL_SAVED).to(ZAState::ACTIVE_ZT0_SAVED): if (HasZAState) emitZARestore(Context, MBB, InsertPt, PhysLiveRegs); else @@ -1283,51 +1078,10 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) { TRI = Subtarget->getRegisterInfo(); MRI = &MF.getRegInfo(); - const EdgeBundles &Bundles = - getAnalysis<EdgeBundlesWrapperLegacy>().getEdgeBundles(); - FunctionInfo FnInfo = collectNeededZAStates(SMEFnAttrs); - if (OptLevel != CodeGenOptLevel::None) { - // Propagate desired states forward, then backwards. Most of the propagation - // should be done in the forward step, and backwards propagation is then - // used to fill in the gaps. Note: Doing both in one step can give poor - // results. For example, consider this subgraph: - // - // ┌─────┐ - // ┌─┤ BB0 ◄───┐ - // │ └─┬───┘ │ - // │ ┌─▼───◄──┐│ - // │ │ BB1 │ ││ - // │ └─┬┬──┘ ││ - // │ │└─────┘│ - // │ ┌─▼───┐ │ - // │ │ BB2 ├───┘ - // │ └─┬───┘ - // │ ┌─▼───┐ - // └─► BB3 │ - // └─────┘ - // - // If: - // - "BB0" and "BB2" (outer loop) has no state preference - // - "BB1" (inner loop) desires the ACTIVE state on entry/exit - // - "BB3" desires the LOCAL_SAVED state on entry - // - // If we propagate forwards first, ACTIVE is propagated from BB1 to BB2, - // then from BB2 to BB0. Which results in the inner and outer loops having - // the "ACTIVE" state. This avoids any state changes in the loops. - // - // If we propagate backwards first, we _could_ propagate LOCAL_SAVED from - // BB3 to BB0, which would result in a transition from ACTIVE -> LOCAL_SAVED - // in the outer loop. 
- for (bool Forwards : {true, false}) - propagateDesiredStates(FnInfo, Forwards); - } - - SmallVector<ZAState> BundleStates = assignBundleZAStates(Bundles, FnInfo); - EmitContext Context; - insertStateChanges(Context, FnInfo, Bundles, BundleStates); + insertStateChanges(Context, FnInfo); if (Context.needsSaveBuffer()) { if (FnInfo.AfterSMEProloguePt) { diff --git a/llvm/test/CodeGen/AArch64/O0-pipeline.ll b/llvm/test/CodeGen/AArch64/O0-pipeline.ll index cc0655b31d892..0e1e84cca20fc 100644 --- a/llvm/test/CodeGen/AArch64/O0-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O0-pipeline.ll @@ -54,7 +54,6 @@ ; CHECK-NEXT: AArch64 Instruction Selection ; CHECK-NEXT: Finalize ISel and expand pseudo-instructions ; CHECK-NEXT: Local Stack Slot Allocation -; CHECK-NEXT: Bundle Machine CFG Edges ; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Machine Optimization Remark Emitter ; CHECK-NEXT: Machine SME ABI pass diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll index 472f1f616c600..299ae6ffae2d3 100644 --- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll @@ -139,7 +139,6 @@ ; CHECK-NEXT: MachineDominator Tree Construction ; CHECK-NEXT: AArch64 Local Dynamic TLS Access Clean-up ; CHECK-NEXT: Finalize ISel and expand pseudo-instructions -; CHECK-NEXT: Bundle Machine CFG Edges ; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: Machine Optimization Remark Emitter ; CHECK-NEXT: Machine SME ABI pass diff --git a/llvm/test/CodeGen/AArch64/machine-sme-abi-find-insert-pt.mir b/llvm/test/CodeGen/AArch64/machine-sme-abi-find-insert-pt.mir index ed768dec77998..5b561743eed80 100644 --- a/llvm/test/CodeGen/AArch64/machine-sme-abi-find-insert-pt.mir +++ b/llvm/test/CodeGen/AArch64/machine-sme-abi-find-insert-pt.mir @@ -121,10 +121,6 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $sp ; CHECK-NEXT: $nzcv = IMPLICIT_DEF ; CHECK-NEXT: $zab0 = IMPLICIT_DEF - ; CHECK-NEXT: [[MRS:%[0-9]+]]:gpr64 = MRS 55824, implicit-def $nzcv, implicit $nzcv - ; CHECK-NEXT: $x0 = COPY [[COPY1]] - ; CHECK-NEXT: BL &__arm_sme_save, csr_aarch64_sme_abi_support_routines_preservemost_from_x1, implicit-def $lr, implicit $sp, implicit $x0 - ; CHECK-NEXT: MSR 55824, [[MRS]], implicit-def $nzcv ; CHECK-NEXT: Bcc 2, %bb.1, implicit $nzcv ; CHECK-NEXT: B %bb.2 ; CHECK-NEXT: {{ $}} @@ -132,6 +128,8 @@ body: | ; CHECK-NEXT: liveins: $nzcv ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: FAKE_USE $nzcv + ; CHECK-NEXT: $x0 = COPY [[COPY1]] + ; CHECK-NEXT: BL &__arm_sme_save, csr_aarch64_sme_abi_support_routines_preservemost_from_x1, implicit-def $lr, implicit $sp, implicit $x0 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp ; CHECK-NEXT: RequiresZASavePseudo ; CHECK-NEXT: BL @clobber, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp @@ -141,6 +139,8 @@ body: | ; CHECK-NEXT: RET_ReallyLR ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: $x0 = COPY [[COPY1]] + ; CHECK-NEXT: BL &__arm_sme_save, csr_aarch64_sme_abi_support_routines_preservemost_from_x1, implicit-def $lr, implicit $sp, implicit $x0 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp ; CHECK-NEXT: RequiresZASavePseudo ; CHECK-NEXT: BL @clobber, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp diff --git a/llvm/test/CodeGen/AArch64/sme-abi-save-call-remarks.ll b/llvm/test/CodeGen/AArch64/sme-abi-save-call-remarks.ll index c3c76e3e803d0..5b75c8e755130 100644 --- 
a/llvm/test/CodeGen/AArch64/sme-abi-save-call-remarks.ll +++ b/llvm/test/CodeGen/AArch64/sme-abi-save-call-remarks.ll @@ -40,89 +40,3 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inou %res = call float @llvm.cos.f32(float %a) ret float %res } - -define void @test_lazy_save_multiple_paths(i1 %a) "aarch64_inout_za" { -; CHECK-SDAG: remark: <unknown>:0:0: call from 'test_lazy_save_multiple_paths' to 'private_za_callee_a' sets up a lazy save for ZA -; CHECK-SDAG: remark: <unknown>:0:0: call from 'test_lazy_save_multiple_paths' to 'private_za_callee_b' sets up a lazy save for ZA -; CHECK-SDAG: remark: <unknown>:0:0: call from 'test_lazy_save_multiple_paths' to 'private_za_callee_c' sets up a lazy save for ZA - -; CHECK: remark: <unknown>:0:0: lazy save of ZA emitted in 'test_lazy_save_multiple_paths' -; CHECK-NEXT: remark: <unknown>:0:0: call to 'private_za_callee_b' requires ZA save -; CHECK-NEXT: remark: <unknown>:0:0: call to 'private_za_callee_a' requires ZA save -entry: - br i1 %a, label %if.end, label %if.else - -if.else: - call void @private_za_callee_a() - br label %if.end - -if.end: - call void @private_za_callee_b() - ; The new lowering won't report this call as the save is already needed due to - ; the call to `private_za_callee_b()`. - call void @private_za_callee_c() - - ret void -} - -define void @test_lazy_save_with_zt0() "aarch64_inout_za" "aarch64_inout_zt0" -{ -; CHECK-SDAG: remark: <unknown>:0:0: call from 'test_lazy_save_with_zt0' to 'private_za_callee' sets up a lazy save for ZA - -; CHECK: remark: <unknown>:0:0: spill of ZT0 emitted in 'test_lazy_save_with_zt0' -; CHECK-NEXT: remark: <unknown>:0:0: call to 'shared_za_callee' requires ZT0 save -; CHECK-NEXT: remark: <unknown>:0:0: lazy save of ZA emitted in 'test_lazy_save_with_zt0' -; CHECK-NEXT: remark: <unknown>:0:0: call to 'private_za_callee' requires ZA save - call void @shared_za_callee() ; Save ZT0 (remark ZT0 spill) - call void @private_za_callee() ; Save ZA (remark ZA save) - ret void -} - -define void @test_lazy_save_with_zt0_reload() "aarch64_inout_za" "aarch64_inout_zt0" -{ -; CHECK-SDAG: remark: <unknown>:0:0: call from 'test_lazy_save_with_zt0_reload' to 'private_za_callee' sets up a lazy save for ZA - -; CHECK: remark: <unknown>:0:0: spill of ZT0 emitted in 'test_lazy_save_with_zt0_reload' -; CHECK-NEXT: remark: <unknown>:0:0: call to 'shared_za_callee' requires ZT0 save -; CHECK-NEXT: remark: <unknown>:0:0: spill of ZT0 emitted in 'test_lazy_save_with_zt0_reload' -; CHECK-NEXT: remark: <unknown>:0:0: lazy save of ZA emitted in 'test_lazy_save_with_zt0_reload' -; CHECK-NEXT: remark: <unknown>:0:0: call to 'private_za_callee' requires ZA save - call void @shared_za_callee() ; Save ZT0 (remark ZT0 spill) - call void @shared_za_zt0_callee() ; Reload ZT0 - call void @private_za_callee() ; Save ZA, ZT0 (remark ZT0 spill and ZA save) - ret void -} - -define void @test_za_merge_paths(i1 %a) "aarch64_za_state_agnostic" { -;; Note: The old lowering does not emit any remarks for agnostic ZA saves. 
- -; CHECK: remark: <unknown>:0:0: full save of ZA emitted in 'test_za_merge_paths' -; CHECK-NEXT: remark: <unknown>:0:0: call to 'private_za_callee_b' requires ZA save -; CHECK-NEXT: remark: <unknown>:0:0: call to 'private_za_callee_a' requires ZA save -entry: - br i1 %a, label %if.end, label %if.else - -if.else: - call void @private_za_callee_a() - br label %exit - -if.end: - call void @private_za_callee_b() - br label %exit - -exit: - ; The new lowering won't report this call as the save is already needed due to - ; the call to `private_za_callee_*()` calls on both paths to this BB. - call void @private_za_callee_c() - - ret void -} - -define void @test_lazy_save_function_ptr_callee(ptr %private_za_callee) nounwind "aarch64_inout_za" { -; CHECK-SDAG: remark: <unknown>:0:0: call from 'test_lazy_save_function_ptr_callee' to 'unknown callee' sets up a lazy save for ZA - -; CHECK: remark: <unknown>:0:0: lazy save of ZA emitted in 'test_lazy_save_function_ptr_callee' -; CHECK-NEXT: remark: <unknown>:0:0: call requires ZA save - call void %private_za_callee() - ret void -} diff --git a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll index 0ee410d18bb3c..7a89879625632 100644 --- a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll +++ b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll @@ -352,61 +352,33 @@ define i64 @test_many_callee_arguments( } define void @agnostic_za_buffer_alloc_with_stack_probes() nounwind "aarch64_za_state_agnostic" "probe-stack"="inline-asm" "stack-probe-size"="65536"{ -; CHECK-SDAG-LABEL: agnostic_za_buffer_alloc_with_stack_probes: -; CHECK-SDAG: // %bb.0: -; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -; CHECK-SDAG-NEXT: str x19, [sp, #16] // 8-byte Spill -; CHECK-SDAG-NEXT: mov x29, sp -; CHECK-SDAG-NEXT: bl __arm_sme_state_size -; CHECK-SDAG-NEXT: mov x8, sp -; CHECK-SDAG-NEXT: sub x19, x8, x0 -; CHECK-SDAG-NEXT: .LBB7_1: // =>This Inner Loop Header: Depth=1 -; CHECK-SDAG-NEXT: sub sp, sp, #16, lsl #12 // =65536 -; CHECK-SDAG-NEXT: cmp sp, x19 -; CHECK-SDAG-NEXT: b.le .LBB7_3 -; CHECK-SDAG-NEXT: // %bb.2: // in Loop: Header=BB7_1 Depth=1 -; CHECK-SDAG-NEXT: ldr xzr, [sp] -; CHECK-SDAG-NEXT: b .LBB7_1 -; CHECK-SDAG-NEXT: .LBB7_3: -; CHECK-SDAG-NEXT: mov sp, x19 -; CHECK-SDAG-NEXT: ldr xzr, [sp] -; CHECK-SDAG-NEXT: mov x0, x19 -; CHECK-SDAG-NEXT: bl __arm_sme_save -; CHECK-SDAG-NEXT: bl private_za -; CHECK-SDAG-NEXT: mov x0, x19 -; CHECK-SDAG-NEXT: bl __arm_sme_restore -; CHECK-SDAG-NEXT: mov sp, x29 -; CHECK-SDAG-NEXT: ldr x19, [sp, #16] // 8-byte Reload -; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload -; CHECK-SDAG-NEXT: ret -; -; CHECK-LABEL: agnostic_za_buffer_alloc_with_stack_probes: -; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #16] // 8-byte Spill -; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: bl __arm_sme_state_size -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: sub x19, x8, x0 -; CHECK-NEXT: mov x0, x19 -; CHECK-NEXT: bl __arm_sme_save -; CHECK-NEXT: .LBB7_1: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 -; CHECK-NEXT: cmp sp, x19 -; CHECK-NEXT: b.le .LBB7_3 -; CHECK-NEXT: // %bb.2: // in Loop: Header=BB7_1 Depth=1 -; CHECK-NEXT: ldr xzr, [sp] -; CHECK-NEXT: b .LBB7_1 -; CHECK-NEXT: .LBB7_3: -; CHECK-NEXT: mov sp, x19 -; CHECK-NEXT: ldr xzr, [sp] -; CHECK-NEXT: bl private_za -; CHECK-NEXT: mov x0, x19 -; CHECK-NEXT: bl __arm_sme_restore -; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Reload -; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-COMMON-LABEL: agnostic_za_buffer_alloc_with_stack_probes: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-COMMON-NEXT: str x19, [sp, #16] // 8-byte Spill +; CHECK-COMMON-NEXT: mov x29, sp +; CHECK-COMMON-NEXT: bl __arm_sme_state_size +; CHECK-COMMON-NEXT: mov x8, sp +; CHECK-COMMON-NEXT: sub x19, x8, x0 +; CHECK-COMMON-NEXT: .LBB7_1: // =>This Inner Loop Header: Depth=1 +; CHECK-COMMON-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-COMMON-NEXT: cmp sp, x19 +; CHECK-COMMON-NEXT: b.le .LBB7_3 +; CHECK-COMMON-NEXT: // %bb.2: // in Loop: Header=BB7_1 Depth=1 +; CHECK-COMMON-NEXT: ldr xzr, [sp] +; CHECK-COMMON-NEXT: b .LBB7_1 +; CHECK-COMMON-NEXT: .LBB7_3: +; CHECK-COMMON-NEXT: mov sp, x19 +; CHECK-COMMON-NEXT: ldr xzr, [sp] +; CHECK-COMMON-NEXT: mov x0, x19 +; CHECK-COMMON-NEXT: bl __arm_sme_save +; CHECK-COMMON-NEXT: bl private_za +; CHECK-COMMON-NEXT: mov x0, x19 +; CHECK-COMMON-NEXT: bl __arm_sme_restore +; CHECK-COMMON-NEXT: mov sp, x29 +; CHECK-COMMON-NEXT: ldr x19, [sp, #16] // 8-byte Reload +; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ret call void @private_za() ret void } diff --git a/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll b/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll index 50449172ce85b..4e2de8aa8b5ac 100644 --- a/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll +++ b/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll @@ -49,36 +49,40 @@ define void @private_za_loop(i32 %n) "aarch64_inout_za" nounwind { ; CHECK-LABEL: private_za_loop: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #16] // 8-byte Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: sub x10, x29, #16 ; CHECK-NEXT: cmp w0, #1 ; CHECK-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEXT: msr TPIDR2_EL0, x10 -; CHECK-NEXT: b.lt .LBB0_3 +; CHECK-NEXT: b.lt .LBB0_5 ; CHECK-NEXT: // %bb.1: // %loop.preheader ; CHECK-NEXT: mov w19, w0 +; CHECK-NEXT: sub x20, x29, #16 +; CHECK-NEXT: b .LBB0_3 ; CHECK-NEXT: .LBB0_2: // %loop +; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1 +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: cbz w19, .LBB0_5 +; CHECK-NEXT: .LBB0_3: // %loop ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: msr TPIDR2_EL0, x20 ; CHECK-NEXT: bl private_za_call -; CHECK-NEXT: subs w19, w19, #1 -; CHECK-NEXT: b.ne .LBB0_2 -; CHECK-NEXT: .LBB0_3: // %exit +; CHECK-NEXT: sub w19, w19, #1 ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB0_5 -; CHECK-NEXT: // %bb.4: // %exit +; CHECK-NEXT: cbnz x8, .LBB0_2 +; CHECK-NEXT: // %bb.4: // %loop +; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: b .LBB0_2 ; CHECK-NEXT: .LBB0_5: // %exit -; CHECK-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Reload +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload ; CHECK-NEXT: ret entry: @@ -420,20 +424,30 @@ define void @cond_clobber_followed_by_clobber(i1 %cond) "aarch64_inout_za" nounw ; CHECK-NEXT: mov w19, w0 ; CHECK-NEXT: stp x9, x8, [x29, #-16] ; CHECK-NEXT: bl shared_za_call +; CHECK-NEXT: tbz w19, #0, .LBB5_4 +; CHECK-NEXT: // %bb.1: // %cond_clobber ; CHECK-NEXT: sub x8, x29, #16 ; CHECK-NEXT: msr TPIDR2_EL0, x8 -; CHECK-NEXT: tbz w19, #0, .LBB5_2 -; CHECK-NEXT: // %bb.1: // %cond_clobber -; CHECK-NEXT: bl private_za_call -; CHECK-NEXT: .LBB5_2: // %exit ; CHECK-NEXT: bl private_za_call ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB5_4 -; CHECK-NEXT: // %bb.3: // %exit +; CHECK-NEXT: cbnz x8, .LBB5_3 +; CHECK-NEXT: // %bb.2: // %cond_clobber ; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB5_3: // %cond_clobber +; CHECK-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEXT: .LBB5_4: // %exit +; CHECK-NEXT: sub x8, x29, #16 +; CHECK-NEXT: msr TPIDR2_EL0, x8 +; CHECK-NEXT: bl private_za_call +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: cbnz x8, .LBB5_6 +; CHECK-NEXT: // %bb.5: // %exit +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB5_6: // %exit ; CHECK-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEXT: mov sp, x29 ; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Reload @@ -543,81 +557,47 @@ merge_shared: define void @diamond_mixed_za_merge_private(i1 %cond) "aarch64_inout_za" nounwind { -; CHECK-SDAG-LABEL: diamond_mixed_za_merge_private: -; CHECK-SDAG: // %bb.0: // %entry -; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill -; CHECK-SDAG-NEXT: mov x29, sp -; CHECK-SDAG-NEXT: sub sp, sp, #16 -; CHECK-SDAG-NEXT: rdsvl x8, #1 -; CHECK-SDAG-NEXT: mov x9, sp -; CHECK-SDAG-NEXT: msub x9, x8, x8, x9 -; CHECK-SDAG-NEXT: mov sp, x9 -; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16] -; CHECK-SDAG-NEXT: tbz w0, #0, .LBB8_2 -; CHECK-SDAG-NEXT: // %bb.1: // %then -; CHECK-SDAG-NEXT: bl shared_za_call -; CHECK-SDAG-NEXT: b .LBB8_5 -; CHECK-SDAG-NEXT: .LBB8_2: // %else -; CHECK-SDAG-NEXT: sub x8, x29, #16 -; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x8 -; CHECK-SDAG-NEXT: bl private_za_call -; CHECK-SDAG-NEXT: smstart za -; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-SDAG-NEXT: sub x0, x29, #16 -; CHECK-SDAG-NEXT: cbnz x8, .LBB8_4 -; CHECK-SDAG-NEXT: // %bb.3: // %else -; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore -; CHECK-SDAG-NEXT: .LBB8_4: // %else -; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr -; CHECK-SDAG-NEXT: .LBB8_5: // %merge_private_za -; CHECK-SDAG-NEXT: sub x8, x29, #16 -; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x8 -; CHECK-SDAG-NEXT: bl private_za_call -; CHECK-SDAG-NEXT: smstart za -; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-SDAG-NEXT: sub x0, x29, #16 -; CHECK-SDAG-NEXT: cbnz x8, .LBB8_7 -; CHECK-SDAG-NEXT: // %bb.6: // %merge_private_za -; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore -; CHECK-SDAG-NEXT: .LBB8_7: // %merge_private_za -; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr -; CHECK-SDAG-NEXT: mov sp, x29 -; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload -; CHECK-SDAG-NEXT: ret -; -; CHECK-LABEL: diamond_mixed_za_merge_private: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill -; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x9, x8, x8, x9 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEXT: tbz w0, #0, .LBB8_2 -; CHECK-NEXT: // %bb.1: // %then -; CHECK-NEXT: bl shared_za_call -; CHECK-NEXT: sub x8, x29, #16 -; CHECK-NEXT: msr TPIDR2_EL0, x8 -; CHECK-NEXT: b .LBB8_3 -; CHECK-NEXT: .LBB8_2: // %else -; CHECK-NEXT: sub x8, x29, #16 -; CHECK-NEXT: msr TPIDR2_EL0, x8 -; CHECK-NEXT: bl private_za_call -; CHECK-NEXT: .LBB8_3: // %merge_private_za -; CHECK-NEXT: bl private_za_call -; CHECK-NEXT: smstart za -; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB8_5 -; CHECK-NEXT: // %bb.4: // %merge_private_za -; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB8_5: // %merge_private_za -; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-COMMON-LABEL: diamond_mixed_za_merge_private: +; CHECK-COMMON: // %bb.0: // %entry +; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-COMMON-NEXT: mov x29, sp +; CHECK-COMMON-NEXT: sub sp, sp, #16 +; CHECK-COMMON-NEXT: rdsvl x8, #1 +; CHECK-COMMON-NEXT: mov x9, sp +; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 +; CHECK-COMMON-NEXT: mov sp, x9 +; CHECK-COMMON-NEXT: stp x9, x8, [x29, #-16] +; CHECK-COMMON-NEXT: tbz w0, #0, .LBB8_2 +; CHECK-COMMON-NEXT: // %bb.1: // %then +; CHECK-COMMON-NEXT: bl shared_za_call +; CHECK-COMMON-NEXT: b .LBB8_5 +; CHECK-COMMON-NEXT: .LBB8_2: // %else +; CHECK-COMMON-NEXT: sub x8, x29, #16 +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x8 +; CHECK-COMMON-NEXT: bl private_za_call +; CHECK-COMMON-NEXT: smstart za +; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-COMMON-NEXT: sub x0, x29, #16 +; CHECK-COMMON-NEXT: cbnz x8, .LBB8_4 +; CHECK-COMMON-NEXT: // %bb.3: // %else +; CHECK-COMMON-NEXT: bl __arm_tpidr2_restore +; CHECK-COMMON-NEXT: .LBB8_4: // %else +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr +; CHECK-COMMON-NEXT: .LBB8_5: // %merge_private_za +; CHECK-COMMON-NEXT: sub x8, x29, #16 +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x8 +; CHECK-COMMON-NEXT: bl private_za_call +; CHECK-COMMON-NEXT: smstart za +; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-COMMON-NEXT: sub x0, x29, #16 +; CHECK-COMMON-NEXT: cbnz x8, .LBB8_7 +; CHECK-COMMON-NEXT: // %bb.6: // %merge_private_za +; CHECK-COMMON-NEXT: bl __arm_tpidr2_restore +; CHECK-COMMON-NEXT: .LBB8_7: // %merge_private_za +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr +; CHECK-COMMON-NEXT: mov sp, x29 +; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ret entry: br i1 %cond, label %then, label %else diff --git a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll index 5243b8d7203d8..e7a4f87d6dc5e 100644 --- a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll +++ b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll @@ -63,17 +63,25 @@ define void @za_with_raii(i1 %fail) "aarch64_inout_za" personality ptr @__gxx_pe ; CHECK-NEXT: ldr x1, [x1, :got_lo12:typeinfo_for_char_const_ptr] ; CHECK-NEXT: bl __cxa_throw ; CHECK-NEXT: .Ltmp1: // EH_LABEL -; CHECK-NEXT: // %bb.3: // %throw_fail -; CHECK-NEXT: .LBB0_4: // %unwind_dtors +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: cbnz x8, .LBB0_4 +; CHECK-NEXT: // %bb.3: // %throw_exception +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB0_4: // %throw_exception +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: // %bb.5: // %throw_fail +; CHECK-NEXT: .LBB0_6: // %unwind_dtors ; CHECK-NEXT: .Ltmp2: // EH_LABEL ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB0_6 -; CHECK-NEXT: // %bb.5: // %unwind_dtors +; CHECK-NEXT: cbnz x8, .LBB0_8 +; CHECK-NEXT: // %bb.7: // %unwind_dtors ; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB0_6: // %unwind_dtors +; CHECK-NEXT: .LBB0_8: // %unwind_dtors ; CHECK-NEXT: msr TPIDR2_EL0, xzr ; CHECK-NEXT: bl shared_za_call ; CHECK-NEXT: sub x8, x29, #16 @@ -224,15 +232,15 @@ define void @try_catch() "aarch64_inout_za" personality ptr @__gxx_personality_v ; CHECK-NEXT: msr TPIDR2_EL0, x8 ; CHECK-NEXT: bl may_throw ; CHECK-NEXT: .Ltmp4: // EH_LABEL -; CHECK-NEXT: .LBB1_1: // %after_catch ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB1_3 -; CHECK-NEXT: // %bb.2: // %after_catch +; CHECK-NEXT: cbnz x8, .LBB1_2 +; CHECK-NEXT: // %bb.1: ; CHECK-NEXT: bl 
__arm_tpidr2_restore -; CHECK-NEXT: .LBB1_3: // %after_catch +; CHECK-NEXT: .LBB1_2: ; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .LBB1_3: // %after_catch ; CHECK-NEXT: mov sp, x29 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: b shared_za_call @@ -251,7 +259,15 @@ define void @try_catch() "aarch64_inout_za" personality ptr @__gxx_personality_v ; CHECK-NEXT: sub x8, x29, #16 ; CHECK-NEXT: msr TPIDR2_EL0, x8 ; CHECK-NEXT: bl __cxa_end_catch -; CHECK-NEXT: b .LBB1_1 +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: cbnz x8, .LBB1_8 +; CHECK-NEXT: // %bb.7: // %catch +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB1_8: // %catch +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: b .LBB1_3 ; ; CHECK-SDAG-LABEL: try_catch: ; CHECK-SDAG: .Lfunc_begin1: @@ -636,9 +652,9 @@ define void @try_catch_agnostic_za() "aarch64_za_state_agnostic" personality ptr ; CHECK-NEXT: bl __arm_sme_save ; CHECK-NEXT: bl may_throw ; CHECK-NEXT: .Ltmp13: // EH_LABEL -; CHECK-NEXT: .LBB4_1: // %exit ; CHECK-NEXT: mov x0, x19 ; CHECK-NEXT: bl __arm_sme_restore +; CHECK-NEXT: .LBB4_1: // %exit ; CHECK-NEXT: mov sp, x29 ; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Reload ; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload @@ -647,6 +663,8 @@ define void @try_catch_agnostic_za() "aarch64_za_state_agnostic" personality ptr ; CHECK-NEXT: .Ltmp14: // EH_LABEL ; CHECK-NEXT: bl __cxa_begin_catch ; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __arm_sme_restore ; CHECK-NEXT: b .LBB4_1 ; ; CHECK-SDAG-LABEL: try_catch_agnostic_za: @@ -746,9 +764,9 @@ define void @try_catch_agnostic_za_invoke() "aarch64_za_state_agnostic" personal ; CHECK-NEXT: bl __arm_sme_save ; CHECK-NEXT: bl agnostic_za_call ; CHECK-NEXT: .Ltmp16: // EH_LABEL -; CHECK-NEXT: .LBB5_1: // %exit ; CHECK-NEXT: mov x0, x19 ; CHECK-NEXT: bl __arm_sme_restore +; CHECK-NEXT: .LBB5_1: // %exit ; CHECK-NEXT: mov sp, x29 ; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Reload ; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload @@ -757,6 +775,8 @@ define void @try_catch_agnostic_za_invoke() "aarch64_za_state_agnostic" personal ; CHECK-NEXT: .Ltmp17: // EH_LABEL ; CHECK-NEXT: bl __cxa_begin_catch ; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: mov x0, x19 +; CHECK-NEXT: bl __arm_sme_restore ; CHECK-NEXT: b .LBB5_1 ; ; CHECK-SDAG-LABEL: try_catch_agnostic_za_invoke: @@ -845,15 +865,15 @@ define void @try_catch_inout_za_agnostic_za_callee() "aarch64_inout_za" personal ; CHECK-NEXT: msr TPIDR2_EL0, x8 ; CHECK-NEXT: bl agnostic_za_call ; CHECK-NEXT: .Ltmp19: // EH_LABEL -; CHECK-NEXT: .LBB6_1: // %exit ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB6_3 -; CHECK-NEXT: // %bb.2: // %exit +; CHECK-NEXT: cbnz x8, .LBB6_2 +; CHECK-NEXT: // %bb.1: // %entry ; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB6_3: // %exit +; CHECK-NEXT: .LBB6_2: // %entry ; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .LBB6_3: // %exit ; CHECK-NEXT: mov sp, x29 ; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: ret @@ -861,7 +881,15 @@ define void @try_catch_inout_za_agnostic_za_callee() "aarch64_inout_za" personal ; CHECK-NEXT: .Ltmp20: // EH_LABEL ; CHECK-NEXT: bl __cxa_begin_catch ; CHECK-NEXT: bl __cxa_end_catch -; CHECK-NEXT: b .LBB6_1 +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: cbnz x8, 
.LBB6_6 +; CHECK-NEXT: // %bb.5: // %catch +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB6_6: // %catch +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: b .LBB6_3 ; ; CHECK-SDAG-LABEL: try_catch_inout_za_agnostic_za_callee: ; CHECK-SDAG: .Lfunc_begin6: @@ -967,9 +995,9 @@ define void @try_catch_inout_zt0() "aarch64_inout_zt0" personality ptr @__gxx_pe ; CHECK-NEXT: smstop za ; CHECK-NEXT: bl may_throw ; CHECK-NEXT: .Ltmp22: // EH_LABEL -; CHECK-NEXT: .LBB7_1: // %exit ; CHECK-NEXT: smstart za ; CHECK-NEXT: ldr zt0, [x19] +; CHECK-NEXT: .LBB7_1: // %exit ; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #80 ; CHECK-NEXT: ret @@ -977,6 +1005,8 @@ define void @try_catch_inout_zt0() "aarch64_inout_zt0" personality ptr @__gxx_pe ; CHECK-NEXT: .Ltmp23: // EH_LABEL ; CHECK-NEXT: bl __cxa_begin_catch ; CHECK-NEXT: bl __cxa_end_catch +; CHECK-NEXT: smstart za +; CHECK-NEXT: ldr zt0, [x19] ; CHECK-NEXT: b .LBB7_1 ; ; CHECK-SDAG-LABEL: try_catch_inout_zt0: diff --git a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll index 44f4ea2ad242b..f6d64e9771359 100644 --- a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll +++ b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll @@ -64,95 +64,50 @@ exit: } define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float %c) "aarch64_inout_za" "probe-stack"="inline-asm" "stack-probe-size"="65536" { -; CHECK-SDAG-LABEL: multi_bb_stpidr2_save_required_stackprobe: -; CHECK-SDAG: // %bb.0: -; CHECK-SDAG-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill -; CHECK-SDAG-NEXT: mov x29, sp -; CHECK-SDAG-NEXT: ldr xzr, [sp, #-16]! -; CHECK-SDAG-NEXT: .cfi_def_cfa w29, 16 -; CHECK-SDAG-NEXT: .cfi_offset w30, -8 -; CHECK-SDAG-NEXT: .cfi_offset w29, -16 -; CHECK-SDAG-NEXT: rdsvl x8, #1 -; CHECK-SDAG-NEXT: mov x9, sp -; CHECK-SDAG-NEXT: msub x9, x8, x8, x9 -; CHECK-SDAG-NEXT: .LBB2_1: // =>This Inner Loop Header: Depth=1 -; CHECK-SDAG-NEXT: sub sp, sp, #16, lsl #12 // =65536 -; CHECK-SDAG-NEXT: cmp sp, x9 -; CHECK-SDAG-NEXT: b.le .LBB2_3 -; CHECK-SDAG-NEXT: // %bb.2: // in Loop: Header=BB2_1 Depth=1 -; CHECK-SDAG-NEXT: ldr xzr, [sp] -; CHECK-SDAG-NEXT: b .LBB2_1 -; CHECK-SDAG-NEXT: .LBB2_3: -; CHECK-SDAG-NEXT: mov sp, x9 -; CHECK-SDAG-NEXT: ldr xzr, [sp] -; CHECK-SDAG-NEXT: stp x9, x8, [x29, #-16] -; CHECK-SDAG-NEXT: cbz w0, .LBB2_5 -; CHECK-SDAG-NEXT: // %bb.4: // %use_b -; CHECK-SDAG-NEXT: fmov s1, #4.00000000 -; CHECK-SDAG-NEXT: fadd s0, s0, s1 -; CHECK-SDAG-NEXT: b .LBB2_8 -; CHECK-SDAG-NEXT: .LBB2_5: // %use_c -; CHECK-SDAG-NEXT: fmov s0, s1 -; CHECK-SDAG-NEXT: sub x8, x29, #16 -; CHECK-SDAG-NEXT: msr TPIDR2_EL0, x8 -; CHECK-SDAG-NEXT: bl cosf -; CHECK-SDAG-NEXT: smstart za -; CHECK-SDAG-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-SDAG-NEXT: sub x0, x29, #16 -; CHECK-SDAG-NEXT: cbnz x8, .LBB2_7 -; CHECK-SDAG-NEXT: // %bb.6: // %use_c -; CHECK-SDAG-NEXT: bl __arm_tpidr2_restore -; CHECK-SDAG-NEXT: .LBB2_7: // %use_c -; CHECK-SDAG-NEXT: msr TPIDR2_EL0, xzr -; CHECK-SDAG-NEXT: .LBB2_8: // %exit -; CHECK-SDAG-NEXT: mov sp, x29 -; CHECK-SDAG-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload -; CHECK-SDAG-NEXT: ret -; -; CHECK-LABEL: multi_bb_stpidr2_save_required_stackprobe: -; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill -; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: ldr xzr, [sp, #-16]! 
-; CHECK-NEXT: .cfi_def_cfa w29, 16 -; CHECK-NEXT: .cfi_offset w30, -8 -; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: sub x10, x29, #16 -; CHECK-NEXT: msub x9, x8, x8, x9 -; CHECK-NEXT: msr TPIDR2_EL0, x10 -; CHECK-NEXT: .LBB2_1: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 -; CHECK-NEXT: cmp sp, x9 -; CHECK-NEXT: b.le .LBB2_3 -; CHECK-NEXT: // %bb.2: // in Loop: Header=BB2_1 Depth=1 -; CHECK-NEXT: ldr xzr, [sp] -; CHECK-NEXT: b .LBB2_1 -; CHECK-NEXT: .LBB2_3: -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: ldr xzr, [sp] -; CHECK-NEXT: stp x9, x8, [x29, #-16] -; CHECK-NEXT: cbz w0, .LBB2_5 -; CHECK-NEXT: // %bb.4: // %use_b -; CHECK-NEXT: fmov s1, #4.00000000 -; CHECK-NEXT: fadd s0, s0, s1 -; CHECK-NEXT: b .LBB2_6 -; CHECK-NEXT: .LBB2_5: // %use_c -; CHECK-NEXT: fmov s0, s1 -; CHECK-NEXT: bl cosf -; CHECK-NEXT: .LBB2_6: // %exit -; CHECK-NEXT: smstart za -; CHECK-NEXT: mrs x8, TPIDR2_EL0 -; CHECK-NEXT: sub x0, x29, #16 -; CHECK-NEXT: cbnz x8, .LBB2_8 -; CHECK-NEXT: // %bb.7: // %exit -; CHECK-NEXT: bl __arm_tpidr2_restore -; CHECK-NEXT: .LBB2_8: // %exit -; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload -; CHECK-NEXT: ret +; CHECK-COMMON-LABEL: multi_bb_stpidr2_save_required_stackprobe: +; CHECK-COMMON: // %bb.0: +; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-COMMON-NEXT: mov x29, sp +; CHECK-COMMON-NEXT: ldr xzr, [sp, #-16]! +; CHECK-COMMON-NEXT: .cfi_def_cfa w29, 16 +; CHECK-COMMON-NEXT: .cfi_offset w30, -8 +; CHECK-COMMON-NEXT: .cfi_offset w29, -16 +; CHECK-COMMON-NEXT: rdsvl x8, #1 +; CHECK-COMMON-NEXT: mov x9, sp +; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 +; CHECK-COMMON-NEXT: .LBB2_1: // =>This Inner Loop Header: Depth=1 +; CHECK-COMMON-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-COMMON-NEXT: cmp sp, x9 +; CHECK-COMMON-NEXT: b.le .LBB2_3 +; CHECK-COMMON-NEXT: // %bb.2: // in Loop: Header=BB2_1 Depth=1 +; CHECK-COMMON-NEXT: ldr xzr, [sp] +; CHECK-COMMON-NEXT: b .LBB2_1 +; CHECK-COMMON-NEXT: .LBB2_3: +; CHECK-COMMON-NEXT: mov sp, x9 +; CHECK-COMMON-NEXT: ldr xzr, [sp] +; CHECK-COMMON-NEXT: stp x9, x8, [x29, #-16] +; CHECK-COMMON-NEXT: cbz w0, .LBB2_5 +; CHECK-COMMON-NEXT: // %bb.4: // %use_b +; CHECK-COMMON-NEXT: fmov s1, #4.00000000 +; CHECK-COMMON-NEXT: fadd s0, s0, s1 +; CHECK-COMMON-NEXT: b .LBB2_8 +; CHECK-COMMON-NEXT: .LBB2_5: // %use_c +; CHECK-COMMON-NEXT: fmov s0, s1 +; CHECK-COMMON-NEXT: sub x8, x29, #16 +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x8 +; CHECK-COMMON-NEXT: bl cosf +; CHECK-COMMON-NEXT: smstart za +; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-COMMON-NEXT: sub x0, x29, #16 +; CHECK-COMMON-NEXT: cbnz x8, .LBB2_7 +; CHECK-COMMON-NEXT: // %bb.6: // %use_c +; CHECK-COMMON-NEXT: bl __arm_tpidr2_restore +; CHECK-COMMON-NEXT: .LBB2_7: // %use_c +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, xzr +; CHECK-COMMON-NEXT: .LBB2_8: // %exit +; CHECK-COMMON-NEXT: mov sp, x29 +; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ret %cmp = icmp ne i32 %a, 0 br i1 %cmp, label %use_b, label %use_c @@ -170,3 +125,6 @@ exit: } declare float @llvm.cos.f32(float) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}} +; CHECK-SDAG: {{.*}}
