https://github.com/MacDue created https://github.com/llvm/llvm-project/pull/149064

This extends the MachineSMEABIPass to handle agnostic ZA functions. This case 
is currently handled like shared ZA functions, except that ZA state does not 
need to be reloaded before calls to other agnostic ZA functions.
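
To illustrate the distinction, here is a hedged sketch mirroring the cases the 
updated `sme-agnostic-za.ll` tests cover (`private_za_call` and 
`agnostic_za_call` are hypothetical declarations):

```
// Sketch only: a private ZA callee may clobber ZA, so a full ZA save
// must be committed before the call and ZA restored afterwards.
void private_za_call();
// An agnostic ZA callee preserves ZA by contract, so no save or
// reload is needed around the call.
__arm_agnostic("sme_za_state") void agnostic_za_call();

__arm_agnostic("sme_za_state") void caller() {
  private_za_call();  // full ZA save/restore emitted around this call
  agnostic_za_call(); // no ZA state change required
}
```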

Note: This patch does not yet fully handle agnostic ZA functions that can catch 
exceptions. E.g.:

```
__arm_agnostic("sme_za_state") void try_catch_agnostic_za_callee()
{
  try {
    agnostic_za_call();
  } catch(...) {
    noexcept_agnostic_za_call();
  }
}
```

This is because, in this case, we won't commit a ZA save before 
`agnostic_za_call()`, which would be needed to restore ZA in the catch block. 
This will be handled in a later patch.
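
For reference, the full ZA save scheme emitted around a private ZA call can be 
seen in the updated `sme-agnostic-za.ll` test; condensed from its 
`CHECK-NEWLOWERING` lines, the sequence is roughly:

```
bl  __arm_sme_state_size   // query the size of the ZA save buffer
sub sp, sp, x0             // allocate a variable-sized buffer on the stack
mov x19, sp                // keep the buffer pointer live across the call
mov x0, x19
bl  __arm_sme_save         // commit the full ZA save
bl  private_za_decl        // the private ZA call itself
mov x0, x19
bl  __arm_sme_restore      // restore ZA after the call
```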

Change-Id: I9cce7b42ec8b64d5442b35231b65dfaf9d149eed

From 3fb2e45062e1be3883f18f001fcb2ce3c1ab95ef Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxw...@arm.com>
Date: Tue, 15 Jul 2025 11:47:48 +0000
Subject: [PATCH] [AArch64][SME] Support agnostic ZA functions in the
 MachineSMEABIPass

This extends the MachineSMEABIPass to handle agnostic ZA functions. This
case is currently handled like shared ZA functions, except that ZA state
does not need to be reloaded before calls to other agnostic ZA functions.

Note: This patch does not yet fully handle agnostic ZA functions that
can catch exceptions. E.g.:

```
__arm_agnostic("sme_za_state") void try_catch_agnostic_za_callee()
{
  try {
    agnostic_za_call();
  } catch(...) {
    noexcept_agnostic_za_call();
  }
}
```

This is because, in this case, we won't commit a ZA save before
`agnostic_za_call()`, which would be needed to restore ZA in the catch
block. This will be handled in a later patch.

Change-Id: I9cce7b42ec8b64d5442b35231b65dfaf9d149eed
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  13 +-
 llvm/lib/Target/AArch64/MachineSMEABIPass.cpp | 177 +++++++++++++++--
 llvm/test/CodeGen/AArch64/sme-agnostic-za.ll  | 181 ++++++++++++++++--
 3 files changed, 332 insertions(+), 39 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d586942582d8b..e0f157141c899 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8154,7 +8154,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
   if (Subtarget->hasCustomCallingConv())
     Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
 
-  if (Subtarget->useNewSMEABILowering() && !Attrs.hasAgnosticZAInterface()) {
+  if (Subtarget->useNewSMEABILowering()) {
     if (Subtarget->isTargetWindows() || hasInlineStackProbe(MF)) {
       SDValue Size;
       if (Attrs.hasZAState()) {
@@ -8965,9 +8965,13 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
   bool UseNewSMEABILowering = Subtarget->useNewSMEABILowering();
   bool IsAgnosticZAFunction = CallAttrs.caller().hasAgnosticZAInterface();
   auto ZAMarkerNode = [&]() -> std::optional<unsigned> {
-    // TODO: Handle agnostic ZA functions.
-    if (!UseNewSMEABILowering || IsAgnosticZAFunction)
+    if (!UseNewSMEABILowering)
+      return std::nullopt;
+    if (IsAgnosticZAFunction) {
+      if (CallAttrs.requiresPreservingAllZAState())
+        return AArch64ISD::REQUIRES_ZA_SAVE;
       return std::nullopt;
+    }
     if (!CallAttrs.caller().hasZAState() && !CallAttrs.caller().hasZT0State())
       return std::nullopt;
     return CallAttrs.requiresLazySave() ? AArch64ISD::REQUIRES_ZA_SAVE
@@ -9047,7 +9051,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
   };
 
   bool RequiresLazySave = !UseNewSMEABILowering && CallAttrs.requiresLazySave();
-  bool RequiresSaveAllZA = CallAttrs.requiresPreservingAllZAState();
+  bool RequiresSaveAllZA =
+      !UseNewSMEABILowering && CallAttrs.requiresPreservingAllZAState();
   if (RequiresLazySave) {
     const TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
     MachinePointerInfo MPI =
diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
index 287cc86e19bde..7c0cad299cc64 100644
--- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 //
 // This pass implements the SME ABI requirements for ZA state. This includes
-// implementing the lazy ZA state save schemes around calls.
+// implementing the lazy (and agnostic) ZA state save schemes around calls.
 //
 //===----------------------------------------------------------------------===//
 
@@ -128,7 +128,7 @@ struct MachineSMEABI : public MachineFunctionPass {
 
   void collectNeededZAStates(MachineFunction &MF, SMEAttrs);
   void pickBundleZAStates(MachineFunction &MF);
-  void insertStateChanges(MachineFunction &MF);
+  void insertStateChanges(MachineFunction &MF, bool IsAgnosticZA);
 
   // Emission routines for private and shared ZA functions (using lazy saves).
   void emitNewZAPrologue(MachineBasicBlock &MBB,
@@ -143,11 +143,46 @@ struct MachineSMEABI : public MachineFunctionPass {
   void emitZAOff(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                  bool ClearTPIDR2);
 
+  // Emission routines for agnostic ZA functions.
+  void emitSetupFullZASave(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI,
+                           LiveRegs PhysLiveRegs);
+  void emitFullZASaveRestore(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator MBBI,
+                             LiveRegs PhysLiveRegs, bool IsSave);
+  void emitAllocateFullZASaveBuffer(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator MBBI,
+                                    LiveRegs PhysLiveRegs);
+
   void emitStateChange(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-                       ZAState From, ZAState To, LiveRegs PhysLiveRegs);
+                       ZAState From, ZAState To, LiveRegs PhysLiveRegs,
+                       bool IsAgnosticZA);
+
+  // Helpers for switching between lazy/full ZA save/restore routines.
+  void emitZASave(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                  LiveRegs PhysLiveRegs, bool IsAgnosticZA) {
+    if (IsAgnosticZA)
+      return emitFullZASaveRestore(MBB, MBBI, PhysLiveRegs, /*IsSave=*/true);
+    return emitSetupLazySave(MBB, MBBI);
+  }
+  void emitZARestore(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                     LiveRegs PhysLiveRegs, bool IsAgnosticZA) {
+    if (IsAgnosticZA)
+      return emitFullZASaveRestore(MBB, MBBI, PhysLiveRegs, /*IsSave=*/false);
+    return emitRestoreLazySave(MBB, MBBI, PhysLiveRegs);
+  }
+  void emitAllocateZASaveBuffer(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MBBI,
+                                LiveRegs PhysLiveRegs, bool IsAgnosticZA) {
+    if (IsAgnosticZA)
+      return emitAllocateFullZASaveBuffer(MBB, MBBI, PhysLiveRegs);
+    return emitAllocateLazySaveBuffer(MBB, MBBI);
+  }
 
   TPIDR2State getTPIDR2Block(MachineFunction &MF);
 
+  Register getAgnosticZABufferPtr(MachineFunction &MF);
+
 private:
   struct InstInfo {
     ZAState NeededState{ZAState::ANY};
@@ -158,6 +193,7 @@ struct MachineSMEABI : public MachineFunctionPass {
   struct BlockInfo {
     ZAState FixedEntryState{ZAState::ANY};
     SmallVector<InstInfo> Insts;
+    LiveRegs PhysLiveRegsAtEntry = LiveRegs::None;
     LiveRegs PhysLiveRegsAtExit = LiveRegs::None;
   };
 
@@ -167,6 +203,9 @@ struct MachineSMEABI : public MachineFunctionPass {
     SmallVector<ZAState> BundleStates;
     std::optional<TPIDR2State> TPIDR2Block;
     std::optional<MachineBasicBlock::iterator> AfterSMEProloguePt;
+    Register AgnosticZABufferPtr = AArch64::NoRegister;
+    LiveRegs PhysLiveRegsAfterSMEPrologue = LiveRegs::None;
+    bool HasFullZASaveRestore = false;
   } State;
 
   EdgeBundles *Bundles = nullptr;
@@ -175,7 +214,8 @@ struct MachineSMEABI : public MachineFunctionPass {
 void MachineSMEABI::collectNeededZAStates(MachineFunction &MF,
                                           SMEAttrs SMEFnAttrs) {
   const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
-  assert((SMEFnAttrs.hasZT0State() || SMEFnAttrs.hasZAState()) &&
+  assert((SMEFnAttrs.hasAgnosticZAInterface() || SMEFnAttrs.hasZT0State() ||
+          SMEFnAttrs.hasZAState()) &&
          "Expected function to have ZA/ZT0 state!");
 
   State.Blocks.resize(MF.getNumBlockIDs());
@@ -209,6 +249,7 @@ void MachineSMEABI::collectNeededZAStates(MachineFunction &MF,
 
     Block.PhysLiveRegsAtExit = GetPhysLiveRegs();
     auto FirstTerminatorInsertPt = MBB.getFirstTerminator();
+    auto FirstNonPhiInsertPt = MBB.getFirstNonPHI();
     for (MachineInstr &MI : reverse(MBB)) {
       MachineBasicBlock::iterator MBBI(MI);
       LiveUnits.stepBackward(MI);
@@ -219,15 +260,20 @@ void MachineSMEABI::collectNeededZAStates(MachineFunction &MF,
       // block setup.
       if (MI.getOpcode() == AArch64::SMEStateAllocPseudo) {
         State.AfterSMEProloguePt = MBBI;
+        State.PhysLiveRegsAfterSMEPrologue = PhysLiveRegs;
       }
+      // Note: We treat Agnostic ZA as inout_za with an alternate save/restore.
       auto [NeededState, InsertPt] = getInstNeededZAState(
-          TRI, MI, /*ZALiveAtReturn=*/SMEFnAttrs.hasSharedZAInterface());
+          TRI, MI, /*ZALiveAtReturn=*/SMEFnAttrs.hasSharedZAInterface() ||
+                       SMEFnAttrs.hasAgnosticZAInterface());
       assert((InsertPt == MBBI ||
               InsertPt->getOpcode() == AArch64::ADJCALLSTACKDOWN) &&
              "Unexpected state change insertion point!");
       // TODO: Do something to avoid state changes where NZCV is live.
       if (MBBI == FirstTerminatorInsertPt)
         Block.PhysLiveRegsAtExit = PhysLiveRegs;
+      if (MBBI == FirstNonPhiInsertPt)
+        Block.PhysLiveRegsAtEntry = PhysLiveRegs;
       if (NeededState != ZAState::ANY)
         Block.Insts.push_back({NeededState, InsertPt, PhysLiveRegs});
     }
@@ -294,7 +340,7 @@ void MachineSMEABI::pickBundleZAStates(MachineFunction &MF) {
   }
 }
 
-void MachineSMEABI::insertStateChanges(MachineFunction &MF) {
+void MachineSMEABI::insertStateChanges(MachineFunction &MF, bool IsAgnosticZA) {
   for (MachineBasicBlock &MBB : MF) {
     BlockInfo &Block = State.Blocks[MBB.getNumber()];
     ZAState InState =
@@ -309,7 +355,7 @@ void MachineSMEABI::insertStateChanges(MachineFunction &MF) {
     for (auto &Inst : Block.Insts) {
       if (CurrentState != Inst.NeededState)
         emitStateChange(MBB, Inst.InsertPt, CurrentState, Inst.NeededState,
-                        Inst.PhysLiveRegs);
+                        Inst.PhysLiveRegs, IsAgnosticZA);
       CurrentState = Inst.NeededState;
     }
 
@@ -318,7 +364,7 @@ void MachineSMEABI::insertStateChanges(MachineFunction &MF) {
 
     if (CurrentState != OutState)
       emitStateChange(MBB, MBB.getFirstTerminator(), CurrentState, OutState,
-                      Block.PhysLiveRegsAtExit);
+                      Block.PhysLiveRegsAtExit, IsAgnosticZA);
   }
 }
 
@@ -571,10 +617,98 @@ void MachineSMEABI::emitNewZAPrologue(MachineBasicBlock &MBB,
     emitZeroZA(TII, DL, MBB, MBBI, /*Mask=*/0b11111111);
 }
 
+Register MachineSMEABI::getAgnosticZABufferPtr(MachineFunction &MF) {
+  if (State.AgnosticZABufferPtr != AArch64::NoRegister)
+    return State.AgnosticZABufferPtr;
+  if (auto BufferPtr =
+          MF.getInfo<AArch64FunctionInfo>()->getEarlyAllocSMESaveBuffer();
+      BufferPtr != AArch64::NoRegister)
+    State.AgnosticZABufferPtr = BufferPtr;
+  else
+    State.AgnosticZABufferPtr =
+        MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+  return State.AgnosticZABufferPtr;
+}
+
+void MachineSMEABI::emitFullZASaveRestore(MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator MBBI,
+                                          LiveRegs PhysLiveRegs, bool IsSave) {
+  MachineFunction &MF = *MBB.getParent();
+  auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+  const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  State.HasFullZASaveRestore = true;
+  DebugLoc DL = getDebugLoc(MBB, MBBI);
+  Register BufferPtr = AArch64::X0;
+
+  ScopedPhysRegSave ScopedPhysRegSave(MRI, TII, DL, MBB, MBBI, PhysLiveRegs);
+
+  // Copy the buffer pointer into X0.
+  BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::COPY), BufferPtr)
+      .addReg(getAgnosticZABufferPtr(MF));
+
+  // Call __arm_sme_save/__arm_sme_restore.
+  BuildMI(MBB, MBBI, DL, TII.get(AArch64::BL))
+      .addReg(BufferPtr, RegState::Implicit)
+      .addExternalSymbol(IsSave ? "__arm_sme_save" : "__arm_sme_restore")
+      .addRegMask(TRI.getCallPreservedMask(
+          MF,
+          CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1));
+}
+
+void MachineSMEABI::emitAllocateFullZASaveBuffer(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    LiveRegs PhysLiveRegs) {
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+  const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  auto *AFI = MF.getInfo<AArch64FunctionInfo>();
+
+  // Buffer already allocated in SelectionDAG.
+  if (AFI->getEarlyAllocSMESaveBuffer())
+    return;
+
+  DebugLoc DL = getDebugLoc(MBB, MBBI);
+  Register BufferPtr = getAgnosticZABufferPtr(MF);
+  Register BufferSize = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+
+  ScopedPhysRegSave ScopedPhysRegSave(MRI, TII, DL, MBB, MBBI, PhysLiveRegs);
+
+  // Calculate the SME state size.
+  {
+    const AArch64RegisterInfo *TRI = Subtarget.getRegisterInfo();
+    BuildMI(MBB, MBBI, DL, TII.get(AArch64::BL))
+        .addExternalSymbol("__arm_sme_state_size")
+        .addReg(AArch64::X0, RegState::ImplicitDefine)
+        .addRegMask(TRI->getCallPreservedMask(
+            MF, CallingConv::
+                    AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1));
+    BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::COPY), BufferSize)
+        .addReg(AArch64::X0);
+  }
+
+  // Allocate a buffer object of the size given __arm_sme_state_size.
+  {
+    BuildMI(MBB, MBBI, DL, TII.get(AArch64::SUBXrx64), AArch64::SP)
+        .addReg(AArch64::SP)
+        .addReg(BufferSize)
+        .addImm(AArch64_AM::getArithExtendImm(AArch64_AM::UXTX, 0));
+    BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::COPY), BufferPtr)
+        .addReg(AArch64::SP);
+
+    // We have just allocated a variable sized object, tell this to PEI.
+    MFI.CreateVariableSizedObject(Align(16), nullptr);
+  }
+}
+
 void MachineSMEABI::emitStateChange(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator InsertPt,
                                     ZAState From, ZAState To,
-                                    LiveRegs PhysLiveRegs) {
+                                    LiveRegs PhysLiveRegs, bool IsAgnosticZA) {
 
   // ZA not used.
   if (From == ZAState::ANY || To == ZAState::ANY)
@@ -601,10 +735,11 @@ void MachineSMEABI::emitStateChange(MachineBasicBlock &MBB,
   }
 
   if (From == ZAState::ACTIVE && To == ZAState::LOCAL_SAVED)
-    emitSetupLazySave(MBB, InsertPt);
+    emitZASave(MBB, InsertPt, PhysLiveRegs, IsAgnosticZA);
   else if (From == ZAState::LOCAL_SAVED && To == ZAState::ACTIVE)
-    emitRestoreLazySave(MBB, InsertPt, PhysLiveRegs);
+    emitZARestore(MBB, InsertPt, PhysLiveRegs, IsAgnosticZA);
   else if (To == ZAState::OFF) {
+    assert(!IsAgnosticZA && "Should not turn ZA off in agnostic ZA function");
     // If we're exiting from the CALLER_DORMANT state that means this new ZA
     // function did not touch ZA (so ZA was never turned on).
     if (From != ZAState::CALLER_DORMANT)
@@ -627,7 +762,8 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
 
   auto *AFI = MF.getInfo<AArch64FunctionInfo>();
   SMEAttrs SMEFnAttrs = AFI->getSMEFnAttrs();
-  if (!SMEFnAttrs.hasZAState() && !SMEFnAttrs.hasZT0State())
+  if (!SMEFnAttrs.hasZAState() && !SMEFnAttrs.hasZT0State() &&
+      !SMEFnAttrs.hasAgnosticZAInterface())
     return false;
 
   assert(MF.getRegInfo().isSSA() && "Expected to be run on SSA form!");
@@ -636,20 +772,27 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
   State = PassState{};
   Bundles = &getAnalysis<EdgeBundlesWrapperLegacy>().getEdgeBundles();
 
+  bool IsAgnosticZA = SMEFnAttrs.hasAgnosticZAInterface();
+
   collectNeededZAStates(MF, SMEFnAttrs);
   pickBundleZAStates(MF);
-  insertStateChanges(MF);
+  insertStateChanges(MF, /*IsAgnosticZA=*/IsAgnosticZA);
 
   // Allocate save buffer (if needed).
-  if (State.TPIDR2Block.has_value()) {
+  if (State.HasFullZASaveRestore || State.TPIDR2Block.has_value()) {
     if (State.AfterSMEProloguePt) {
      // Note: With inline stack probes the AfterSMEProloguePt may not be in the
       // entry block (due to the probing loop).
-      emitAllocateLazySaveBuffer(*(*State.AfterSMEProloguePt)->getParent(),
-                                 *State.AfterSMEProloguePt);
+      emitAllocateZASaveBuffer(*(*State.AfterSMEProloguePt)->getParent(),
+                               *State.AfterSMEProloguePt,
+                               State.PhysLiveRegsAfterSMEPrologue,
+                               /*IsAgnosticZA=*/IsAgnosticZA);
     } else {
       MachineBasicBlock &EntryBlock = MF.front();
-      emitAllocateLazySaveBuffer(EntryBlock, EntryBlock.getFirstNonPHI());
+      emitAllocateZASaveBuffer(
+          EntryBlock, EntryBlock.getFirstNonPHI(),
+          State.Blocks[EntryBlock.getNumber()].PhysLiveRegsAtEntry,
+          /*IsAgnosticZA=*/IsAgnosticZA);
     }
   }
 
diff --git a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
index d1ec53f54c702..0447166a2dde6 100644
--- a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
+++ b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mattr=+sme2 < %s | FileCheck %s
-; RUN: llc -mattr=+sme2 < %s -aarch64-new-sme-abi | FileCheck %s
+; RUN: llc -mattr=+sme2 < %s | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK
+; RUN: llc -mattr=+sme2 < %s -aarch64-new-sme-abi | FileCheck %s --check-prefixes=CHECK-COMMON,CHECK-NEWLOWERING
 
 target triple = "aarch64"
 
@@ -9,10 +9,10 @@ declare i64 @agnostic_decl(i64) "aarch64_za_state_agnostic"
 
 ; No calls. Test that no buffer is allocated.
 define i64 @agnostic_caller_no_callees(ptr %ptr) nounwind "aarch64_za_state_agnostic" {
-; CHECK-LABEL: agnostic_caller_no_callees:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr x0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-COMMON-LABEL: agnostic_caller_no_callees:
+; CHECK-COMMON:       // %bb.0:
+; CHECK-COMMON-NEXT:    ldr x0, [x0]
+; CHECK-COMMON-NEXT:    ret
   %v = load i64, ptr %ptr
   ret i64 %v
 }
@@ -51,6 +51,29 @@ define i64 @agnostic_caller_private_za_callee(i64 %v) nounwind "aarch64_za_state
 ; CHECK-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
+;
+; CHECK-NEWLOWERING-LABEL: agnostic_caller_private_za_callee:
+; CHECK-NEWLOWERING:       // %bb.0:
+; CHECK-NEWLOWERING-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT:    mov x29, sp
+; CHECK-NEWLOWERING-NEXT:    mov x8, x0
+; CHECK-NEWLOWERING-NEXT:    bl __arm_sme_state_size
+; CHECK-NEWLOWERING-NEXT:    sub sp, sp, x0
+; CHECK-NEWLOWERING-NEXT:    mov x19, sp
+; CHECK-NEWLOWERING-NEXT:    mov x0, x19
+; CHECK-NEWLOWERING-NEXT:    bl __arm_sme_save
+; CHECK-NEWLOWERING-NEXT:    mov x0, x8
+; CHECK-NEWLOWERING-NEXT:    bl private_za_decl
+; CHECK-NEWLOWERING-NEXT:    bl private_za_decl
+; CHECK-NEWLOWERING-NEXT:    mov x8, x0
+; CHECK-NEWLOWERING-NEXT:    mov x0, x19
+; CHECK-NEWLOWERING-NEXT:    bl __arm_sme_restore
+; CHECK-NEWLOWERING-NEXT:    mov x0, x8
+; CHECK-NEWLOWERING-NEXT:    mov sp, x29
+; CHECK-NEWLOWERING-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT:    ret
   %res = call i64 @private_za_decl(i64 %v)
   %res2 = call i64 @private_za_decl(i64 %res)
   ret i64 %res2
@@ -60,12 +83,12 @@ define i64 @agnostic_caller_private_za_callee(i64 %v) nounwind "aarch64_za_state
 ;
 ; Should not result in save/restore code.
 define i64 @agnostic_caller_agnostic_callee(i64 %v) nounwind "aarch64_za_state_agnostic" {
-; CHECK-LABEL: agnostic_caller_agnostic_callee:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    bl agnostic_decl
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-COMMON-LABEL: agnostic_caller_agnostic_callee:
+; CHECK-COMMON:       // %bb.0:
+; CHECK-COMMON-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-COMMON-NEXT:    bl agnostic_decl
+; CHECK-COMMON-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-COMMON-NEXT:    ret
   %res = call i64 @agnostic_decl(i64 %v)
   ret i64 %res
 }
@@ -74,12 +97,12 @@ define i64 @agnostic_caller_agnostic_callee(i64 %v) nounwind "aarch64_za_state_a
 ;
 ; Should not result in lazy-save or save of ZT0
 define i64 @shared_caller_agnostic_callee(i64 %v) nounwind "aarch64_inout_za" "aarch64_inout_zt0" {
-; CHECK-LABEL: shared_caller_agnostic_callee:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    bl agnostic_decl
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
+; CHECK-COMMON-LABEL: shared_caller_agnostic_callee:
+; CHECK-COMMON:       // %bb.0:
+; CHECK-COMMON-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-COMMON-NEXT:    bl agnostic_decl
+; CHECK-COMMON-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-COMMON-NEXT:    ret
   %res = call i64 @agnostic_decl(i64 %v)
   ret i64 %res
 }
@@ -130,6 +153,45 @@ define i64 @streaming_agnostic_caller_nonstreaming_private_za_callee(i64 %v) nou
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #112 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
+;
+; CHECK-NEWLOWERING-LABEL: streaming_agnostic_caller_nonstreaming_private_za_callee:
+; CHECK-NEWLOWERING:       // %bb.0:
+; CHECK-NEWLOWERING-NEXT:    stp d15, d14, [sp, #-112]! // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT:    mov x9, x0
+; CHECK-NEWLOWERING-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT:    bl __arm_get_current_vg
+; CHECK-NEWLOWERING-NEXT:    str x0, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT:    mov x0, x9
+; CHECK-NEWLOWERING-NEXT:    add x29, sp, #64
+; CHECK-NEWLOWERING-NEXT:    stp x20, x19, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT:    mov x8, x0
+; CHECK-NEWLOWERING-NEXT:    bl __arm_sme_state_size
+; CHECK-NEWLOWERING-NEXT:    sub sp, sp, x0
+; CHECK-NEWLOWERING-NEXT:    mov x20, sp
+; CHECK-NEWLOWERING-NEXT:    mov x0, x20
+; CHECK-NEWLOWERING-NEXT:    bl __arm_sme_save
+; CHECK-NEWLOWERING-NEXT:    smstop sm
+; CHECK-NEWLOWERING-NEXT:    mov x0, x8
+; CHECK-NEWLOWERING-NEXT:    bl private_za_decl
+; CHECK-NEWLOWERING-NEXT:    smstart sm
+; CHECK-NEWLOWERING-NEXT:    smstop sm
+; CHECK-NEWLOWERING-NEXT:    bl private_za_decl
+; CHECK-NEWLOWERING-NEXT:    smstart sm
+; CHECK-NEWLOWERING-NEXT:    mov x8, x0
+; CHECK-NEWLOWERING-NEXT:    mov x0, x20
+; CHECK-NEWLOWERING-NEXT:    bl __arm_sme_restore
+; CHECK-NEWLOWERING-NEXT:    mov x0, x8
+; CHECK-NEWLOWERING-NEXT:    sub sp, x29, #64
+; CHECK-NEWLOWERING-NEXT:    ldp x20, x19, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT:    ldp d15, d14, [sp], #112 // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT:    ret
   %res = call i64 @private_za_decl(i64 %v)
   %res2 = call i64 @private_za_decl(i64 %res)
   ret i64 %res2
@@ -197,6 +259,64 @@ define i64 @streaming_compatible_agnostic_caller_nonstreaming_private_za_callee(
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d15, d14, [sp], #112 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
+;
+; CHECK-NEWLOWERING-LABEL: streaming_compatible_agnostic_caller_nonstreaming_private_za_callee:
+; CHECK-NEWLOWERING:       // %bb.0:
+; CHECK-NEWLOWERING-NEXT:    stp d15, d14, [sp, #-112]! // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT:    mov x9, x0
+; CHECK-NEWLOWERING-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT:    bl __arm_get_current_vg
+; CHECK-NEWLOWERING-NEXT:    str x0, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT:    mov x0, x9
+; CHECK-NEWLOWERING-NEXT:    add x29, sp, #64
+; CHECK-NEWLOWERING-NEXT:    stp x20, x19, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT:    mov x8, x0
+; CHECK-NEWLOWERING-NEXT:    bl __arm_sme_state_size
+; CHECK-NEWLOWERING-NEXT:    sub sp, sp, x0
+; CHECK-NEWLOWERING-NEXT:    mov x19, sp
+; CHECK-NEWLOWERING-NEXT:    bl __arm_sme_state
+; CHECK-NEWLOWERING-NEXT:    mov x9, x0
+; CHECK-NEWLOWERING-NEXT:    mov x0, x19
+; CHECK-NEWLOWERING-NEXT:    bl __arm_sme_save
+; CHECK-NEWLOWERING-NEXT:    and x20, x9, #0x1
+; CHECK-NEWLOWERING-NEXT:    tbz w20, #0, .LBB5_2
+; CHECK-NEWLOWERING-NEXT:  // %bb.1:
+; CHECK-NEWLOWERING-NEXT:    smstop sm
+; CHECK-NEWLOWERING-NEXT:  .LBB5_2:
+; CHECK-NEWLOWERING-NEXT:    mov x0, x8
+; CHECK-NEWLOWERING-NEXT:    bl private_za_decl
+; CHECK-NEWLOWERING-NEXT:    mov x2, x0
+; CHECK-NEWLOWERING-NEXT:    tbz w20, #0, .LBB5_4
+; CHECK-NEWLOWERING-NEXT:  // %bb.3:
+; CHECK-NEWLOWERING-NEXT:    smstart sm
+; CHECK-NEWLOWERING-NEXT:  .LBB5_4:
+; CHECK-NEWLOWERING-NEXT:    bl __arm_sme_state
+; CHECK-NEWLOWERING-NEXT:    and x20, x0, #0x1
+; CHECK-NEWLOWERING-NEXT:    tbz w20, #0, .LBB5_6
+; CHECK-NEWLOWERING-NEXT:  // %bb.5:
+; CHECK-NEWLOWERING-NEXT:    smstop sm
+; CHECK-NEWLOWERING-NEXT:  .LBB5_6:
+; CHECK-NEWLOWERING-NEXT:    mov x0, x2
+; CHECK-NEWLOWERING-NEXT:    bl private_za_decl
+; CHECK-NEWLOWERING-NEXT:    tbz w20, #0, .LBB5_8
+; CHECK-NEWLOWERING-NEXT:  // %bb.7:
+; CHECK-NEWLOWERING-NEXT:    smstart sm
+; CHECK-NEWLOWERING-NEXT:  .LBB5_8:
+; CHECK-NEWLOWERING-NEXT:    mov x8, x0
+; CHECK-NEWLOWERING-NEXT:    mov x0, x19
+; CHECK-NEWLOWERING-NEXT:    bl __arm_sme_restore
+; CHECK-NEWLOWERING-NEXT:    mov x0, x8
+; CHECK-NEWLOWERING-NEXT:    sub sp, x29, #64
+; CHECK-NEWLOWERING-NEXT:    ldp x20, x19, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT:    ldp d15, d14, [sp], #112 // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT:    ret
   %res = call i64 @private_za_decl(i64 %v)
   %res2 = call i64 @private_za_decl(i64 %res)
   ret i64 %res2
@@ -233,6 +353,31 @@ define i64  @test_many_callee_arguments(
 ; CHECK-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
+;
+; CHECK-NEWLOWERING-LABEL: test_many_callee_arguments:
+; CHECK-NEWLOWERING:       // %bb.0:
+; CHECK-NEWLOWERING-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT:    mov x29, sp
+; CHECK-NEWLOWERING-NEXT:    mov x8, x0
+; CHECK-NEWLOWERING-NEXT:    bl __arm_sme_state_size
+; CHECK-NEWLOWERING-NEXT:    sub sp, sp, x0
+; CHECK-NEWLOWERING-NEXT:    mov x19, sp
+; CHECK-NEWLOWERING-NEXT:    ldp x9, x10, [x29, #32]
+; CHECK-NEWLOWERING-NEXT:    mov x0, x19
+; CHECK-NEWLOWERING-NEXT:    bl __arm_sme_save
+; CHECK-NEWLOWERING-NEXT:    stp x9, x10, [sp, #-16]!
+; CHECK-NEWLOWERING-NEXT:    mov x0, x8
+; CHECK-NEWLOWERING-NEXT:    bl many_args_private_za_callee
+; CHECK-NEWLOWERING-NEXT:    add sp, sp, #16
+; CHECK-NEWLOWERING-NEXT:    mov x8, x0
+; CHECK-NEWLOWERING-NEXT:    mov x0, x19
+; CHECK-NEWLOWERING-NEXT:    bl __arm_sme_restore
+; CHECK-NEWLOWERING-NEXT:    mov x0, x8
+; CHECK-NEWLOWERING-NEXT:    mov sp, x29
+; CHECK-NEWLOWERING-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT:    ret
   i64 %0, i64 %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6, i64 %7, i64 %8, i64 %9
 ) nounwind "aarch64_za_state_agnostic" {
   %ret = call i64 @many_args_private_za_callee(
