https://github.com/MacDue created https://github.com/llvm/llvm-project/pull/166362

This patch extends the MachineSMEABIPass to support ZT0. This is done with the 
addition of two new states:

  - `ACTIVE_ZT0_SAVED`
    * This is used when calling a function that shares ZA, but does not share
      ZT0 (i.e., has no ZT0 attributes).
    * This state indicates that ZT0 must be saved to its save slot, but ZA must
      remain on, with no lazy save set up.
  - `LOCAL_COMMITTED`
    * This is used for saving ZT0 in functions without ZA state.
    * This state indicates that ZA is off and ZT0 has been saved.
    * This state is general enough to also cover saving ZA, but that has not
      been implemented†

To aid readability, the state transitions have been reworked from nested ifs
into a switch over `transitionFrom(<FromState>).to(<ToState>)` keys, which
makes the growing set of transitions easier to manage (sketched below).

† This could be implemented to handle some cases of undefined behavior better.
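
To make the switch pattern concrete, here is a minimal, self-contained C++
sketch of the encoding. The enum values and the `FromState`/`transitionFrom`
helper mirror the patch below; `emitStateChange` here is an abbreviated,
illustrative stand-in, not the pass's real signature:

```c++
#include <cstdint>

enum ZAState : uint8_t {
  ANY = 0,          // Any/unknown state (not valid).
  ACTIVE,           // ZA (and ZT0) live in the registers.
  ACTIVE_ZT0_SAVED, // ZA is active, but ZT0 is saved to its save slot.
  LOCAL_SAVED,      // A ZA save has been set up or committed.
  LOCAL_COMMITTED,  // ZA committed to the save buffer; ZA is off.
  ENTRY,            // The ZA/ZT0 state on entry to the function.
  OFF,              // ZA is off.
  NUM_ZA_STATE
};

struct FromState {
  ZAState From;
  // Pack (From, To) into a single byte so a pair of states can be switched on.
  constexpr uint8_t to(ZAState To) const {
    static_assert(NUM_ZA_STATE < 16, "expected ZAState to fit in 4-bits");
    return uint8_t(From) << 4 | uint8_t(To);
  }
};

constexpr FromState transitionFrom(ZAState From) { return FromState{From}; }

// Each case label is a constant expression, so every (From, To) pair reads as
// a single line rather than a nested if/else chain.
void emitStateChange(ZAState From, ZAState To) {
  switch (transitionFrom(From).to(To)) {
  case transitionFrom(ACTIVE).to(ACTIVE_ZT0_SAVED):
    // Save ZT0 only (emitZT0SaveRestore with IsSave=true in the patch).
    break;
  case transitionFrom(ACTIVE_ZT0_SAVED).to(ACTIVE):
    // Restore ZT0 only (emitZT0SaveRestore with IsSave=false in the patch).
    break;
  default:
    break; // Remaining transitions elided.
  }
}
```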

From dc41be430aa17616f431e0ce793e66f92df28881 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <[email protected]>
Date: Mon, 3 Nov 2025 15:41:49 +0000
Subject: [PATCH] [AArch64][SME] Support saving/restoring ZT0 in the
 MachineSMEABIPass
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch extends the MachineSMEABIPass to support ZT0. This is done
with the addition of two new states:

  - `ACTIVE_ZT0_SAVED`
    * This is used when calling a function that shares ZA, but does not
      share ZT0 (i.e., has no ZT0 attributes).
    * This state indicates that ZT0 must be saved to its save slot, but
      ZA must remain on, with no lazy save set up.
  - `LOCAL_COMMITTED`
    * This is used for saving ZT0 in functions without ZA state.
    * This state indicates that ZA is off and ZT0 has been saved.
    * This state is general enough to also cover saving ZA, but that
      has not been implemented†

To aid readability, the state transitions have been reworked from nested
ifs into a switch over `transitionFrom(<FromState>).to(<ToState>)` keys,
which makes the growing set of transitions easier to manage.

† This could be implemented to handle some cases of undefined behavior
  better.

Change-Id: I14be4a7f8b998fe667bfaade5088f88039515f91
---
 .../AArch64/AArch64ExpandPseudoInsts.cpp      |   1 +
 .../Target/AArch64/AArch64ISelLowering.cpp    |  11 +-
 .../lib/Target/AArch64/AArch64SMEInstrInfo.td |   6 +
 llvm/lib/Target/AArch64/MachineSMEABIPass.cpp | 176 +++++++++++++++---
 .../test/CodeGen/AArch64/sme-peephole-opts.ll |   4 -
 .../test/CodeGen/AArch64/sme-za-exceptions.ll | 124 +++++++++---
 llvm/test/CodeGen/AArch64/sme-zt0-state.ll    | 104 ++++++-----
 7 files changed, 321 insertions(+), 105 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 34d74d04c4419..60e6a82d41cc8 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1717,6 +1717,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
    }
    case AArch64::InOutZAUsePseudo:
    case AArch64::RequiresZASavePseudo:
+   case AArch64::RequiresZT0SavePseudo:
    case AArch64::SMEStateAllocPseudo:
    case AArch64::COALESCER_BARRIER_FPR16:
    case AArch64::COALESCER_BARRIER_FPR32:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 30f961043e78b..20c1c6790b2fb 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -9457,6 +9457,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
     if (CallAttrs.requiresLazySave() ||
         CallAttrs.requiresPreservingAllZAState())
       ZAMarkerNode = AArch64ISD::REQUIRES_ZA_SAVE;
+    else if (CallAttrs.requiresPreservingZT0())
+      ZAMarkerNode = AArch64ISD::REQUIRES_ZT0_SAVE;
     else if (CallAttrs.caller().hasZAState() ||
              CallAttrs.caller().hasZT0State())
       ZAMarkerNode = AArch64ISD::INOUT_ZA_USE;
@@ -9576,7 +9578,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   SDValue ZTFrameIdx;
   MachineFrameInfo &MFI = MF.getFrameInfo();
-  bool ShouldPreserveZT0 = CallAttrs.requiresPreservingZT0();
+  bool ShouldPreserveZT0 =
+      !UseNewSMEABILowering && CallAttrs.requiresPreservingZT0();
 
   // If the caller has ZT0 state which will not be preserved by the callee,
   // spill ZT0 before the call.
@@ -9589,7 +9592,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   // If caller shares ZT0 but the callee is not shared ZA, we need to stop
   // PSTATE.ZA before the call if there is no lazy-save active.
-  bool DisableZA = CallAttrs.requiresDisablingZABeforeCall();
+  bool DisableZA =
+      !UseNewSMEABILowering && CallAttrs.requiresDisablingZABeforeCall();
   assert((!DisableZA || !RequiresLazySave) &&
          "Lazy-save should have PSTATE.SM=1 on entry to the function");
 
@@ -10074,7 +10078,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
         getSMToggleCondition(CallAttrs));
   }
 
-  if (RequiresLazySave || CallAttrs.requiresEnablingZAAfterCall())
+  if (!UseNewSMEABILowering &&
+      (RequiresLazySave || CallAttrs.requiresEnablingZAAfterCall()))
     // Unconditionally resume ZA.
     Result = DAG.getNode(
         AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), Result,
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 8f8f211c5fceb..2753a4561daae 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -102,6 +102,7 @@ def : Pat<(i64 (AArch64AllocateSMESaveBuffer GPR64:$size)),
 let hasSideEffects = 1, isMeta = 1 in {
   def InOutZAUsePseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
   def RequiresZASavePseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
+  def RequiresZT0SavePseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
 }
 
 def SMEStateAllocPseudo : Pseudo<(outs), (ins), []>, Sched<[]>;
@@ -122,6 +123,11 @@ def AArch64_requires_za_save
            [SDNPHasChain, SDNPInGlue]>;
 def : Pat<(AArch64_requires_za_save), (RequiresZASavePseudo)>;
 
+def AArch64_requires_zt0_save
+  : SDNode<"AArch64ISD::REQUIRES_ZT0_SAVE", SDTypeProfile<0, 0, []>,
+           [SDNPHasChain, SDNPInGlue]>;
+def : Pat<(AArch64_requires_zt0_save), (RequiresZT0SavePseudo)>;
+
 def AArch64_sme_state_alloc
   : SDNode<"AArch64ISD::SME_STATE_ALLOC", SDTypeProfile<0, 0,[]>,
            [SDNPHasChain]>;
diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
index bb4dfe8c60904..c8d20b571d702 100644
--- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
@@ -72,16 +72,30 @@ using namespace llvm;
 
 namespace {
 
-enum ZAState {
+// Note: For agnostic ZA, we assume the function is always entered/exited in the
+// "ACTIVE" state -- this _may_ not be the case (since OFF is also a
+// possibility, but for the purpose of placing ZA saves/restores, that does not
+// matter).
+enum ZAState : uint8_t {
   // Any/unknown state (not valid)
   ANY = 0,
 
   // ZA is in use and active (i.e. within the accumulator)
   ACTIVE,
 
+  // ZA is active, but ZT0 has been saved.
+  // This handles the edge case of sharedZA && !sharesZT0.
+  ACTIVE_ZT0_SAVED,
+
   // A ZA save has been set up or committed (i.e. ZA is dormant or off)
+  // If the function uses ZT0 it must also be saved.
   LOCAL_SAVED,
 
+  // ZA has been committed to the lazy save buffer of the current function.
+  // If the function uses ZT0 it must also be saved.
+  // ZA is off when a save has been committed.
+  LOCAL_COMMITTED,
+
   // The ZA/ZT0 state on entry to the function.
   ENTRY,
 
@@ -164,6 +178,14 @@ class EmitContext {
     return AgnosticZABufferPtr;
   }
 
+  int getZT0SaveSlot(MachineFunction &MF) {
+    if (ZT0SaveFI)
+      return *ZT0SaveFI;
+    MachineFrameInfo &MFI = MF.getFrameInfo();
+    ZT0SaveFI = MFI.CreateSpillStackObject(64, Align(16));
+    return *ZT0SaveFI;
+  }
+
   /// Returns true if the function must allocate a ZA save buffer on entry. This
   /// will be the case if, at any point in the function, a ZA save was emitted.
   bool needsSaveBuffer() const {
@@ -173,6 +195,7 @@ class EmitContext {
   }
 
 private:
+  std::optional<int> ZT0SaveFI;
   std::optional<int> TPIDR2BlockFI;
   Register AgnosticZABufferPtr = AArch64::NoRegister;
 };
@@ -184,8 +207,10 @@ class EmitContext {
 /// state would not be legal, as transitioning to it drops the content of ZA.
 static bool isLegalEdgeBundleZAState(ZAState State) {
   switch (State) {
-  case ZAState::ACTIVE:      // ZA state within the accumulator/ZT0.
-  case ZAState::LOCAL_SAVED: // ZA state is saved on the stack.
+  case ZAState::ACTIVE:           // ZA state within the accumulator/ZT0.
+  case ZAState::ACTIVE_ZT0_SAVED: // ZT0 is saved (ZA is active).
+  case ZAState::LOCAL_SAVED:      // ZA state may be saved on the stack.
+  case ZAState::LOCAL_COMMITTED:  // ZA state is saved on the stack.
     return true;
   default:
     return false;
@@ -199,7 +224,9 @@ StringRef getZAStateString(ZAState State) {
   switch (State) {
     MAKE_CASE(ZAState::ANY)
     MAKE_CASE(ZAState::ACTIVE)
+    MAKE_CASE(ZAState::ACTIVE_ZT0_SAVED)
     MAKE_CASE(ZAState::LOCAL_SAVED)
+    MAKE_CASE(ZAState::LOCAL_COMMITTED)
     MAKE_CASE(ZAState::ENTRY)
     MAKE_CASE(ZAState::OFF)
   default:
@@ -221,18 +248,34 @@ static bool isZAorZTRegOp(const TargetRegisterInfo &TRI,
 /// Returns the required ZA state needed before \p MI and an iterator pointing
 /// to where any code required to change the ZA state should be inserted.
 static std::pair<ZAState, MachineBasicBlock::iterator>
-getZAStateBeforeInst(const TargetRegisterInfo &TRI, MachineInstr &MI,
-                     bool ZAOffAtReturn) {
+getInstNeededZAState(const TargetRegisterInfo &TRI, MachineInstr &MI,
+                     SMEAttrs SMEFnAttrs) {
   MachineBasicBlock::iterator InsertPt(MI);
 
   if (MI.getOpcode() == AArch64::InOutZAUsePseudo)
     return {ZAState::ACTIVE, std::prev(InsertPt)};
 
+  // Note: If we need to save both ZA and ZT0 we use RequiresZASavePseudo.
   if (MI.getOpcode() == AArch64::RequiresZASavePseudo)
     return {ZAState::LOCAL_SAVED, std::prev(InsertPt)};
 
-  if (MI.isReturn())
+  // If we only need to save ZT0, there are two cases to consider:
+  //   1. The function has ZA state (that we don't need to save).
+  //      - In this case we switch to the "ACTIVE_ZT0_SAVED" state.
+  //        This only saves ZT0.
+  //   2. The function does not have ZA state
+  //      - In this case we switch to "LOCAL_COMMITTED" state.
+  //        This saves ZT0 and turns ZA off.
+  if (MI.getOpcode() == AArch64::RequiresZT0SavePseudo) {
+    return {SMEFnAttrs.hasZAState() ? ZAState::ACTIVE_ZT0_SAVED
+                                    : ZAState::LOCAL_COMMITTED,
+            std::prev(InsertPt)};
+  }
+
+  if (MI.isReturn()) {
+    bool ZAOffAtReturn = SMEFnAttrs.hasPrivateZAInterface();
     return {ZAOffAtReturn ? ZAState::OFF : ZAState::ACTIVE, InsertPt};
+  }
 
   for (auto &MO : MI.operands()) {
     if (isZAorZTRegOp(TRI, MO))
@@ -280,6 +323,9 @@ struct MachineSMEABI : public MachineFunctionPass {
   /// predecessors).
   void propagateDesiredStates(FunctionInfo &FnInfo, bool Forwards = true);
 
+  void emitZT0SaveRestore(EmitContext &, MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator MBBI, bool IsSave);
+
   // Emission routines for private and shared ZA functions (using lazy saves).
   void emitSMEPrologue(MachineBasicBlock &MBB,
                        MachineBasicBlock::iterator MBBI);
@@ -290,8 +336,8 @@ struct MachineSMEABI : public MachineFunctionPass {
                          MachineBasicBlock::iterator MBBI);
   void emitAllocateLazySaveBuffer(EmitContext &, MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MBBI);
-  void emitZAOff(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-                 bool ClearTPIDR2);
+  void emitZAMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                  bool ClearTPIDR2, bool On);
 
   // Emission routines for agnostic ZA functions.
   void emitSetupFullZASave(MachineBasicBlock &MBB,
@@ -398,7 +444,7 @@ FunctionInfo MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
       Block.FixedEntryState = ZAState::ENTRY;
     } else if (MBB.isEHPad()) {
       // EH entry block:
-      Block.FixedEntryState = ZAState::LOCAL_SAVED;
+      Block.FixedEntryState = ZAState::LOCAL_COMMITTED;
     }
 
     LiveRegUnits LiveUnits(*TRI);
@@ -420,8 +466,7 @@ FunctionInfo MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
         PhysLiveRegsAfterSMEPrologue = PhysLiveRegs;
       }
       // Note: We treat Agnostic ZA as inout_za with an alternate save/restore.
-      auto [NeededState, InsertPt] = getZAStateBeforeInst(
-          *TRI, MI, /*ZAOffAtReturn=*/SMEFnAttrs.hasPrivateZAInterface());
+      auto [NeededState, InsertPt] = getInstNeededZAState(*TRI, MI, SMEFnAttrs);
       assert((InsertPt == MBBI ||
               InsertPt->getOpcode() == AArch64::ADJCALLSTACKDOWN) &&
              "Unexpected state change insertion point!");
@@ -742,9 +787,9 @@ void MachineSMEABI::emitRestoreLazySave(EmitContext &Context,
   restorePhyRegSave(RegSave, MBB, MBBI, DL);
 }
 
-void MachineSMEABI::emitZAOff(MachineBasicBlock &MBB,
-                              MachineBasicBlock::iterator MBBI,
-                              bool ClearTPIDR2) {
+void MachineSMEABI::emitZAMode(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator MBBI,
+                               bool ClearTPIDR2, bool On) {
   DebugLoc DL = getDebugLoc(MBB, MBBI);
 
   if (ClearTPIDR2)
@@ -755,7 +800,7 @@ void MachineSMEABI::emitZAOff(MachineBasicBlock &MBB,
   // Disable ZA.
   BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSRpstatesvcrImm1))
       .addImm(AArch64SVCR::SVCRZA)
-      .addImm(0);
+      .addImm(On ? 1 : 0);
 }
 
 void MachineSMEABI::emitAllocateLazySaveBuffer(
@@ -884,6 +929,28 @@ void MachineSMEABI::emitFullZASaveRestore(EmitContext &Context,
   restorePhyRegSave(RegSave, MBB, MBBI, DL);
 }
 
+void MachineSMEABI::emitZT0SaveRestore(EmitContext &Context,
+                                       MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator MBBI,
+                                       bool IsSave) {
+  DebugLoc DL = getDebugLoc(MBB, MBBI);
+  Register ZT0Save = MRI->createVirtualRegister(&AArch64::GPR64spRegClass);
+
+  BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), ZT0Save)
+      .addFrameIndex(Context.getZT0SaveSlot(*MF))
+      .addImm(0)
+      .addImm(0);
+
+  if (IsSave) {
+    BuildMI(MBB, MBBI, DL, TII->get(AArch64::STR_TX))
+        .addReg(AArch64::ZT0)
+        .addReg(ZT0Save);
+  } else {
+    BuildMI(MBB, MBBI, DL, TII->get(AArch64::LDR_TX), AArch64::ZT0)
+        .addReg(ZT0Save);
+  }
+}
+
 void MachineSMEABI::emitAllocateFullZASaveBuffer(
     EmitContext &Context, MachineBasicBlock &MBB,
     MachineBasicBlock::iterator MBBI, LiveRegs PhysLiveRegs) {
@@ -928,6 +995,17 @@ void MachineSMEABI::emitAllocateFullZASaveBuffer(
   restorePhyRegSave(RegSave, MBB, MBBI, DL);
 }
 
+struct FromState {
+  ZAState From;
+
+  constexpr uint8_t to(ZAState To) const {
+    static_assert(NUM_ZA_STATE < 16, "expected ZAState to fit in 4-bits");
+    return uint8_t(From) << 4 | uint8_t(To);
+  }
+};
+
+constexpr FromState transitionFrom(ZAState From) { return FromState{From}; }
+
 void MachineSMEABI::emitStateChange(EmitContext &Context,
                                     MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator InsertPt,
@@ -959,17 +1037,63 @@ void MachineSMEABI::emitStateChange(EmitContext &Context,
     From = ZAState::ACTIVE;
   }
 
-  if (From == ZAState::ACTIVE && To == ZAState::LOCAL_SAVED)
-    emitZASave(Context, MBB, InsertPt, PhysLiveRegs);
-  else if (From == ZAState::LOCAL_SAVED && To == ZAState::ACTIVE)
-    emitZARestore(Context, MBB, InsertPt, PhysLiveRegs);
-  else if (To == ZAState::OFF) {
-    assert(From != ZAState::ENTRY &&
-           "ENTRY to OFF should have already been handled");
-    assert(!SMEFnAttrs.hasAgnosticZAInterface() &&
-           "Should not turn ZA off in agnostic ZA function");
-    emitZAOff(MBB, InsertPt, /*ClearTPIDR2=*/From == ZAState::LOCAL_SAVED);
-  } else {
+  bool IsAgnosticZA = SMEFnAttrs.hasAgnosticZAInterface();
+  bool HasZT0State = SMEFnAttrs.hasZT0State();
+  bool HasZAState = IsAgnosticZA || SMEFnAttrs.hasZAState();
+
+  switch (transitionFrom(From).to(To)) {
+  // This section handles: ACTIVE <-> ACTIVE_ZT0_SAVED
+  case transitionFrom(ZAState::ACTIVE).to(ZAState::ACTIVE_ZT0_SAVED):
+    emitZT0SaveRestore(Context, MBB, InsertPt, /*IsSave=*/true);
+    break;
+  case transitionFrom(ZAState::ACTIVE_ZT0_SAVED).to(ZAState::ACTIVE):
+    emitZT0SaveRestore(Context, MBB, InsertPt, /*IsSave=*/false);
+    break;
+
+  // This section handles: ACTIVE -> LOCAL_SAVED
+  case transitionFrom(ZAState::ACTIVE).to(ZAState::LOCAL_SAVED):
+    if (HasZT0State)
+      emitZT0SaveRestore(Context, MBB, InsertPt, /*IsSave=*/true);
+    if (HasZAState)
+      emitZASave(Context, MBB, InsertPt, PhysLiveRegs);
+    break;
+
+  // This section handles: ACTIVE -> LOCAL_COMMITTED
+  case transitionFrom(ZAState::ACTIVE).to(ZAState::LOCAL_COMMITTED):
+    // Note: We could support ZA state here, but this transition is currently
+    // only possible when we _don't_ have ZA state.
+    assert(HasZT0State && !HasZAState && "Expect to only have ZT0 state.");
+    emitZT0SaveRestore(Context, MBB, InsertPt, /*IsSave=*/true);
+    emitZAMode(MBB, InsertPt, /*ClearTPIDR2=*/false, /*On=*/false);
+    break;
+
+  // This section handles: LOCAL_COMMITTED -> (OFF|LOCAL_SAVED)
+  case transitionFrom(ZAState::LOCAL_COMMITTED).to(ZAState::OFF):
+  case transitionFrom(ZAState::LOCAL_COMMITTED).to(ZAState::LOCAL_SAVED):
+    // These transitions are no-ops.
+    break;
+
+  // This section handles: LOCAL_(SAVED|COMMITTED) -> ACTIVE[_ZT0_SAVED]
+  case transitionFrom(ZAState::LOCAL_COMMITTED).to(ZAState::ACTIVE):
+  case transitionFrom(ZAState::LOCAL_COMMITTED).to(ZAState::ACTIVE_ZT0_SAVED):
+  case transitionFrom(ZAState::LOCAL_SAVED).to(ZAState::ACTIVE):
+    if (HasZAState)
+      emitZARestore(Context, MBB, InsertPt, PhysLiveRegs);
+    else
+      emitZAMode(MBB, InsertPt, /*ClearTPIDR2=*/false, /*On=*/true);
+    if (HasZT0State && To == ZAState::ACTIVE)
+      emitZT0SaveRestore(Context, MBB, InsertPt, /*IsSave=*/false);
+    break;
+  default:
+    if (To == ZAState::OFF) {
+      assert(From != ZAState::ENTRY &&
+             "ENTRY to OFF should have already been handled");
+      assert(!AFI->getSMEFnAttrs().hasAgnosticZAInterface() &&
+             "Should not turn ZA off in agnostic ZA function");
+      emitZAMode(MBB, InsertPt, /*ClearTPIDR2=*/From == ZAState::LOCAL_SAVED,
+                 /*On=*/false);
+      break;
+    }
     dbgs() << "Error: Transition from " << getZAStateString(From) << " to "
            << getZAStateString(To) << '\n';
     llvm_unreachable("Unimplemented state transition");
diff --git a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
index ced0d41c22dab..f4a3b55e49cd7 100644
--- a/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
+++ b/llvm/test/CodeGen/AArch64/sme-peephole-opts.ll
@@ -230,10 +230,6 @@ define void @test7() nounwind "aarch64_inout_zt0" {
 ; CHECK-NEXT:    str zt0, [x19]
 ; CHECK-NEXT:    smstop za
 ; CHECK-NEXT:    bl callee
-; CHECK-NEXT:    smstart za
-; CHECK-NEXT:    ldr zt0, [x19]
-; CHECK-NEXT:    str zt0, [x19]
-; CHECK-NEXT:    smstop za
 ; CHECK-NEXT:    bl callee
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    ldr zt0, [x19]
diff --git a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
index dcdc56c669077..f219b1169af01 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
@@ -511,7 +511,6 @@ exit:
 ;
 ; This code may require reloading ZT0 in the cleanup for ~ZT0Resource().
 ;
-; FIXME: Codegen with `-aarch64-new-sme-abi` is broken with ZT0 (as it is not implemented).
define void @try_catch_shared_zt0_callee() "aarch64_inout_zt0" personality ptr @__gxx_personality_v0 {
 ; CHECK-LABEL: try_catch_shared_zt0_callee:
 ; CHECK:       .Lfunc_begin3:
@@ -519,52 +518,37 @@ define void @try_catch_shared_zt0_callee() "aarch64_inout_zt0" personality ptr @
 ; CHECK-NEXT:    .cfi_personality 156, DW.ref.__gxx_personality_v0
 ; CHECK-NEXT:    .cfi_lsda 28, .Lexception3
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT:    mov x29, sp
-; CHECK-NEXT:    sub sp, sp, #80
-; CHECK-NEXT:    .cfi_def_cfa w29, 32
+; CHECK-NEXT:    sub sp, sp, #96
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 96
 ; CHECK-NEXT:    .cfi_offset w19, -8
 ; CHECK-NEXT:    .cfi_offset w20, -16
-; CHECK-NEXT:    .cfi_offset w30, -24
-; CHECK-NEXT:    .cfi_offset w29, -32
-; CHECK-NEXT:    rdsvl x8, #1
-; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    msub x9, x8, x8, x9
-; CHECK-NEXT:    mov sp, x9
-; CHECK-NEXT:    stp x9, x8, [x29, #-80]
+; CHECK-NEXT:    .cfi_offset w30, -32
 ; CHECK-NEXT:  .Ltmp9: // EH_LABEL
-; CHECK-NEXT:    sub x19, x29, #64
+; CHECK-NEXT:    mov x19, sp
 ; CHECK-NEXT:    str zt0, [x19]
 ; CHECK-NEXT:    smstop za
 ; CHECK-NEXT:    bl may_throw
+; CHECK-NEXT:  .Ltmp10: // EH_LABEL
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    ldr zt0, [x19]
-; CHECK-NEXT:  .Ltmp10: // EH_LABEL
 ; CHECK-NEXT:  // %bb.1: // %return_normally
-; CHECK-NEXT:    mov sp, x29
-; CHECK-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #96
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB3_2: // %unwind_dtors
 ; CHECK-NEXT:  .Ltmp11: // EH_LABEL
-; CHECK-NEXT:    sub x20, x29, #64
+; CHECK-NEXT:    mov x20, sp
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    smstart za
-; CHECK-NEXT:    mrs x8, TPIDR2_EL0
-; CHECK-NEXT:    sub x0, x29, #80
-; CHECK-NEXT:    cbnz x8, .LBB3_4
-; CHECK-NEXT:  // %bb.3: // %unwind_dtors
-; CHECK-NEXT:    bl __arm_tpidr2_restore
-; CHECK-NEXT:  .LBB3_4: // %unwind_dtors
-; CHECK-NEXT:    msr TPIDR2_EL0, xzr
+; CHECK-NEXT:    ldr zt0, [x20]
 ; CHECK-NEXT:    bl shared_zt0_call
 ; CHECK-NEXT:    str zt0, [x20]
 ; CHECK-NEXT:    smstop za
 ; CHECK-NEXT:    mov x0, x19
 ; CHECK-NEXT:    bl _Unwind_Resume
-; CHECK-NEXT:    smstart za
-; CHECK-NEXT:    ldr zt0, [x20]
 ;
 ; CHECK-SDAG-LABEL: try_catch_shared_zt0_callee:
 ; CHECK-SDAG:       .Lfunc_begin3:
@@ -965,6 +949,90 @@ exit:
   ret void
 }
 
+define void @try_catch_inout_zt0() "aarch64_inout_zt0" personality ptr @__gxx_personality_v0 {
+; CHECK-LABEL: try_catch_inout_zt0:
+; CHECK:       .Lfunc_begin7:
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:    .cfi_personality 156, DW.ref.__gxx_personality_v0
+; CHECK-NEXT:    .cfi_lsda 28, .Lexception7
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    sub sp, sp, #80
+; CHECK-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-NEXT:    .cfi_offset w19, -8
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:  .Ltmp21: // EH_LABEL
+; CHECK-NEXT:    mov x19, sp
+; CHECK-NEXT:    str zt0, [x19]
+; CHECK-NEXT:    smstop za
+; CHECK-NEXT:    bl may_throw
+; CHECK-NEXT:  .Ltmp22: // EH_LABEL
+; CHECK-NEXT:  .LBB7_1: // %exit
+; CHECK-NEXT:    smstart za
+; CHECK-NEXT:    ldr zt0, [x19]
+; CHECK-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #80
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB7_2: // %catch
+; CHECK-NEXT:  .Ltmp23: // EH_LABEL
+; CHECK-NEXT:    bl __cxa_begin_catch
+; CHECK-NEXT:    bl __cxa_end_catch
+; CHECK-NEXT:    b .LBB7_1
+;
+; CHECK-SDAG-LABEL: try_catch_inout_zt0:
+; CHECK-SDAG:       .Lfunc_begin7:
+; CHECK-SDAG-NEXT:    .cfi_startproc
+; CHECK-SDAG-NEXT:    .cfi_personality 156, DW.ref.__gxx_personality_v0
+; CHECK-SDAG-NEXT:    .cfi_lsda 28, .Lexception7
+; CHECK-SDAG-NEXT:  // %bb.0: // %entry
+; CHECK-SDAG-NEXT:    sub sp, sp, #80
+; CHECK-SDAG-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-SDAG-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-SDAG-NEXT:    .cfi_offset w19, -8
+; CHECK-SDAG-NEXT:    .cfi_offset w30, -16
+; CHECK-SDAG-NEXT:  .Ltmp21: // EH_LABEL
+; CHECK-SDAG-NEXT:    mov x19, sp
+; CHECK-SDAG-NEXT:    str zt0, [x19]
+; CHECK-SDAG-NEXT:    smstop za
+; CHECK-SDAG-NEXT:    bl may_throw
+; CHECK-SDAG-NEXT:    smstart za
+; CHECK-SDAG-NEXT:    ldr zt0, [x19]
+; CHECK-SDAG-NEXT:  .Ltmp22: // EH_LABEL
+; CHECK-SDAG-NEXT:  .LBB7_1: // %exit
+; CHECK-SDAG-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-SDAG-NEXT:    add sp, sp, #80
+; CHECK-SDAG-NEXT:    ret
+; CHECK-SDAG-NEXT:  .LBB7_2: // %catch
+; CHECK-SDAG-NEXT:  .Ltmp23: // EH_LABEL
+; CHECK-SDAG-NEXT:    smstart za
+; CHECK-SDAG-NEXT:    ldr zt0, [x19]
+; CHECK-SDAG-NEXT:    str zt0, [x19]
+; CHECK-SDAG-NEXT:    smstop za
+; CHECK-SDAG-NEXT:    bl __cxa_begin_catch
+; CHECK-SDAG-NEXT:    smstart za
+; CHECK-SDAG-NEXT:    ldr zt0, [x19]
+; CHECK-SDAG-NEXT:    str zt0, [x19]
+; CHECK-SDAG-NEXT:    smstop za
+; CHECK-SDAG-NEXT:    bl __cxa_end_catch
+; CHECK-SDAG-NEXT:    smstart za
+; CHECK-SDAG-NEXT:    ldr zt0, [x19]
+; CHECK-SDAG-NEXT:    b .LBB7_1
+entry:
+  invoke void @may_throw()
+          to label %exit unwind label %catch
+
+catch:
+  %eh_info = landingpad { ptr, i32 }
+          catch ptr null
+  %exception_ptr = extractvalue { ptr, i32 } %eh_info, 0
+  tail call ptr @__cxa_begin_catch(ptr %exception_ptr)
+  tail call void @__cxa_end_catch()
+  br label %exit
+
+exit:
+  ret void
+}
+
 declare ptr @__cxa_allocate_exception(i64)
 declare void @__cxa_throw(ptr, ptr, ptr)
 declare ptr @__cxa_begin_catch(ptr)
diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
index 4c48e41294a3a..e8f4f6ed78b9c 100644
--- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
+++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
@@ -193,7 +193,7 @@ define void @zt0_new_caller_zt0_new_callee(ptr %callee) "aarch64_new_zt0" nounwi
 ; CHECK-NEWLOWERING-LABEL: zt0_new_caller_zt0_new_callee:
 ; CHECK-NEWLOWERING:       // %bb.0:
 ; CHECK-NEWLOWERING-NEXT:    sub sp, sp, #80
-; CHECK-NEWLOWERING-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
 ; CHECK-NEWLOWERING-NEXT:    mrs x8, TPIDR2_EL0
 ; CHECK-NEWLOWERING-NEXT:    cbz x8, .LBB6_2
 ; CHECK-NEWLOWERING-NEXT:  // %bb.1:
@@ -202,14 +202,11 @@ define void @zt0_new_caller_zt0_new_callee(ptr %callee) "aarch64_new_zt0" nounwi
 ; CHECK-NEWLOWERING-NEXT:    zero { zt0 }
 ; CHECK-NEWLOWERING-NEXT:  .LBB6_2:
 ; CHECK-NEWLOWERING-NEXT:    smstart za
-; CHECK-NEWLOWERING-NEXT:    mov x19, sp
-; CHECK-NEWLOWERING-NEXT:    str zt0, [x19]
+; CHECK-NEWLOWERING-NEXT:    mov x8, sp
+; CHECK-NEWLOWERING-NEXT:    str zt0, [x8]
 ; CHECK-NEWLOWERING-NEXT:    smstop za
 ; CHECK-NEWLOWERING-NEXT:    blr x0
-; CHECK-NEWLOWERING-NEXT:    smstart za
-; CHECK-NEWLOWERING-NEXT:    ldr zt0, [x19]
-; CHECK-NEWLOWERING-NEXT:    smstop za
-; CHECK-NEWLOWERING-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEWLOWERING-NEXT:    add sp, sp, #80
 ; CHECK-NEWLOWERING-NEXT:    ret
   call void %callee() "aarch64_new_zt0";
@@ -246,7 +243,7 @@ define i64 @zt0_new_caller_abi_routine_callee() "aarch64_new_zt0" nounwind {
 ; CHECK-NEWLOWERING-LABEL: zt0_new_caller_abi_routine_callee:
 ; CHECK-NEWLOWERING:       // %bb.0:
 ; CHECK-NEWLOWERING-NEXT:    sub sp, sp, #80
-; CHECK-NEWLOWERING-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
 ; CHECK-NEWLOWERING-NEXT:    mrs x8, TPIDR2_EL0
 ; CHECK-NEWLOWERING-NEXT:    cbz x8, .LBB7_2
 ; CHECK-NEWLOWERING-NEXT:  // %bb.1:
@@ -255,12 +252,11 @@ define i64 @zt0_new_caller_abi_routine_callee() "aarch64_new_zt0" nounwind {
 ; CHECK-NEWLOWERING-NEXT:    zero { zt0 }
 ; CHECK-NEWLOWERING-NEXT:  .LBB7_2:
 ; CHECK-NEWLOWERING-NEXT:    smstart za
-; CHECK-NEWLOWERING-NEXT:    mov x19, sp
-; CHECK-NEWLOWERING-NEXT:    str zt0, [x19]
-; CHECK-NEWLOWERING-NEXT:    bl __arm_sme_state
-; CHECK-NEWLOWERING-NEXT:    ldr zt0, [x19]
+; CHECK-NEWLOWERING-NEXT:    mov x8, sp
+; CHECK-NEWLOWERING-NEXT:    str zt0, [x8]
 ; CHECK-NEWLOWERING-NEXT:    smstop za
-; CHECK-NEWLOWERING-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT:    bl __arm_sme_state
+; CHECK-NEWLOWERING-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
 ; CHECK-NEWLOWERING-NEXT:    add sp, sp, #80
 ; CHECK-NEWLOWERING-NEXT:    ret
   %res = call {i64, i64} @__arm_sme_state()
@@ -382,37 +378,57 @@ define void @shared_za_new_zt0(ptr %callee) "aarch64_inout_za" "aarch64_new_zt0"
 
 
define void @zt0_multiple_private_za_calls(ptr %callee) "aarch64_in_zt0" nounwind {
-; CHECK-COMMON-LABEL: zt0_multiple_private_za_calls:
-; CHECK-COMMON:       // %bb.0:
-; CHECK-COMMON-NEXT:    sub sp, sp, #96
-; CHECK-COMMON-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
-; CHECK-COMMON-NEXT:    mov x20, sp
-; CHECK-COMMON-NEXT:    mov x19, x0
-; CHECK-COMMON-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
-; CHECK-COMMON-NEXT:    str zt0, [x20]
-; CHECK-COMMON-NEXT:    smstop za
-; CHECK-COMMON-NEXT:    blr x0
-; CHECK-COMMON-NEXT:    smstart za
-; CHECK-COMMON-NEXT:    ldr zt0, [x20]
-; CHECK-COMMON-NEXT:    str zt0, [x20]
-; CHECK-COMMON-NEXT:    smstop za
-; CHECK-COMMON-NEXT:    blr x19
-; CHECK-COMMON-NEXT:    smstart za
-; CHECK-COMMON-NEXT:    ldr zt0, [x20]
-; CHECK-COMMON-NEXT:    str zt0, [x20]
-; CHECK-COMMON-NEXT:    smstop za
-; CHECK-COMMON-NEXT:    blr x19
-; CHECK-COMMON-NEXT:    smstart za
-; CHECK-COMMON-NEXT:    ldr zt0, [x20]
-; CHECK-COMMON-NEXT:    str zt0, [x20]
-; CHECK-COMMON-NEXT:    smstop za
-; CHECK-COMMON-NEXT:    blr x19
-; CHECK-COMMON-NEXT:    smstart za
-; CHECK-COMMON-NEXT:    ldr zt0, [x20]
-; CHECK-COMMON-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
-; CHECK-COMMON-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
-; CHECK-COMMON-NEXT:    add sp, sp, #96
-; CHECK-COMMON-NEXT:    ret
+; CHECK-LABEL: zt0_multiple_private_za_calls:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #96
+; CHECK-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEXT:    mov x20, sp
+; CHECK-NEXT:    mov x19, x0
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    str zt0, [x20]
+; CHECK-NEXT:    smstop za
+; CHECK-NEXT:    blr x0
+; CHECK-NEXT:    smstart za
+; CHECK-NEXT:    ldr zt0, [x20]
+; CHECK-NEXT:    str zt0, [x20]
+; CHECK-NEXT:    smstop za
+; CHECK-NEXT:    blr x19
+; CHECK-NEXT:    smstart za
+; CHECK-NEXT:    ldr zt0, [x20]
+; CHECK-NEXT:    str zt0, [x20]
+; CHECK-NEXT:    smstop za
+; CHECK-NEXT:    blr x19
+; CHECK-NEXT:    smstart za
+; CHECK-NEXT:    ldr zt0, [x20]
+; CHECK-NEXT:    str zt0, [x20]
+; CHECK-NEXT:    smstop za
+; CHECK-NEXT:    blr x19
+; CHECK-NEXT:    smstart za
+; CHECK-NEXT:    ldr zt0, [x20]
+; CHECK-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #96
+; CHECK-NEXT:    ret
+;
+; CHECK-NEWLOWERING-LABEL: zt0_multiple_private_za_calls:
+; CHECK-NEWLOWERING:       // %bb.0:
+; CHECK-NEWLOWERING-NEXT:    sub sp, sp, #96
+; CHECK-NEWLOWERING-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT:    mov x20, sp
+; CHECK-NEWLOWERING-NEXT:    mov x19, x0
+; CHECK-NEWLOWERING-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEWLOWERING-NEXT:    str zt0, [x20]
+; CHECK-NEWLOWERING-NEXT:    smstop za
+; CHECK-NEWLOWERING-NEXT:    blr x0
+; CHECK-NEWLOWERING-NEXT:    blr x19
+; CHECK-NEWLOWERING-NEXT:    blr x19
+; CHECK-NEWLOWERING-NEXT:    blr x19
+; CHECK-NEWLOWERING-NEXT:    smstart za
+; CHECK-NEWLOWERING-NEXT:    ldr zt0, [x20]
+; CHECK-NEWLOWERING-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEWLOWERING-NEXT:    add sp, sp, #96
+; CHECK-NEWLOWERING-NEXT:    ret
   call void %callee()
   call void %callee()
   call void %callee()
