https://github.com/MacDue updated 
https://github.com/llvm/llvm-project/pull/149510

>From c2d34149b2860cadf03824cc35a724775aaf60f8 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxw...@arm.com>
Date: Tue, 15 Jul 2025 17:00:04 +0000
Subject: [PATCH] [AArch64][SME] Propagate desired ZA states in the
 MachineSMEABIPass

This patch adds a propagation step to the MachineSMEABIPass that
propagates desired ZA states forwards (from predecessors to successors).

The aim of this is to pick better ZA states for edge bundles, as when
many (or all) blocks in a bundle do not have a preferred ZA state, the
ZA state assigned to a bundle can be less than ideal.

An important case is nested loops, where only the inner loop has a
preferred ZA state. Here we'd like to propagate the ZA state up from the
inner loop to the outer loops (to avoid saves/restores in any loop).

Change-Id: I39f9c7d7608e2fa070be2fb88351b4d1d0079041
---
 llvm/lib/Target/AArch64/MachineSMEABIPass.cpp |  85 ++++-
 .../sme-za-function-with-many-blocks.ll       | 296 ++++++++++++++++++
 2 files changed, 364 insertions(+), 17 deletions(-)
 create mode 100644 
llvm/test/CodeGen/AArch64/sme-za-function-with-many-blocks.ll

diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp 
b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
index 7f3bb42e5a08e..4bf11a7e9da2c 100644
--- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
@@ -138,6 +138,7 @@ struct MachineSMEABI : public MachineFunctionPass {
   }
 
   void collectNeededZAStates(MachineFunction &MF, SMEAttrs);
+  void propagateDesiredStates(MachineFunction &MF);
   void pickBundleZAStates(MachineFunction &MF);
   void insertStateChanges(MachineFunction &MF, bool IsAgnosticZA);
 
@@ -202,8 +203,10 @@ struct MachineSMEABI : public MachineFunctionPass {
   };
 
   struct BlockInfo {
-    ZAState FixedEntryState{ZAState::ANY};
     SmallVector<InstInfo> Insts;
+    ZAState FixedEntryState{ZAState::ANY};
+    ZAState DesiredIncomingState{ZAState::ANY};
+    ZAState DesiredOutgoingState{ZAState::ANY};
     LiveRegs PhysLiveRegsAtEntry = LiveRegs::None;
     LiveRegs PhysLiveRegsAtExit = LiveRegs::None;
   };
@@ -294,28 +297,74 @@ void MachineSMEABI::collectNeededZAStates(MachineFunction 
&MF,
 
     // Reverse vector (as we had to iterate backwards for liveness).
     std::reverse(Block.Insts.begin(), Block.Insts.end());
+
+    // Record the desired states on entry/exit of this block. These are the
+    // states that would not incur a state transition.
+    if (!Block.Insts.empty()) {
+      Block.DesiredIncomingState = Block.Insts.front().NeededState;
+      Block.DesiredOutgoingState = Block.Insts.back().NeededState;
+    }
+  }
+}
+
+void MachineSMEABI::propagateDesiredStates(MachineFunction &MF) {
+  // This propagates desired states from predecessors to successors. This
+  // propagates state up loop nests (as an inner loop is a predecessor
+  // to its outer loops).
+  SmallVector<MachineBasicBlock *> Worklist;
+  for (auto [BlockID, BlockInfo] : enumerate(State.Blocks)) {
+    if (!isLegalEdgeBundleZAState(BlockInfo.DesiredIncomingState))
+      Worklist.push_back(MF.getBlockNumbered(BlockID));
+  }
+
+  while (!Worklist.empty()) {
+    MachineBasicBlock *MBB = Worklist.pop_back_val();
+    auto &BlockInfo = State.Blocks[MBB->getNumber()];
+
+    // Pick a legal edge bundle state that matches the majority of predecessors.
+    int PredStateCounts[ZAState::NUM_ZA_STATE] = {0};
+    for (MachineBasicBlock *Pred : predecessors(MBB)) {
+      auto &PredBlockInfo = State.Blocks[Pred->getNumber()];
+      if (isLegalEdgeBundleZAState(PredBlockInfo.DesiredOutgoingState))
+        PredStateCounts[PredBlockInfo.DesiredOutgoingState]++;
+    }
+    ZAState PropagatedState =
+        ZAState(max_element(PredStateCounts) - PredStateCounts);
+
+    if (PropagatedState != BlockInfo.DesiredIncomingState) {
+      BlockInfo.DesiredIncomingState = PropagatedState;
+      // Propagate to outgoing state for blocks that don't care about their
+      // ZA state.
+      if (BlockInfo.DesiredOutgoingState == ZAState::ANY)
+        BlockInfo.DesiredOutgoingState = PropagatedState;
+
+      // Push any successors that may need updating to the worklist.
+      for (MachineBasicBlock *Succ : successors(MBB)) {
+        auto &SuccBlockInfo = State.Blocks[Succ->getNumber()];
+        if (!isLegalEdgeBundleZAState(SuccBlockInfo.DesiredIncomingState))
+          Worklist.push_back(Succ);
+      }
+    }
   }
 }
 
 void MachineSMEABI::pickBundleZAStates(MachineFunction &MF) {
   State.BundleStates.resize(Bundles->getNumBundles());
+
+  if (OptLevel != CodeGenOptLevel::None)
+    propagateDesiredStates(MF);
+
   for (unsigned I = 0, E = Bundles->getNumBundles(); I != E; ++I) {
     LLVM_DEBUG(dbgs() << "Picking ZA state for edge bundle: " << I << '\n');
 
     // Attempt to pick a ZA state for this bundle that minimizes state
     // transitions. Edges within loops are given a higher weight as we assume
     // they will be executed more than once.
-    // TODO: We should propagate desired incoming/outgoing states through 
blocks
-    // that have the "ANY" state first to make better global decisions.
     int EdgeStateCounts[ZAState::NUM_ZA_STATE] = {0};
     for (unsigned BlockID : Bundles->getBlocks(I)) {
       LLVM_DEBUG(dbgs() << "- bb." << BlockID);
 
       BlockInfo &Block = State.Blocks[BlockID];
-      if (Block.Insts.empty()) {
-        LLVM_DEBUG(dbgs() << " (no state preference)\n");
-        continue;
-      }
       bool IsLoop = MLI && MLI->getLoopFor(MF.getBlockNumbered(BlockID));
       bool InEdge = Bundles->getBundle(BlockID, /*Out=*/false) == I;
       bool OutEdge = Bundles->getBundle(BlockID, /*Out=*/true) == I;
@@ -324,26 +373,28 @@ void MachineSMEABI::pickBundleZAStates(MachineFunction 
&MF) {
         LLVM_DEBUG(dbgs() << " IsLoop");
 
       LLVM_DEBUG(dbgs() << " (EdgeWeight: " << EdgeWeight << ')');
-      ZAState DesiredIncomingState = Block.Insts.front().NeededState;
-      if (InEdge && isLegalEdgeBundleZAState(DesiredIncomingState)) {
-        EdgeStateCounts[DesiredIncomingState] += EdgeWeight;
+      bool LegalInEdge =
+          InEdge && isLegalEdgeBundleZAState(Block.DesiredIncomingState);
+      bool LegalOutEgde =
+          OutEdge && isLegalEdgeBundleZAState(Block.DesiredOutgoingState);
+      if (LegalInEdge) {
         LLVM_DEBUG(dbgs() << " DesiredIncomingState: "
-                          << getZAStateString(DesiredIncomingState));
+                          << getZAStateString(Block.DesiredIncomingState));
+        EdgeStateCounts[Block.DesiredIncomingState] += EdgeWeight;
       }
-      ZAState DesiredOutgoingState = Block.Insts.back().NeededState;
-      if (OutEdge && isLegalEdgeBundleZAState(DesiredOutgoingState)) {
-        EdgeStateCounts[DesiredOutgoingState] += EdgeWeight;
+      if (LegalOutEgde) {
         LLVM_DEBUG(dbgs() << " DesiredOutgoingState: "
-                          << getZAStateString(DesiredOutgoingState));
+                          << getZAStateString(Block.DesiredOutgoingState));
+        EdgeStateCounts[Block.DesiredOutgoingState] += EdgeWeight;
       }
+      if (!LegalInEdge && !LegalOutEgde)
+        LLVM_DEBUG(dbgs() << " (no state preference)");
       LLVM_DEBUG(dbgs() << '\n');
     }
 
     ZAState BundleState =
         ZAState(max_element(EdgeStateCounts) - EdgeStateCounts);
 
-    // Force ZA to be active in bundles that don't have a preferred state.
-    // TODO: Something better here (to avoid extra mode switches).
     if (BundleState == ZAState::ANY)
       BundleState = ZAState::ACTIVE;
 
diff --git a/llvm/test/CodeGen/AArch64/sme-za-function-with-many-blocks.ll 
b/llvm/test/CodeGen/AArch64/sme-za-function-with-many-blocks.ll
new file mode 100644
index 0000000000000..0306b27cb17e1
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-za-function-with-many-blocks.ll
@@ -0,0 +1,296 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -aarch64-new-sme-abi < %s | FileCheck %s
+
+; This test case was generated by lowering mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/matmul.mlir to LLVM IR.
+; The actual contents of the function are not that important. The main interesting quality here is that many blocks
+; don't directly use ZA. The only blocks that require ZA are the MOPA (and load/stores) in the inner loop, and the
+; `printMemrefF32()` call in the exit block.
+;
+; If ZA states are not propagated in the MachineSMEABIPass, block %48 (which is within the outer loop) will
+; have an edge to block %226 (the exit block), which requires ZA in the "saved" state, and an edge to block %51
+; (which has no preference on ZA state). This means block %48 will also end up in the locally saved state.
+; This is not really what we want, as it means we will save/restore ZA in the outer loop. We can fix this by
+; propagating the "active" state from the inner loop through basic blocks with no preference, to ensure the outer
+; loop is in the "active" state too.
+;
+; If done correctly, the only ZA save/restore should be in the exit block (with all other blocks in the active state).
+
+define void @matmul(ptr %0, ptr %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6, 
ptr %7, ptr %8, i64 %9, i64 %10, i64 %11, i64 %12, i64 %13, ptr %14, ptr %15, 
i64 %16, i64 %17, i64 %18, i64 %19, i64 %20) #0 {
+; Check for a ZA zero in the entry block, then no uses of TPIDR2_EL0 (for ZA saves/restores)
+; until the exit block (which contains the call to printMemrefF32).
+;
+; CHECK-LABEL: matmul:
+; CHECK:      zero {za}
+; CHECK-NOT:  TPIDR2_EL0
+; CHECK:      msr TPIDR2_EL0, x{{.*}}
+; CHECK-NOT:  .LBB{{.*}}
+; CHECK:      bl printMemrefF32
+  %22 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } poison, ptr %14, 0
+  %23 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %22, ptr %15, 1
+  %24 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %23, i64 %16, 2
+  %25 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %24, i64 %17, 3, 0
+  %26 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %25, i64 %19, 4, 0
+  %27 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %26, i64 %18, 3, 1
+  %28 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %27, i64 %20, 4, 1
+  %29 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } poison, ptr %7, 0
+  %30 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %29, ptr %8, 1
+  %31 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %30, i64 %9, 2
+  %32 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %31, i64 %10, 3, 0
+  %33 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %32, i64 %12, 4, 0
+  %34 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %33, i64 %11, 3, 1
+  %35 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %34, i64 %13, 4, 1
+  %36 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } poison, ptr %0, 0
+  %37 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %36, ptr %1, 1
+  %38 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %37, i64 %2, 2
+  %39 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %38, i64 %3, 3, 0
+  %40 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %39, i64 %5, 4, 0
+  %41 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %40, i64 %4, 3, 1
+  %42 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %41, i64 %6, 4, 1
+  %43 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 3, 0
+  %44 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 3, 1
+  %45 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %35, 3, 1
+  %46 = call i64 @llvm.vscale.i64()
+  %47 = mul i64 %46, 4
+  br label %48
+
+48:                                               ; preds = %224, %21
+  %49 = phi i64 [ %225, %224 ], [ 0, %21 ]
+  %50 = icmp slt i64 %49, %43
+  br i1 %50, label %51, label %226
+
+51:                                               ; preds = %48
+  %52 = sub i64 %43, %49
+  %53 = call i64 @llvm.smin.i64(i64 %47, i64 %52)
+  %54 = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+  %55 = trunc i64 %53 to i32
+  %56 = insertelement <vscale x 4 x i32> poison, i32 %55, i32 0
+  %57 = shufflevector <vscale x 4 x i32> %56, <vscale x 4 x i32> poison, 
<vscale x 4 x i32> zeroinitializer
+  %58 = icmp slt <vscale x 4 x i32> %54, %57
+  br label %59
+
+59:                                               ; preds = %222, %51
+  %60 = phi i64 [ %223, %222 ], [ 0, %51 ]
+  %61 = icmp slt i64 %60, %45
+  br i1 %61, label %62, label %224
+
+62:                                               ; preds = %59
+  %63 = sub i64 %45, %60
+  %64 = call i64 @llvm.smin.i64(i64 %47, i64 %63)
+  %65 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %28, 0
+  %66 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %28, 1
+  %67 = insertvalue { ptr, ptr, i64 } poison, ptr %65, 0
+  %68 = insertvalue { ptr, ptr, i64 } %67, ptr %66, 1
+  %69 = insertvalue { ptr, ptr, i64 } %68, i64 0, 2
+  %70 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %28, 2
+  %71 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %28, 3, 0
+  %72 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %28, 3, 1
+  %73 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %28, 4, 0
+  %74 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %28, 4, 1
+  %75 = mul nsw i64 %49, %73
+  %76 = add i64 %70, %75
+  %77 = mul nsw i64 %60, %74
+  %78 = add i64 %76, %77
+  %79 = extractvalue { ptr, ptr, i64 } %69, 0
+  %80 = extractvalue { ptr, ptr, i64 } %69, 1
+  %81 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } poison, ptr %79, 0
+  %82 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %81, ptr %80, 1
+  %83 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %82, i64 %78, 2
+  %84 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %83, i64 %53, 3, 0
+  %85 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %84, i64 %73, 4, 0
+  %86 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %85, i64 %64, 3, 1
+  %87 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %86, i64 %74, 4, 1
+  %88 = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+  %89 = trunc i64 %64 to i32
+  %90 = insertelement <vscale x 4 x i32> poison, i32 %89, i32 0
+  %91 = shufflevector <vscale x 4 x i32> %90, <vscale x 4 x i32> poison, 
<vscale x 4 x i32> zeroinitializer
+  %92 = icmp slt <vscale x 4 x i32> %88, %91
+  br label %93
+
+93:                                               ; preds = %220, %62
+  %94 = phi i64 [ %221, %220 ], [ 0, %62 ]
+  %95 = icmp slt i64 %94, %44
+  br i1 %95, label %96, label %222
+
+96:                                               ; preds = %93
+  %97 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 0
+  %98 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 1
+  %99 = insertvalue { ptr, ptr, i64 } poison, ptr %97, 0
+  %100 = insertvalue { ptr, ptr, i64 } %99, ptr %98, 1
+  %101 = insertvalue { ptr, ptr, i64 } %100, i64 0, 2
+  %102 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 2
+  %103 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 3, 0
+  %104 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 3, 1
+  %105 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 4, 0
+  %106 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %42, 4, 1
+  %107 = mul nsw i64 %49, %105
+  %108 = add i64 %102, %107
+  %109 = mul nsw i64 %94, %106
+  %110 = add i64 %108, %109
+  %111 = extractvalue { ptr, ptr, i64 } %101, 0
+  %112 = extractvalue { ptr, ptr, i64 } %101, 1
+  %113 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } poison, ptr %111, 0
+  %114 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %113, ptr %112, 1
+  %115 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %114, i64 %110, 2
+  %116 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %115, i64 %53, 3, 0
+  %117 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %116, i64 %105, 
4, 0
+  br label %118
+
+118:                                              ; preds = %133, %96
+  %119 = phi i64 [ %135, %133 ], [ 0, %96 ]
+  %120 = phi <vscale x 4 x float> [ %134, %133 ], [ poison, %96 ]
+  %121 = icmp slt i64 %119, %47
+  br i1 %121, label %122, label %136
+
+122:                                              ; preds = %118
+  %123 = extractelement <vscale x 4 x i1> %58, i64 %119
+  br i1 %123, label %124, label %133
+
+124:                                              ; preds = %122
+  %125 = extractvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %117, 1
+  %126 = extractvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %117, 2
+  %127 = getelementptr float, ptr %125, i64 %126
+  %128 = extractvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %117, 4, 0
+  %129 = mul nuw nsw i64 %119, %128
+  %130 = getelementptr inbounds nuw float, ptr %127, i64 %129
+  %131 = load float, ptr %130, align 4
+  %132 = insertelement <vscale x 4 x float> %120, float %131, i64 %119
+  br label %133
+
+133:                                              ; preds = %124, %122
+  %134 = phi <vscale x 4 x float> [ %132, %124 ], [ %120, %122 ]
+  %135 = add i64 %119, 1
+  br label %118
+
+136:                                              ; preds = %118
+  %137 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %35, 0
+  %138 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %35, 1
+  %139 = insertvalue { ptr, ptr, i64 } poison, ptr %137, 0
+  %140 = insertvalue { ptr, ptr, i64 } %139, ptr %138, 1
+  %141 = insertvalue { ptr, ptr, i64 } %140, i64 0, 2
+  %142 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %35, 2
+  %143 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %35, 3, 0
+  %144 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %35, 3, 1
+  %145 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %35, 4, 0
+  %146 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %35, 4, 1
+  %147 = mul nsw i64 %94, %145
+  %148 = add i64 %142, %147
+  %149 = mul nsw i64 %60, %146
+  %150 = add i64 %148, %149
+  %151 = extractvalue { ptr, ptr, i64 } %141, 0
+  %152 = extractvalue { ptr, ptr, i64 } %141, 1
+  %153 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } poison, ptr %151, 0
+  %154 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %153, ptr %152, 1
+  %155 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %154, i64 %150, 2
+  %156 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %155, i64 %64, 3, 0
+  %157 = insertvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %156, i64 %146, 
4, 0
+  br label %158
+
+158:                                              ; preds = %173, %136
+  %159 = phi i64 [ %175, %173 ], [ 0, %136 ]
+  %160 = phi <vscale x 4 x float> [ %174, %173 ], [ poison, %136 ]
+  %161 = icmp slt i64 %159, %47
+  br i1 %161, label %162, label %176
+
+162:                                              ; preds = %158
+  %163 = extractelement <vscale x 4 x i1> %92, i64 %159
+  br i1 %163, label %164, label %173
+
+164:                                              ; preds = %162
+  %165 = extractvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %157, 1
+  %166 = extractvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %157, 2
+  %167 = getelementptr float, ptr %165, i64 %166
+  %168 = extractvalue { ptr, ptr, i64, [1 x i64], [1 x i64] } %157, 4, 0
+  %169 = mul nuw nsw i64 %159, %168
+  %170 = getelementptr inbounds nuw float, ptr %167, i64 %169
+  %171 = load float, ptr %170, align 4
+  %172 = insertelement <vscale x 4 x float> %160, float %171, i64 %159
+  br label %173
+
+173:                                              ; preds = %164, %162
+  %174 = phi <vscale x 4 x float> [ %172, %164 ], [ %160, %162 ]
+  %175 = add i64 %159, 1
+  br label %158
+
+176:                                              ; preds = %158
+  %177 = trunc i64 %64 to i32
+  br label %178
+
+178:                                              ; preds = %181, %176
+  %179 = phi i64 [ %202, %181 ], [ 0, %176 ]
+  %180 = icmp slt i64 %179, %47
+  br i1 %180, label %181, label %203
+
+181:                                              ; preds = %178
+  %182 = icmp ult i64 %179, %53
+  %183 = sext i1 %182 to i32
+  %184 = and i32 %183, %177
+  %185 = sext i32 %184 to i64
+  %186 = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
+  %187 = trunc i64 %185 to i32
+  %188 = insertelement <vscale x 4 x i32> poison, i32 %187, i32 0
+  %189 = shufflevector <vscale x 4 x i32> %188, <vscale x 4 x i32> poison, 
<vscale x 4 x i32> zeroinitializer
+  %190 = icmp slt <vscale x 4 x i32> %186, %189
+  %191 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %87, 1
+  %192 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %87, 2
+  %193 = getelementptr float, ptr %191, i64 %192
+  %194 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %87, 4, 0
+  %195 = mul i64 %179, %194
+  %196 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %87, 4, 1
+  %197 = mul i64 0, %196
+  %198 = add i64 %195, %197
+  %199 = getelementptr float, ptr %193, i64 %198
+  %200 = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr %199, i32 
4, <vscale x 4 x i1> %190, <vscale x 4 x float> poison)
+  %201 = trunc i64 %179 to i32
+  call void @llvm.aarch64.sme.write.horiz.nxv4f32(i32 0, i32 %201, <vscale x 4 
x i1> splat (i1 true), <vscale x 4 x float> %200)
+  %202 = add i64 %179, 1
+  br label %178
+
+203:                                              ; preds = %178
+  call void @llvm.aarch64.sme.mopa.nxv4f32(i32 0, <vscale x 4 x i1> %58, 
<vscale x 4 x i1> %92, <vscale x 4 x float> %120, <vscale x 4 x float> %160)
+  %204 = call i64 @llvm.smin.i64(i64 %53, i64 %47)
+  br label %205
+
+205:                                              ; preds = %208, %203
+  %206 = phi i64 [ %219, %208 ], [ 0, %203 ]
+  %207 = icmp slt i64 %206, %204
+  br i1 %207, label %208, label %220
+
+208:                                              ; preds = %205
+  %209 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %87, 1
+  %210 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %87, 2
+  %211 = getelementptr float, ptr %209, i64 %210
+  %212 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %87, 4, 0
+  %213 = mul i64 %206, %212
+  %214 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %87, 4, 1
+  %215 = mul i64 0, %214
+  %216 = add i64 %213, %215
+  %217 = getelementptr float, ptr %211, i64 %216
+  %218 = trunc i64 %206 to i32
+  call void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1> %92, ptr %217, i32 
0, i32 %218)
+  %219 = add i64 %206, 1
+  br label %205
+
+220:                                              ; preds = %205
+  %221 = add i64 %94, 1
+  br label %93
+
+222:                                              ; preds = %93
+  %223 = add i64 %60, %47
+  br label %59
+
+224:                                              ; preds = %59
+  %225 = add i64 %49, %47
+  br label %48
+
+226:                                              ; preds = %48
+  %227 = alloca { ptr, ptr, i64, [2 x i64], [2 x i64] }, i64 1, align 8
+  store { ptr, ptr, i64, [2 x i64], [2 x i64] } %28, ptr %227, align 8
+  %228 = insertvalue { i64, ptr } { i64 2, ptr poison }, ptr %227, 1
+  %229 = extractvalue { i64, ptr } %228, 0
+  %230 = extractvalue { i64, ptr } %228, 1
+  call void @printMemrefF32(i64 %229, ptr %230)
+  ret void
+}
+
+declare void @printMemrefF32(i64, ptr)
+
+attributes #0 = { "aarch64_new_za" "aarch64_pstate_sm_body" }

_______________________________________________
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

Reply via email to