https://github.com/SamTebbs33 updated https://github.com/llvm/llvm-project/pull/153187
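In brief: LowerLOOP_DEPENDENCE_MASK now splits a mask request that is too wide for a single whilewr/whilerw into narrower LOOP_DEPENDENCE_MASK nodes, offsetting both pointers by the store size of each part (scaled by vscale for scalable types), and reassembles the partial masks with INSERT_SUBVECTOR. A condensed before/after, taken from the patch's own updated whilewr_16_split test in alias_mask.ll (asm trimmed to the relevant CHECK lines): the 16 x 2-byte mask request

  %0 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 2)

previously expanded into a long index/cmhi/uzp1 compare sequence; with this patch it becomes two whilewr instructions at 16-byte-offset addresses whose partial masks are concatenated:

  add x8, x1, #16
  add x9, x0, #16
  whilewr p0.h, x0, x1
  whilewr p1.h, x9, x8
  mov z0.h, p0/z, #-1
  mov z1.h, p1/z, #-1
  uzp1 v0.16b, v0.16b, v1.16b
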
>From bbe60d67810aad6ee91eabce0f904062af9281c0 Mon Sep 17 00:00:00 2001 From: Samuel Tebbs <samuel.te...@arm.com> Date: Tue, 12 Aug 2025 14:25:36 +0100 Subject: [PATCH] [AArch64] Split large loop dependence masks This PR adds splitting in the AArch64 backend for the LOOP_DEPENDENCE_MASK nodes so that even large vector types can be turned into whilewr/rw. --- .../Target/AArch64/AArch64ISelLowering.cpp | 97 ++- llvm/test/CodeGen/AArch64/alias_mask.ll | 757 +++++++++--------- .../CodeGen/AArch64/alias_mask_scalable.ll | 697 ++++++---------- 3 files changed, 668 insertions(+), 883 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index a8aa1f67e342d..b53071065b42e 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -5248,49 +5248,94 @@ AArch64TargetLowering::LowerLOOP_DEPENDENCE_MASK(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); uint64_t EltSize = Op.getConstantOperandVal(2); - EVT VT = Op.getValueType(); + EVT FullVT = Op.getValueType(); + unsigned NumElements = FullVT.getVectorMinNumElements(); + unsigned NumSplits = 0; + EVT EltVT; switch (EltSize) { case 1: - if (VT != MVT::v16i8 && VT != MVT::nxv16i1) - return SDValue(); + EltVT = MVT::i8; break; case 2: - if (VT != MVT::v8i8 && VT != MVT::nxv8i1) - return SDValue(); + if (NumElements >= 16) + NumSplits = NumElements / 16; + EltVT = MVT::i16; break; case 4: - if (VT != MVT::v4i16 && VT != MVT::nxv4i1) - return SDValue(); + if (NumElements >= 8) + NumSplits = NumElements / 8; + EltVT = MVT::i32; break; case 8: - if (VT != MVT::v2i32 && VT != MVT::nxv2i1) - return SDValue(); + if (NumElements >= 4) + NumSplits = NumElements / 4; + EltVT = MVT::i64; break; default: // Other element sizes are incompatible with whilewr/rw, so expand instead return SDValue(); } - SDValue PtrA = Op.getOperand(0); - SDValue PtrB = Op.getOperand(1); + auto LowerToWhile = [&](EVT VT, unsigned AddrScale) { + SDValue PtrA = Op.getOperand(0); + SDValue PtrB = Op.getOperand(1); - if (VT.isScalableVT()) - return DAG.getNode(Op.getOpcode(), DL, VT, PtrA, PtrB, Op.getOperand(2)); + EVT StoreVT = EVT::getVectorVT(*DAG.getContext(), EltVT, + VT.getVectorMinNumElements(), false); + if (AddrScale > 0) { + unsigned Offset = StoreVT.getStoreSizeInBits() / 8 * AddrScale; + SDValue Addend; - // We can use the SVE whilewr/whilerw instruction to lower this - // intrinsic by creating the appropriate sequence of scalable vector - // operations and then extracting a fixed-width subvector from the scalable - // vector. Scalable vector variants are already legal. 
- EVT ContainerVT = - EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), - VT.getVectorNumElements(), true); - EVT WhileVT = ContainerVT.changeElementType(MVT::i1); + if (VT.isScalableVT()) + Addend = DAG.getVScale(DL, MVT::i64, APInt(64, Offset)); + else + Addend = DAG.getConstant(Offset, DL, MVT::i64); - SDValue Mask = - DAG.getNode(Op.getOpcode(), DL, WhileVT, PtrA, PtrB, Op.getOperand(2)); - SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, DL, ContainerVT, Mask); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, MaskAsInt, - DAG.getVectorIdxConstant(0, DL)); + PtrA = DAG.getNode(ISD::ADD, DL, MVT::i64, PtrA, Addend); + PtrB = DAG.getNode(ISD::ADD, DL, MVT::i64, PtrB, Addend); + } + + if (VT.isScalableVT()) + return DAG.getNode(Op.getOpcode(), DL, VT, PtrA, PtrB, Op.getOperand(2)); + + // We can use the SVE whilewr/whilerw instruction to lower this + // intrinsic by creating the appropriate sequence of scalable vector + // operations and then extracting a fixed-width subvector from the scalable + // vector. Scalable vector variants are already legal. + EVT ContainerVT = + EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), + VT.getVectorNumElements(), true); + EVT WhileVT = ContainerVT.changeElementType(MVT::i1); + + SDValue Mask = + DAG.getNode(Op.getOpcode(), DL, WhileVT, PtrA, PtrB, Op.getOperand(2)); + SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, DL, ContainerVT, Mask); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, MaskAsInt, + DAG.getVectorIdxConstant(0, DL)); + }; + + if (NumSplits == 0) + return LowerToWhile(FullVT, 0); + + SDValue FullVec = DAG.getUNDEF(FullVT); + + unsigned NumElementsPerSplit = NumElements / (2 * NumSplits); + EVT PartVT = + EVT::getVectorVT(*DAG.getContext(), FullVT.getVectorElementType(), + NumElementsPerSplit, FullVT.isScalableVT()); + for (unsigned Split = 0, InsertIdx = 0; Split < NumSplits; + Split++, InsertIdx += 2) { + SDValue Low = LowerToWhile(PartVT, InsertIdx); + SDValue High = LowerToWhile(PartVT, InsertIdx + 1); + unsigned InsertIdxLow = InsertIdx * NumElementsPerSplit; + unsigned InsertIdxHigh = (InsertIdx + 1) * NumElementsPerSplit; + SDValue Insert = + DAG.getNode(ISD::INSERT_SUBVECTOR, DL, FullVT, FullVec, Low, + DAG.getVectorIdxConstant(InsertIdxLow, DL)); + FullVec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, FullVT, Insert, High, + DAG.getVectorIdxConstant(InsertIdxHigh, DL)); + } + return FullVec; } SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op, diff --git a/llvm/test/CodeGen/AArch64/alias_mask.ll b/llvm/test/CodeGen/AArch64/alias_mask.ll index 7646c9d4750e0..b2ce415ebca19 100644 --- a/llvm/test/CodeGen/AArch64/alias_mask.ll +++ b/llvm/test/CodeGen/AArch64/alias_mask.ll @@ -177,286 +177,239 @@ entry: ret <64 x i1> %0 } -define <16 x i1> @whilewr_16_expand(ptr %a, ptr %b) { -; CHECK-LABEL: whilewr_16_expand: +define <16 x i1> @whilewr_16_split(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_16_split: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: index z0.d, #0, #1 -; CHECK-NEXT: sub x8, x1, x0 -; CHECK-NEXT: add x8, x8, x8, lsr #63 -; CHECK-NEXT: asr x8, x8, #1 -; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: mov z2.d, z0.d -; CHECK-NEXT: mov z4.d, z0.d -; CHECK-NEXT: mov z5.d, z0.d -; CHECK-NEXT: mov z6.d, z0.d -; CHECK-NEXT: mov z7.d, z0.d -; CHECK-NEXT: mov z16.d, z0.d -; CHECK-NEXT: dup v3.2d, x8 -; CHECK-NEXT: cmp x8, #1 -; CHECK-NEXT: add z1.d, z1.d, #12 // =0xc -; CHECK-NEXT: add z2.d, z2.d, #10 // =0xa -; CHECK-NEXT: add z4.d, z4.d, #8 // =0x8 -; CHECK-NEXT: add z5.d, z5.d, #6 // =0x6 -; CHECK-NEXT: 
add z6.d, z6.d, #4 // =0x4 -; CHECK-NEXT: add z7.d, z7.d, #2 // =0x2 -; CHECK-NEXT: add z16.d, z16.d, #14 // =0xe -; CHECK-NEXT: cmhi v0.2d, v3.2d, v0.2d -; CHECK-NEXT: cset w8, lt -; CHECK-NEXT: cmhi v1.2d, v3.2d, v1.2d -; CHECK-NEXT: cmhi v2.2d, v3.2d, v2.2d -; CHECK-NEXT: cmhi v4.2d, v3.2d, v4.2d -; CHECK-NEXT: cmhi v5.2d, v3.2d, v5.2d -; CHECK-NEXT: cmhi v6.2d, v3.2d, v6.2d -; CHECK-NEXT: cmhi v16.2d, v3.2d, v16.2d -; CHECK-NEXT: cmhi v3.2d, v3.2d, v7.2d -; CHECK-NEXT: uzp1 v2.4s, v4.4s, v2.4s -; CHECK-NEXT: uzp1 v4.4s, v6.4s, v5.4s -; CHECK-NEXT: uzp1 v1.4s, v1.4s, v16.4s -; CHECK-NEXT: uzp1 v0.4s, v0.4s, v3.4s -; CHECK-NEXT: uzp1 v1.8h, v2.8h, v1.8h -; CHECK-NEXT: uzp1 v0.8h, v0.8h, v4.8h +; CHECK-NEXT: add x8, x1, #16 +; CHECK-NEXT: add x9, x0, #16 +; CHECK-NEXT: whilewr p0.h, x0, x1 +; CHECK-NEXT: whilewr p1.h, x9, x8 +; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z1.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: uzp1 v0.16b, v0.16b, v1.16b -; CHECK-NEXT: dup v1.16b, w8 -; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret entry: %0 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 2) ret <16 x i1> %0 } -define <32 x i1> @whilewr_16_expand2(ptr %a, ptr %b) { -; CHECK-LABEL: whilewr_16_expand2: +define <32 x i1> @whilewr_16_split2(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_16_split2: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub x9, x1, x0 -; CHECK-NEXT: index z0.d, #0, #1 -; CHECK-NEXT: sub x10, x9, #32 -; CHECK-NEXT: add x9, x9, x9, lsr #63 -; CHECK-NEXT: add x10, x10, x10, lsr #63 -; CHECK-NEXT: asr x9, x9, #1 -; CHECK-NEXT: asr x10, x10, #1 -; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: mov z2.d, z0.d -; CHECK-NEXT: mov z3.d, z0.d -; CHECK-NEXT: mov z4.d, z0.d -; CHECK-NEXT: mov z5.d, z0.d -; CHECK-NEXT: mov z6.d, z0.d -; CHECK-NEXT: dup v7.2d, x9 -; CHECK-NEXT: dup v16.2d, x10 -; CHECK-NEXT: add z1.d, z1.d, #12 // =0xc -; CHECK-NEXT: add z2.d, z2.d, #10 // =0xa -; CHECK-NEXT: cmp x10, #1 -; CHECK-NEXT: add z3.d, z3.d, #8 // =0x8 -; CHECK-NEXT: add z4.d, z4.d, #6 // =0x6 -; CHECK-NEXT: add z5.d, z5.d, #4 // =0x4 -; CHECK-NEXT: add z6.d, z6.d, #2 // =0x2 -; CHECK-NEXT: cmhi v17.2d, v7.2d, v0.2d -; CHECK-NEXT: cmhi v18.2d, v16.2d, v0.2d -; CHECK-NEXT: add z0.d, z0.d, #14 // =0xe -; CHECK-NEXT: cmhi v19.2d, v7.2d, v1.2d -; CHECK-NEXT: cmhi v20.2d, v7.2d, v2.2d -; CHECK-NEXT: cmhi v21.2d, v7.2d, v3.2d -; CHECK-NEXT: cmhi v22.2d, v7.2d, v4.2d -; CHECK-NEXT: cmhi v23.2d, v7.2d, v5.2d -; CHECK-NEXT: cmhi v24.2d, v7.2d, v6.2d -; CHECK-NEXT: cmhi v1.2d, v16.2d, v1.2d -; CHECK-NEXT: cmhi v2.2d, v16.2d, v2.2d -; CHECK-NEXT: cmhi v3.2d, v16.2d, v3.2d -; CHECK-NEXT: cmhi v4.2d, v16.2d, v4.2d -; CHECK-NEXT: cmhi v7.2d, v7.2d, v0.2d -; CHECK-NEXT: cmhi v5.2d, v16.2d, v5.2d -; CHECK-NEXT: cmhi v6.2d, v16.2d, v6.2d -; CHECK-NEXT: cset w10, lt -; CHECK-NEXT: cmhi v0.2d, v16.2d, v0.2d -; CHECK-NEXT: uzp1 v16.4s, v21.4s, v20.4s -; CHECK-NEXT: cmp x9, #1 -; CHECK-NEXT: uzp1 v20.4s, v23.4s, v22.4s -; CHECK-NEXT: uzp1 v17.4s, v17.4s, v24.4s -; CHECK-NEXT: cset w9, lt -; CHECK-NEXT: uzp1 v2.4s, v3.4s, v2.4s -; CHECK-NEXT: uzp1 v3.4s, v19.4s, v7.4s -; CHECK-NEXT: uzp1 v4.4s, v5.4s, v4.4s -; CHECK-NEXT: uzp1 v5.4s, v18.4s, v6.4s -; CHECK-NEXT: uzp1 v0.4s, v1.4s, v0.4s -; CHECK-NEXT: uzp1 v1.8h, v17.8h, v20.8h -; CHECK-NEXT: uzp1 v3.8h, v16.8h, v3.8h -; CHECK-NEXT: uzp1 v4.8h, v5.8h, v4.8h -; CHECK-NEXT: uzp1 v0.8h, v2.8h, v0.8h -; CHECK-NEXT: dup v2.16b, w9 +; CHECK-NEXT: add x9, x1, #16 +; CHECK-NEXT: add x10, x0, #48 +; CHECK-NEXT: add x11, 
x0, #16 +; CHECK-NEXT: whilewr p1.h, x10, x9 +; CHECK-NEXT: add x10, x0, #32 +; CHECK-NEXT: whilewr p0.h, x0, x1 +; CHECK-NEXT: whilewr p2.h, x10, x1 +; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: whilewr p3.h, x11, x9 +; CHECK-NEXT: mov z2.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: adrp x9, .LCPI11_0 -; CHECK-NEXT: uzp1 v1.16b, v1.16b, v3.16b -; CHECK-NEXT: dup v3.16b, w10 -; CHECK-NEXT: uzp1 v0.16b, v4.16b, v0.16b -; CHECK-NEXT: orr v1.16b, v1.16b, v2.16b +; CHECK-NEXT: mov z1.h, p2/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z3.h, p3/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: uzp1 v0.16b, v1.16b, v0.16b +; CHECK-NEXT: uzp1 v1.16b, v2.16b, v3.16b ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI11_0] -; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b -; CHECK-NEXT: shl v1.16b, v1.16b, #7 ; CHECK-NEXT: shl v0.16b, v0.16b, #7 -; CHECK-NEXT: cmlt v1.16b, v1.16b, #0 +; CHECK-NEXT: shl v1.16b, v1.16b, #7 ; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 -; CHECK-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-NEXT: cmlt v1.16b, v1.16b, #0 ; CHECK-NEXT: and v0.16b, v0.16b, v2.16b -; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #8 -; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: zip1 v1.16b, v1.16b, v2.16b -; CHECK-NEXT: zip1 v0.16b, v0.16b, v3.16b -; CHECK-NEXT: addv h1, v1.8h +; CHECK-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: zip1 v0.16b, v0.16b, v2.16b +; CHECK-NEXT: zip1 v1.16b, v1.16b, v3.16b ; CHECK-NEXT: addv h0, v0.8h -; CHECK-NEXT: str h1, [x8] +; CHECK-NEXT: addv h1, v1.8h ; CHECK-NEXT: str h0, [x8, #2] +; CHECK-NEXT: str h1, [x8] ; CHECK-NEXT: ret entry: %0 = call <32 x i1> @llvm.loop.dependence.war.mask.v32i1(ptr %a, ptr %b, i64 2) ret <32 x i1> %0 } -define <8 x i1> @whilewr_32_expand(ptr %a, ptr %b) { -; CHECK-LABEL: whilewr_32_expand: +define <8 x i1> @whilewr_32_split(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_32_split: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: index z0.d, #0, #1 -; CHECK-NEXT: subs x8, x1, x0 -; CHECK-NEXT: add x9, x8, #3 -; CHECK-NEXT: csel x8, x9, x8, mi -; CHECK-NEXT: asr x8, x8, #2 -; CHECK-NEXT: mov z2.d, z0.d -; CHECK-NEXT: mov z3.d, z0.d -; CHECK-NEXT: mov z4.d, z0.d -; CHECK-NEXT: dup v1.2d, x8 -; CHECK-NEXT: cmp x8, #1 -; CHECK-NEXT: cset w8, lt -; CHECK-NEXT: add z4.d, z4.d, #6 // =0x6 -; CHECK-NEXT: add z2.d, z2.d, #4 // =0x4 -; CHECK-NEXT: add z3.d, z3.d, #2 // =0x2 -; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d -; CHECK-NEXT: cmhi v4.2d, v1.2d, v4.2d -; CHECK-NEXT: cmhi v2.2d, v1.2d, v2.2d -; CHECK-NEXT: cmhi v1.2d, v1.2d, v3.2d -; CHECK-NEXT: uzp1 v2.4s, v2.4s, v4.4s -; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s -; CHECK-NEXT: dup v1.8b, w8 -; CHECK-NEXT: uzp1 v0.8h, v0.8h, v2.8h -; CHECK-NEXT: xtn v0.8b, v0.8h -; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-NEXT: whilewr p0.s, x0, x1 +; CHECK-NEXT: add x10, x0, #16 +; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: mov v1.16b, v0.16b +; CHECK-NEXT: mov w9, v0.s[2] +; CHECK-NEXT: mov v1.h[1], w8 +; CHECK-NEXT: mov w8, v0.s[3] +; CHECK-NEXT: mov v1.h[2], w9 +; CHECK-NEXT: add x9, x1, #16 +; CHECK-NEXT: whilewr p0.s, x10, x9 +; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov v1.h[3], w8 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: mov w9, v0.s[1] +; CHECK-NEXT: mov v1.h[4], w8 +; CHECK-NEXT: mov w8, v0.s[2] +; CHECK-NEXT: mov v1.h[5], w9 +; CHECK-NEXT: mov w9, v0.s[3] +; CHECK-NEXT: mov v1.h[6], w8 +; CHECK-NEXT: mov v1.h[7], w9 +; 
CHECK-NEXT: xtn v0.8b, v1.8h ; CHECK-NEXT: ret entry: %0 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 4) ret <8 x i1> %0 } -define <16 x i1> @whilewr_32_expand2(ptr %a, ptr %b) { -; CHECK-LABEL: whilewr_32_expand2: +define <16 x i1> @whilewr_32_split2(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_32_split2: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: index z0.d, #0, #1 -; CHECK-NEXT: subs x8, x1, x0 -; CHECK-NEXT: add x9, x8, #3 -; CHECK-NEXT: csel x8, x9, x8, mi -; CHECK-NEXT: asr x8, x8, #2 -; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: mov z2.d, z0.d -; CHECK-NEXT: mov z4.d, z0.d -; CHECK-NEXT: mov z5.d, z0.d -; CHECK-NEXT: mov z6.d, z0.d -; CHECK-NEXT: mov z7.d, z0.d -; CHECK-NEXT: mov z16.d, z0.d -; CHECK-NEXT: dup v3.2d, x8 -; CHECK-NEXT: cmp x8, #1 -; CHECK-NEXT: add z1.d, z1.d, #12 // =0xc -; CHECK-NEXT: add z2.d, z2.d, #10 // =0xa -; CHECK-NEXT: add z4.d, z4.d, #8 // =0x8 -; CHECK-NEXT: add z5.d, z5.d, #6 // =0x6 -; CHECK-NEXT: add z6.d, z6.d, #4 // =0x4 -; CHECK-NEXT: add z7.d, z7.d, #2 // =0x2 -; CHECK-NEXT: add z16.d, z16.d, #14 // =0xe -; CHECK-NEXT: cmhi v0.2d, v3.2d, v0.2d -; CHECK-NEXT: cset w8, lt -; CHECK-NEXT: cmhi v1.2d, v3.2d, v1.2d -; CHECK-NEXT: cmhi v2.2d, v3.2d, v2.2d -; CHECK-NEXT: cmhi v4.2d, v3.2d, v4.2d -; CHECK-NEXT: cmhi v5.2d, v3.2d, v5.2d -; CHECK-NEXT: cmhi v6.2d, v3.2d, v6.2d -; CHECK-NEXT: cmhi v16.2d, v3.2d, v16.2d -; CHECK-NEXT: cmhi v3.2d, v3.2d, v7.2d -; CHECK-NEXT: uzp1 v2.4s, v4.4s, v2.4s -; CHECK-NEXT: uzp1 v4.4s, v6.4s, v5.4s -; CHECK-NEXT: uzp1 v1.4s, v1.4s, v16.4s -; CHECK-NEXT: uzp1 v0.4s, v0.4s, v3.4s -; CHECK-NEXT: uzp1 v1.8h, v2.8h, v1.8h -; CHECK-NEXT: uzp1 v0.8h, v0.8h, v4.8h -; CHECK-NEXT: uzp1 v0.16b, v0.16b, v1.16b -; CHECK-NEXT: dup v1.16b, w8 -; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: add x8, x1, #32 +; CHECK-NEXT: add x9, x0, #32 +; CHECK-NEXT: whilewr p0.s, x0, x1 +; CHECK-NEXT: whilewr p1.s, x9, x8 +; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: mov v2.16b, v0.16b +; CHECK-NEXT: mov w10, v0.s[2] +; CHECK-NEXT: mov w9, v1.s[1] +; CHECK-NEXT: mov v3.16b, v1.16b +; CHECK-NEXT: mov w11, v1.s[3] +; CHECK-NEXT: mov v2.h[1], w8 +; CHECK-NEXT: mov w8, v1.s[2] +; CHECK-NEXT: mov v3.h[1], w9 +; CHECK-NEXT: mov w9, v0.s[3] +; CHECK-NEXT: mov v2.h[2], w10 +; CHECK-NEXT: add x10, x1, #16 +; CHECK-NEXT: mov v3.h[2], w8 +; CHECK-NEXT: add x8, x0, #16 +; CHECK-NEXT: whilewr p0.s, x8, x10 +; CHECK-NEXT: add x8, x1, #48 +; CHECK-NEXT: add x10, x0, #48 +; CHECK-NEXT: whilewr p1.s, x10, x8 +; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov v2.h[3], w9 +; CHECK-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov v3.h[3], w11 +; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: mov w8, v0.s[1] +; CHECK-NEXT: fmov w10, s1 +; CHECK-NEXT: mov w11, v1.s[1] +; CHECK-NEXT: mov v2.h[4], w9 +; CHECK-NEXT: mov w9, v0.s[2] +; CHECK-NEXT: mov v3.h[4], w10 +; CHECK-NEXT: mov w10, v1.s[2] +; CHECK-NEXT: mov v2.h[5], w8 +; CHECK-NEXT: mov w8, v0.s[3] +; CHECK-NEXT: mov v3.h[5], w11 +; CHECK-NEXT: mov w11, v1.s[3] +; CHECK-NEXT: mov v2.h[6], w9 +; CHECK-NEXT: mov v3.h[6], w10 +; CHECK-NEXT: mov v2.h[7], w8 +; CHECK-NEXT: mov v3.h[7], w11 +; CHECK-NEXT: uzp1 v0.16b, v2.16b, v3.16b ; CHECK-NEXT: ret entry: %0 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 4) ret <16 x i1> %0 } -define <32 x i1> @whilewr_32_expand3(ptr %a, ptr %b) { -; CHECK-LABEL: 
whilewr_32_expand3: +define <32 x i1> @whilewr_32_split3(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_32_split3: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: subs x9, x1, x0 -; CHECK-NEXT: index z0.d, #0, #1 -; CHECK-NEXT: add x10, x9, #3 -; CHECK-NEXT: sub x11, x9, #61 -; CHECK-NEXT: csel x10, x10, x9, mi -; CHECK-NEXT: subs x9, x9, #64 -; CHECK-NEXT: csel x9, x11, x9, mi -; CHECK-NEXT: asr x10, x10, #2 -; CHECK-NEXT: asr x9, x9, #2 -; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: mov z2.d, z0.d -; CHECK-NEXT: mov z3.d, z0.d -; CHECK-NEXT: mov z4.d, z0.d -; CHECK-NEXT: mov z5.d, z0.d -; CHECK-NEXT: mov z6.d, z0.d -; CHECK-NEXT: dup v7.2d, x10 -; CHECK-NEXT: dup v16.2d, x9 -; CHECK-NEXT: add z1.d, z1.d, #12 // =0xc -; CHECK-NEXT: add z2.d, z2.d, #10 // =0xa -; CHECK-NEXT: cmp x9, #1 -; CHECK-NEXT: add z3.d, z3.d, #8 // =0x8 -; CHECK-NEXT: add z4.d, z4.d, #6 // =0x6 -; CHECK-NEXT: add z5.d, z5.d, #4 // =0x4 -; CHECK-NEXT: add z6.d, z6.d, #2 // =0x2 -; CHECK-NEXT: cmhi v17.2d, v7.2d, v0.2d -; CHECK-NEXT: cmhi v18.2d, v16.2d, v0.2d -; CHECK-NEXT: add z0.d, z0.d, #14 // =0xe -; CHECK-NEXT: cmhi v19.2d, v7.2d, v1.2d -; CHECK-NEXT: cmhi v20.2d, v7.2d, v2.2d -; CHECK-NEXT: cmhi v21.2d, v7.2d, v3.2d -; CHECK-NEXT: cmhi v22.2d, v7.2d, v4.2d -; CHECK-NEXT: cmhi v23.2d, v7.2d, v5.2d -; CHECK-NEXT: cmhi v24.2d, v7.2d, v6.2d -; CHECK-NEXT: cmhi v1.2d, v16.2d, v1.2d -; CHECK-NEXT: cmhi v2.2d, v16.2d, v2.2d -; CHECK-NEXT: cmhi v3.2d, v16.2d, v3.2d -; CHECK-NEXT: cmhi v4.2d, v16.2d, v4.2d -; CHECK-NEXT: cmhi v7.2d, v7.2d, v0.2d -; CHECK-NEXT: cmhi v5.2d, v16.2d, v5.2d -; CHECK-NEXT: cmhi v6.2d, v16.2d, v6.2d -; CHECK-NEXT: cset w9, lt -; CHECK-NEXT: cmhi v0.2d, v16.2d, v0.2d -; CHECK-NEXT: uzp1 v16.4s, v21.4s, v20.4s -; CHECK-NEXT: cmp x10, #1 -; CHECK-NEXT: uzp1 v20.4s, v23.4s, v22.4s -; CHECK-NEXT: uzp1 v17.4s, v17.4s, v24.4s -; CHECK-NEXT: cset w10, lt -; CHECK-NEXT: uzp1 v2.4s, v3.4s, v2.4s -; CHECK-NEXT: uzp1 v3.4s, v19.4s, v7.4s -; CHECK-NEXT: uzp1 v4.4s, v5.4s, v4.4s -; CHECK-NEXT: uzp1 v5.4s, v18.4s, v6.4s -; CHECK-NEXT: uzp1 v0.4s, v1.4s, v0.4s -; CHECK-NEXT: uzp1 v1.8h, v17.8h, v20.8h -; CHECK-NEXT: uzp1 v3.8h, v16.8h, v3.8h -; CHECK-NEXT: uzp1 v4.8h, v5.8h, v4.8h -; CHECK-NEXT: uzp1 v0.8h, v2.8h, v0.8h -; CHECK-NEXT: dup v2.16b, w10 -; CHECK-NEXT: uzp1 v1.16b, v1.16b, v3.16b -; CHECK-NEXT: dup v3.16b, w9 +; CHECK-NEXT: whilewr p0.s, x0, x1 +; CHECK-NEXT: add x9, x1, #32 +; CHECK-NEXT: add x10, x0, #96 +; CHECK-NEXT: add x11, x0, #64 +; CHECK-NEXT: whilewr p1.s, x10, x9 +; CHECK-NEXT: add x10, x0, #32 +; CHECK-NEXT: whilewr p2.s, x11, x1 +; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: whilewr p0.s, x10, x9 +; CHECK-NEXT: mov z4.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z5.s, p2/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z2.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov w9, v0.s[1] +; CHECK-NEXT: mov w10, v0.s[2] +; CHECK-NEXT: mov w11, v4.s[1] +; CHECK-NEXT: mov w13, v0.s[3] +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: mov v1.16b, v4.16b +; CHECK-NEXT: mov w12, v5.s[1] +; CHECK-NEXT: mov v3.16b, v5.16b +; CHECK-NEXT: mov w17, v4.s[2] +; CHECK-NEXT: mov w14, v2.s[1] +; CHECK-NEXT: mov w15, v2.s[2] +; CHECK-NEXT: mov w16, v2.s[3] +; CHECK-NEXT: mov v0.h[1], w9 +; CHECK-NEXT: // kill: def $q2 killed $q2 killed $z2 +; CHECK-NEXT: mov w18, v5.s[2] +; CHECK-NEXT: mov w9, v5.s[3] +; CHECK-NEXT: mov v1.h[1], w11 +; CHECK-NEXT: mov w11, v4.s[3] +; CHECK-NEXT: mov v3.h[1], w12 +; CHECK-NEXT: add x12, x1, #16 +; CHECK-NEXT: add x1, x1, #48 
+; CHECK-NEXT: mov v2.h[1], w14 +; CHECK-NEXT: add x14, x0, #16 +; CHECK-NEXT: whilewr p0.s, x14, x12 +; CHECK-NEXT: mov v0.h[2], w10 +; CHECK-NEXT: add x10, x0, #80 +; CHECK-NEXT: add x14, x0, #112 +; CHECK-NEXT: whilewr p2.s, x10, x12 +; CHECK-NEXT: add x10, x0, #48 +; CHECK-NEXT: whilewr p1.s, x14, x1 +; CHECK-NEXT: mov v1.h[2], w17 +; CHECK-NEXT: mov v3.h[2], w18 +; CHECK-NEXT: mov v2.h[2], w15 +; CHECK-NEXT: mov z4.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: whilewr p0.s, x10, x1 +; CHECK-NEXT: mov z5.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z6.s, p2/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov v0.h[3], w13 +; CHECK-NEXT: mov z7.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov v1.h[3], w11 +; CHECK-NEXT: mov v3.h[3], w9 +; CHECK-NEXT: fmov w10, s4 +; CHECK-NEXT: mov v2.h[3], w16 +; CHECK-NEXT: fmov w12, s5 +; CHECK-NEXT: fmov w14, s6 +; CHECK-NEXT: fmov w15, s7 +; CHECK-NEXT: mov w9, v4.s[1] +; CHECK-NEXT: mov w11, v5.s[1] +; CHECK-NEXT: mov w13, v6.s[1] +; CHECK-NEXT: mov v0.h[4], w10 +; CHECK-NEXT: mov w10, v4.s[2] +; CHECK-NEXT: mov v1.h[4], w12 +; CHECK-NEXT: mov v3.h[4], w14 +; CHECK-NEXT: mov w12, v7.s[1] +; CHECK-NEXT: mov v2.h[4], w15 +; CHECK-NEXT: mov w14, v5.s[2] +; CHECK-NEXT: mov w15, v6.s[2] +; CHECK-NEXT: mov v0.h[5], w9 +; CHECK-NEXT: mov w9, v4.s[3] +; CHECK-NEXT: mov v1.h[5], w11 +; CHECK-NEXT: mov v3.h[5], w13 +; CHECK-NEXT: mov w11, v7.s[2] +; CHECK-NEXT: mov v2.h[5], w12 +; CHECK-NEXT: mov w12, v5.s[3] +; CHECK-NEXT: mov w13, v6.s[3] +; CHECK-NEXT: mov v0.h[6], w10 +; CHECK-NEXT: mov v1.h[6], w14 +; CHECK-NEXT: mov v3.h[6], w15 +; CHECK-NEXT: mov w14, v7.s[3] +; CHECK-NEXT: mov v2.h[6], w11 +; CHECK-NEXT: mov v0.h[7], w9 ; CHECK-NEXT: adrp x9, .LCPI14_0 -; CHECK-NEXT: uzp1 v0.16b, v4.16b, v0.16b -; CHECK-NEXT: orr v1.16b, v1.16b, v2.16b +; CHECK-NEXT: mov v1.h[7], w12 +; CHECK-NEXT: mov v3.h[7], w13 +; CHECK-NEXT: mov v2.h[7], w14 +; CHECK-NEXT: uzp1 v1.16b, v3.16b, v1.16b +; CHECK-NEXT: uzp1 v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI14_0] -; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b ; CHECK-NEXT: shl v1.16b, v1.16b, #7 ; CHECK-NEXT: shl v0.16b, v0.16b, #7 ; CHECK-NEXT: cmlt v1.16b, v1.16b, #0 @@ -469,187 +422,207 @@ define <32 x i1> @whilewr_32_expand3(ptr %a, ptr %b) { ; CHECK-NEXT: zip1 v0.16b, v0.16b, v3.16b ; CHECK-NEXT: addv h1, v1.8h ; CHECK-NEXT: addv h0, v0.8h -; CHECK-NEXT: str h1, [x8] -; CHECK-NEXT: str h0, [x8, #2] +; CHECK-NEXT: str h1, [x8, #2] +; CHECK-NEXT: str h0, [x8] ; CHECK-NEXT: ret entry: %0 = call <32 x i1> @llvm.loop.dependence.war.mask.v32i1(ptr %a, ptr %b, i64 4) ret <32 x i1> %0 } -define <4 x i1> @whilewr_64_expand(ptr %a, ptr %b) { -; CHECK-LABEL: whilewr_64_expand: +define <4 x i1> @whilewr_64_split(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_64_split: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: index z0.d, #0, #1 -; CHECK-NEXT: subs x8, x1, x0 -; CHECK-NEXT: add x9, x8, #7 -; CHECK-NEXT: csel x8, x9, x8, mi -; CHECK-NEXT: asr x8, x8, #3 -; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: dup v2.2d, x8 -; CHECK-NEXT: cmp x8, #1 -; CHECK-NEXT: cset w8, lt -; CHECK-NEXT: add z1.d, z1.d, #2 // =0x2 -; CHECK-NEXT: cmhi v0.2d, v2.2d, v0.2d -; CHECK-NEXT: cmhi v1.2d, v2.2d, v1.2d -; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s -; CHECK-NEXT: dup v1.4h, w8 +; CHECK-NEXT: whilewr p0.d, x0, x1 +; CHECK-NEXT: add x8, x1, #16 +; CHECK-NEXT: add x9, x0, #16 +; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: whilewr p0.d, x9, x8 +; CHECK-NEXT: mov z1.d, p0/z, #-1 // =0xffffffffffffffff +; 
CHECK-NEXT: mov v0.s[1], v0.s[2] +; CHECK-NEXT: mov v0.s[2], v1.s[0] +; CHECK-NEXT: mov v0.s[3], v1.s[2] ; CHECK-NEXT: xtn v0.4h, v0.4s -; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret entry: %0 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 8) ret <4 x i1> %0 } -define <8 x i1> @whilewr_64_expand2(ptr %a, ptr %b) { -; CHECK-LABEL: whilewr_64_expand2: +define <8 x i1> @whilewr_64_split2(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_64_split2: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: index z0.d, #0, #1 -; CHECK-NEXT: subs x8, x1, x0 -; CHECK-NEXT: add x9, x8, #7 -; CHECK-NEXT: csel x8, x9, x8, mi -; CHECK-NEXT: asr x8, x8, #3 -; CHECK-NEXT: mov z2.d, z0.d -; CHECK-NEXT: mov z3.d, z0.d -; CHECK-NEXT: mov z4.d, z0.d -; CHECK-NEXT: dup v1.2d, x8 -; CHECK-NEXT: cmp x8, #1 -; CHECK-NEXT: cset w8, lt -; CHECK-NEXT: add z4.d, z4.d, #6 // =0x6 -; CHECK-NEXT: add z2.d, z2.d, #4 // =0x4 -; CHECK-NEXT: add z3.d, z3.d, #2 // =0x2 -; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d -; CHECK-NEXT: cmhi v4.2d, v1.2d, v4.2d -; CHECK-NEXT: cmhi v2.2d, v1.2d, v2.2d -; CHECK-NEXT: cmhi v1.2d, v1.2d, v3.2d -; CHECK-NEXT: uzp1 v2.4s, v2.4s, v4.4s -; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s -; CHECK-NEXT: dup v1.8b, w8 -; CHECK-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-NEXT: add x8, x1, #32 +; CHECK-NEXT: add x9, x0, #32 +; CHECK-NEXT: whilewr p0.d, x0, x1 +; CHECK-NEXT: whilewr p1.d, x9, x8 +; CHECK-NEXT: add x8, x1, #16 +; CHECK-NEXT: add x9, x0, #16 +; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: whilewr p0.d, x9, x8 +; CHECK-NEXT: add x8, x1, #48 +; CHECK-NEXT: mov z1.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: add x9, x0, #48 +; CHECK-NEXT: whilewr p1.d, x9, x8 +; CHECK-NEXT: mov z2.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov v0.s[1], v0.s[2] +; CHECK-NEXT: mov v1.s[1], v1.s[2] +; CHECK-NEXT: mov z3.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov v0.s[2], v2.s[0] +; CHECK-NEXT: mov v1.s[2], v3.s[0] +; CHECK-NEXT: mov v0.s[3], v2.s[2] +; CHECK-NEXT: mov v1.s[3], v3.s[2] +; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h ; CHECK-NEXT: xtn v0.8b, v0.8h -; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-NEXT: ret entry: %0 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 8) ret <8 x i1> %0 } -define <16 x i1> @whilewr_64_expand3(ptr %a, ptr %b) { -; CHECK-LABEL: whilewr_64_expand3: +define <16 x i1> @whilewr_64_split3(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_64_split3: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: index z0.d, #0, #1 -; CHECK-NEXT: subs x8, x1, x0 -; CHECK-NEXT: add x9, x8, #7 -; CHECK-NEXT: csel x8, x9, x8, mi -; CHECK-NEXT: asr x8, x8, #3 -; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: mov z2.d, z0.d -; CHECK-NEXT: mov z4.d, z0.d -; CHECK-NEXT: mov z5.d, z0.d -; CHECK-NEXT: mov z6.d, z0.d -; CHECK-NEXT: mov z7.d, z0.d -; CHECK-NEXT: mov z16.d, z0.d -; CHECK-NEXT: dup v3.2d, x8 -; CHECK-NEXT: cmp x8, #1 -; CHECK-NEXT: add z1.d, z1.d, #12 // =0xc -; CHECK-NEXT: add z2.d, z2.d, #10 // =0xa -; CHECK-NEXT: add z4.d, z4.d, #8 // =0x8 -; CHECK-NEXT: add z5.d, z5.d, #6 // =0x6 -; CHECK-NEXT: add z6.d, z6.d, #4 // =0x4 -; CHECK-NEXT: add z7.d, z7.d, #2 // =0x2 -; CHECK-NEXT: add z16.d, z16.d, #14 // =0xe -; CHECK-NEXT: cmhi v0.2d, v3.2d, v0.2d -; CHECK-NEXT: cset w8, lt -; CHECK-NEXT: cmhi v1.2d, v3.2d, v1.2d -; CHECK-NEXT: cmhi v2.2d, v3.2d, v2.2d -; CHECK-NEXT: cmhi v4.2d, v3.2d, v4.2d -; CHECK-NEXT: cmhi v5.2d, v3.2d, v5.2d -; CHECK-NEXT: cmhi v6.2d, v3.2d, v6.2d -; CHECK-NEXT: cmhi v16.2d, v3.2d, v16.2d -; CHECK-NEXT: cmhi v3.2d, 
v3.2d, v7.2d -; CHECK-NEXT: uzp1 v2.4s, v4.4s, v2.4s -; CHECK-NEXT: uzp1 v4.4s, v6.4s, v5.4s -; CHECK-NEXT: uzp1 v1.4s, v1.4s, v16.4s -; CHECK-NEXT: uzp1 v0.4s, v0.4s, v3.4s -; CHECK-NEXT: uzp1 v1.8h, v2.8h, v1.8h -; CHECK-NEXT: uzp1 v0.8h, v0.8h, v4.8h -; CHECK-NEXT: uzp1 v0.16b, v0.16b, v1.16b -; CHECK-NEXT: dup v1.16b, w8 -; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: add x8, x1, #96 +; CHECK-NEXT: add x9, x0, #96 +; CHECK-NEXT: whilewr p2.d, x0, x1 +; CHECK-NEXT: whilewr p1.d, x9, x8 +; CHECK-NEXT: add x8, x1, #112 +; CHECK-NEXT: add x9, x0, #112 +; CHECK-NEXT: whilewr p0.d, x9, x8 +; CHECK-NEXT: add x8, x1, #64 +; CHECK-NEXT: add x9, x0, #64 +; CHECK-NEXT: whilewr p3.d, x9, x8 +; CHECK-NEXT: add x8, x1, #32 +; CHECK-NEXT: add x9, x0, #32 +; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: whilewr p1.d, x9, x8 +; CHECK-NEXT: add x8, x1, #80 +; CHECK-NEXT: add x9, x0, #80 +; CHECK-NEXT: mov z1.d, p3/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z2.d, p2/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: whilewr p2.d, x9, x8 +; CHECK-NEXT: add x8, x1, #16 +; CHECK-NEXT: add x9, x0, #16 +; CHECK-NEXT: mov z3.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: whilewr p1.d, x9, x8 +; CHECK-NEXT: add x8, x1, #48 +; CHECK-NEXT: add x9, x0, #48 +; CHECK-NEXT: mov z4.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov v0.s[1], v0.s[2] +; CHECK-NEXT: whilewr p0.d, x9, x8 +; CHECK-NEXT: mov v1.s[1], v1.s[2] +; CHECK-NEXT: mov v2.s[1], v2.s[2] +; CHECK-NEXT: mov v3.s[1], v3.s[2] +; CHECK-NEXT: mov z5.d, p2/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z6.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z7.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov v0.s[2], v4.s[0] +; CHECK-NEXT: mov v2.s[2], v6.s[0] +; CHECK-NEXT: mov v1.s[2], v5.s[0] +; CHECK-NEXT: mov v3.s[2], v7.s[0] +; CHECK-NEXT: mov v0.s[3], v4.s[2] +; CHECK-NEXT: mov v1.s[3], v5.s[2] +; CHECK-NEXT: mov v2.s[3], v6.s[2] +; CHECK-NEXT: mov v3.s[3], v7.s[2] +; CHECK-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-NEXT: uzp1 v1.8h, v2.8h, v3.8h +; CHECK-NEXT: uzp1 v0.16b, v1.16b, v0.16b ; CHECK-NEXT: ret entry: %0 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 8) ret <16 x i1> %0 } -define <32 x i1> @whilewr_64_expand4(ptr %a, ptr %b) { -; CHECK-LABEL: whilewr_64_expand4: +define <32 x i1> @whilewr_64_split4(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_64_split4: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: subs x9, x1, x0 -; CHECK-NEXT: index z0.d, #0, #1 -; CHECK-NEXT: add x10, x9, #7 -; CHECK-NEXT: sub x11, x9, #121 -; CHECK-NEXT: csel x10, x10, x9, mi -; CHECK-NEXT: subs x9, x9, #128 -; CHECK-NEXT: csel x9, x11, x9, mi -; CHECK-NEXT: asr x10, x10, #3 -; CHECK-NEXT: asr x9, x9, #3 -; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: mov z2.d, z0.d -; CHECK-NEXT: mov z3.d, z0.d -; CHECK-NEXT: mov z4.d, z0.d -; CHECK-NEXT: mov z5.d, z0.d -; CHECK-NEXT: mov z6.d, z0.d -; CHECK-NEXT: dup v7.2d, x10 -; CHECK-NEXT: dup v16.2d, x9 -; CHECK-NEXT: add z1.d, z1.d, #12 // =0xc -; CHECK-NEXT: add z2.d, z2.d, #10 // =0xa -; CHECK-NEXT: cmp x9, #1 -; CHECK-NEXT: add z3.d, z3.d, #8 // =0x8 -; CHECK-NEXT: add z4.d, z4.d, #6 // =0x6 -; CHECK-NEXT: add z5.d, z5.d, #4 // =0x4 -; CHECK-NEXT: add z6.d, z6.d, #2 // =0x2 -; CHECK-NEXT: cmhi v17.2d, v7.2d, v0.2d -; CHECK-NEXT: cmhi v18.2d, v16.2d, v0.2d -; CHECK-NEXT: add z0.d, z0.d, #14 // =0xe -; CHECK-NEXT: cmhi v19.2d, v7.2d, v1.2d -; CHECK-NEXT: cmhi v20.2d, v7.2d, v2.2d -; CHECK-NEXT: cmhi v21.2d, v7.2d, v3.2d -; CHECK-NEXT: cmhi v22.2d, v7.2d, v4.2d 
-; CHECK-NEXT: cmhi v23.2d, v7.2d, v5.2d -; CHECK-NEXT: cmhi v24.2d, v7.2d, v6.2d -; CHECK-NEXT: cmhi v1.2d, v16.2d, v1.2d -; CHECK-NEXT: cmhi v2.2d, v16.2d, v2.2d -; CHECK-NEXT: cmhi v3.2d, v16.2d, v3.2d -; CHECK-NEXT: cmhi v4.2d, v16.2d, v4.2d -; CHECK-NEXT: cmhi v7.2d, v7.2d, v0.2d -; CHECK-NEXT: cmhi v5.2d, v16.2d, v5.2d -; CHECK-NEXT: cmhi v6.2d, v16.2d, v6.2d -; CHECK-NEXT: cset w9, lt -; CHECK-NEXT: cmhi v0.2d, v16.2d, v0.2d -; CHECK-NEXT: uzp1 v16.4s, v21.4s, v20.4s -; CHECK-NEXT: cmp x10, #1 -; CHECK-NEXT: uzp1 v20.4s, v23.4s, v22.4s -; CHECK-NEXT: uzp1 v17.4s, v17.4s, v24.4s -; CHECK-NEXT: cset w10, lt -; CHECK-NEXT: uzp1 v2.4s, v3.4s, v2.4s -; CHECK-NEXT: uzp1 v3.4s, v19.4s, v7.4s -; CHECK-NEXT: uzp1 v4.4s, v5.4s, v4.4s -; CHECK-NEXT: uzp1 v5.4s, v18.4s, v6.4s -; CHECK-NEXT: uzp1 v0.4s, v1.4s, v0.4s -; CHECK-NEXT: uzp1 v1.8h, v17.8h, v20.8h -; CHECK-NEXT: uzp1 v3.8h, v16.8h, v3.8h -; CHECK-NEXT: uzp1 v4.8h, v5.8h, v4.8h -; CHECK-NEXT: uzp1 v0.8h, v2.8h, v0.8h -; CHECK-NEXT: dup v2.16b, w10 -; CHECK-NEXT: uzp1 v1.16b, v1.16b, v3.16b -; CHECK-NEXT: dup v3.16b, w9 +; CHECK-NEXT: add x11, x1, #112 +; CHECK-NEXT: add x13, x0, #112 +; CHECK-NEXT: add x10, x1, #96 +; CHECK-NEXT: add x9, x0, #96 +; CHECK-NEXT: whilewr p1.d, x13, x11 +; CHECK-NEXT: add x12, x1, #64 +; CHECK-NEXT: whilewr p0.d, x9, x10 +; CHECK-NEXT: add x9, x0, #64 +; CHECK-NEXT: add x13, x1, #80 +; CHECK-NEXT: whilewr p2.d, x9, x12 +; CHECK-NEXT: mov z1.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: add x9, x0, #80 +; CHECK-NEXT: whilewr p1.d, x0, x1 +; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: add x14, x1, #32 +; CHECK-NEXT: whilewr p0.d, x9, x13 +; CHECK-NEXT: add x9, x0, #32 +; CHECK-NEXT: add x15, x0, #224 +; CHECK-NEXT: mov z3.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: whilewr p1.d, x9, x14 +; CHECK-NEXT: mov z2.d, p2/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: add x9, x1, #16 +; CHECK-NEXT: mov v0.s[1], v0.s[2] +; CHECK-NEXT: mov z21.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z4.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: whilewr p1.d, x15, x10 +; CHECK-NEXT: add x10, x0, #240 +; CHECK-NEXT: whilewr p2.d, x10, x11 +; CHECK-NEXT: add x10, x0, #192 +; CHECK-NEXT: add x15, x1, #48 +; CHECK-NEXT: mov z5.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: whilewr p1.d, x10, x12 +; CHECK-NEXT: add x10, x0, #208 +; CHECK-NEXT: mov z6.d, p2/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: whilewr p2.d, x10, x13 +; CHECK-NEXT: add x10, x0, #160 +; CHECK-NEXT: mov z7.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: whilewr p1.d, x10, x14 +; CHECK-NEXT: add x10, x0, #128 +; CHECK-NEXT: mov z16.d, p2/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: add x11, x0, #176 +; CHECK-NEXT: whilewr p2.d, x10, x1 +; CHECK-NEXT: mov z17.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: whilewr p1.d, x11, x15 +; CHECK-NEXT: add x10, x0, #144 +; CHECK-NEXT: add x11, x0, #16 +; CHECK-NEXT: mov z18.d, p2/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: whilewr p2.d, x10, x9 +; CHECK-NEXT: mov z19.d, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: whilewr p1.d, x11, x9 +; CHECK-NEXT: add x9, x0, #48 +; CHECK-NEXT: mov z20.d, p2/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: whilewr p2.d, x9, x15 +; CHECK-NEXT: mov v5.s[1], v5.s[2] +; CHECK-NEXT: mov v7.s[1], v7.s[2] +; CHECK-NEXT: mov v17.s[1], v17.s[2] +; CHECK-NEXT: mov v18.s[1], v18.s[2] +; CHECK-NEXT: mov v2.s[1], v2.s[2] +; CHECK-NEXT: mov v3.s[1], v3.s[2] +; CHECK-NEXT: mov v4.s[1], v4.s[2] +; CHECK-NEXT: mov z22.d, p1/z, #-1 // 
=0xffffffffffffffff +; CHECK-NEXT: mov z23.d, p2/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov v0.s[2], v1.s[0] +; CHECK-NEXT: mov v5.s[2], v6.s[0] ; CHECK-NEXT: adrp x9, .LCPI18_0 -; CHECK-NEXT: uzp1 v0.16b, v4.16b, v0.16b -; CHECK-NEXT: orr v1.16b, v1.16b, v2.16b +; CHECK-NEXT: mov v7.s[2], v16.s[0] +; CHECK-NEXT: mov v17.s[2], v19.s[0] +; CHECK-NEXT: mov v18.s[2], v20.s[0] +; CHECK-NEXT: mov v2.s[2], v21.s[0] +; CHECK-NEXT: mov v3.s[2], v22.s[0] +; CHECK-NEXT: mov v4.s[2], v23.s[0] +; CHECK-NEXT: mov v0.s[3], v1.s[2] +; CHECK-NEXT: mov v5.s[3], v6.s[2] +; CHECK-NEXT: mov v7.s[3], v16.s[2] +; CHECK-NEXT: mov v17.s[3], v19.s[2] +; CHECK-NEXT: mov v18.s[3], v20.s[2] +; CHECK-NEXT: mov v2.s[3], v21.s[2] +; CHECK-NEXT: mov v3.s[3], v22.s[2] +; CHECK-NEXT: mov v4.s[3], v23.s[2] +; CHECK-NEXT: uzp1 v1.8h, v7.8h, v5.8h +; CHECK-NEXT: uzp1 v5.8h, v18.8h, v17.8h +; CHECK-NEXT: uzp1 v0.8h, v2.8h, v0.8h +; CHECK-NEXT: uzp1 v2.8h, v3.8h, v4.8h +; CHECK-NEXT: uzp1 v1.16b, v5.16b, v1.16b +; CHECK-NEXT: uzp1 v0.16b, v2.16b, v0.16b ; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI18_0] -; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b ; CHECK-NEXT: shl v1.16b, v1.16b, #7 ; CHECK-NEXT: shl v0.16b, v0.16b, #7 ; CHECK-NEXT: cmlt v1.16b, v1.16b, #0 @@ -662,8 +635,8 @@ define <32 x i1> @whilewr_64_expand4(ptr %a, ptr %b) { ; CHECK-NEXT: zip1 v0.16b, v0.16b, v3.16b ; CHECK-NEXT: addv h1, v1.8h ; CHECK-NEXT: addv h0, v0.8h -; CHECK-NEXT: str h1, [x8] -; CHECK-NEXT: str h0, [x8, #2] +; CHECK-NEXT: str h1, [x8, #2] +; CHECK-NEXT: str h0, [x8] ; CHECK-NEXT: ret entry: %0 = call <32 x i1> @llvm.loop.dependence.war.mask.v32i1(ptr %a, ptr %b, i64 8) diff --git a/llvm/test/CodeGen/AArch64/alias_mask_scalable.ll b/llvm/test/CodeGen/AArch64/alias_mask_scalable.ll index 179dcfa11c108..cf1fed7446104 100644 --- a/llvm/test/CodeGen/AArch64/alias_mask_scalable.ll +++ b/llvm/test/CodeGen/AArch64/alias_mask_scalable.ll @@ -110,343 +110,129 @@ entry: ret <vscale x 64 x i1> %0 } -define <vscale x 16 x i1> @whilewr_16_expand(ptr %a, ptr %b) { -; CHECK-LABEL: whilewr_16_expand: +define <vscale x 16 x i1> @whilewr_16_split(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_16_split: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG -; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: index z0.d, #0, #1 -; CHECK-NEXT: sub x8, x1, x0 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: add x8, x8, x8, lsr #63 -; CHECK-NEXT: asr x8, x8, #1 -; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: mov z4.d, z0.d -; CHECK-NEXT: mov z5.d, z0.d -; CHECK-NEXT: mov z2.d, x8 -; CHECK-NEXT: incd z1.d -; CHECK-NEXT: incd z4.d, all, mul #2 -; CHECK-NEXT: incd z5.d, all, mul #4 -; CHECK-NEXT: cmphi p2.d, p0/z, z2.d, z0.d -; CHECK-NEXT: mov z3.d, z1.d -; CHECK-NEXT: cmphi p1.d, p0/z, z2.d, z1.d -; CHECK-NEXT: incd z1.d, all, mul #4 -; CHECK-NEXT: cmphi p3.d, p0/z, z2.d, z4.d -; CHECK-NEXT: incd z4.d, all, mul #4 -; CHECK-NEXT: cmphi p4.d, p0/z, z2.d, z5.d -; CHECK-NEXT: incd z3.d, all, mul #2 -; CHECK-NEXT: cmphi p5.d, p0/z, z2.d, z1.d -; CHECK-NEXT: cmphi p7.d, p0/z, z2.d, z4.d -; CHECK-NEXT: uzp1 p1.s, p2.s, p1.s -; CHECK-NEXT: mov z0.d, z3.d -; CHECK-NEXT: cmphi p6.d, p0/z, z2.d, z3.d -; CHECK-NEXT: uzp1 p2.s, p4.s, p5.s -; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: incd z0.d, all, mul #4 -; CHECK-NEXT: uzp1 p3.s, p3.s, p6.s -; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: cmphi p0.d, p0/z, z2.d, z0.d -; CHECK-NEXT: uzp1 p1.h, p1.h, p3.h -; CHECK-NEXT: cmp x8, #1 -; CHECK-NEXT: cset w8, lt -; CHECK-NEXT: sbfx x8, x8, #0, #1 -; CHECK-NEXT: uzp1 p0.s, p7.s, p0.s -; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: uzp1 p0.h, p2.h, p0.h -; CHECK-NEXT: uzp1 p0.b, p1.b, p0.b -; CHECK-NEXT: whilelo p1.b, xzr, x8 -; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: whilewr p0.h, x0, x1 +; CHECK-NEXT: incb x1 +; CHECK-NEXT: incb x0 +; CHECK-NEXT: whilewr p1.h, x0, x1 +; CHECK-NEXT: uzp1 p0.b, p0.b, p1.b ; CHECK-NEXT: ret entry: %0 = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr %a, ptr %b, i64 2) ret <vscale x 16 x i1> %0 } -define <vscale x 32 x i1> @whilewr_16_expand2(ptr %a, ptr %b) { -; CHECK-LABEL: whilewr_16_expand2: +define <vscale x 32 x i1> @whilewr_16_split2(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_16_split2: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p9, [sp, #2, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG -; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: index z0.d, #0, #1 -; CHECK-NEXT: sub x8, x1, x0 +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: mov x10, x1 +; CHECK-NEXT: whilewr p0.h, x0, x1 +; CHECK-NEXT: addvl x9, x0, #3 ; CHECK-NEXT: incb x0, all, mul #2 -; CHECK-NEXT: add x8, x8, x8, lsr #63 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: asr x8, x8, #1 -; CHECK-NEXT: sub x9, x1, x0 -; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: mov z2.d, z0.d -; CHECK-NEXT: mov z3.d, z0.d -; CHECK-NEXT: mov z5.d, x8 -; CHECK-NEXT: add x9, x9, x9, lsr #63 -; CHECK-NEXT: incd z1.d -; CHECK-NEXT: incd z2.d, all, mul #2 -; CHECK-NEXT: incd z3.d, all, mul #4 -; CHECK-NEXT: cmphi p2.d, p0/z, z5.d, z0.d -; CHECK-NEXT: asr x9, x9, #1 -; CHECK-NEXT: mov z4.d, z1.d -; CHECK-NEXT: mov z6.d, z1.d -; CHECK-NEXT: mov z7.d, z2.d -; CHECK-NEXT: cmphi p1.d, p0/z, z5.d, z1.d -; CHECK-NEXT: cmphi p3.d, p0/z, z5.d, z3.d -; CHECK-NEXT: cmphi p5.d, p0/z, z5.d, z2.d -; CHECK-NEXT: incd z4.d, all, mul #2 -; CHECK-NEXT: incd z6.d, all, mul #4 -; CHECK-NEXT: incd z7.d, all, mul #4 -; CHECK-NEXT: uzp1 p1.s, p2.s, p1.s -; CHECK-NEXT: mov z24.d, z4.d -; CHECK-NEXT: cmphi p4.d, p0/z, z5.d, z6.d -; CHECK-NEXT: cmphi p6.d, p0/z, z5.d, z4.d -; CHECK-NEXT: cmphi p7.d, p0/z, z5.d, z7.d -; CHECK-NEXT: incd z24.d, all, mul #4 -; CHECK-NEXT: uzp1 p2.s, p3.s, p4.s -; CHECK-NEXT: uzp1 p3.s, p5.s, p6.s -; CHECK-NEXT: cmphi p8.d, p0/z, z5.d, z24.d -; CHECK-NEXT: mov z5.d, x9 -; CHECK-NEXT: cmp x8, #1 -; CHECK-NEXT: uzp1 p1.h, p1.h, p3.h -; CHECK-NEXT: cset w8, lt -; CHECK-NEXT: cmphi p4.d, p0/z, z5.d, z24.d -; CHECK-NEXT: cmphi p5.d, p0/z, z5.d, z7.d -; CHECK-NEXT: cmphi p6.d, p0/z, z5.d, z6.d -; CHECK-NEXT: uzp1 p7.s, p7.s, p8.s -; CHECK-NEXT: cmphi p9.d, p0/z, z5.d, z3.d -; CHECK-NEXT: cmphi p3.d, p0/z, z5.d, z4.d -; CHECK-NEXT: cmphi p8.d, p0/z, z5.d, z2.d -; CHECK-NEXT: sbfx x8, x8, #0, #1 -; CHECK-NEXT: uzp1 p2.h, p2.h, p7.h -; CHECK-NEXT: cmphi p7.d, p0/z, z5.d, z1.d -; CHECK-NEXT: cmphi p0.d, p0/z, z5.d, z0.d -; CHECK-NEXT: uzp1 p4.s, p5.s, p4.s -; CHECK-NEXT: uzp1 p5.s, p9.s, p6.s -; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: whilelo p6.b, xzr, x8 -; CHECK-NEXT: uzp1 p3.s, p8.s, p3.s -; CHECK-NEXT: cmp x9, #1 -; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: uzp1 p0.s, p0.s, p7.s -; CHECK-NEXT: cset w8, lt -; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: uzp1 p4.h, p5.h, p4.h -; CHECK-NEXT: sbfx x8, x8, #0, #1 -; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: uzp1 p0.h, p0.h, p3.h -; CHECK-NEXT: uzp1 p1.b, p1.b, p2.b -; CHECK-NEXT: uzp1 p2.b, p0.b, p4.b -; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: whilelo p3.b, xzr, x8 -; CHECK-NEXT: sel p0.b, p1, p1.b, p6.b -; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel p1.b, p2, p2.b, p3.b -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: 
incb x8 +; CHECK-NEXT: incb x10 +; CHECK-NEXT: whilewr p1.h, x0, x1 +; CHECK-NEXT: whilewr p2.h, x8, x10 +; CHECK-NEXT: whilewr p3.h, x9, x10 +; CHECK-NEXT: uzp1 p0.b, p0.b, p2.b +; CHECK-NEXT: uzp1 p1.b, p1.b, p3.b ; CHECK-NEXT: ret entry: %0 = call <vscale x 32 x i1> @llvm.loop.dependence.war.mask.nxv32i1(ptr %a, ptr %b, i64 2) ret <vscale x 32 x i1> %0 } -define <vscale x 8 x i1> @whilewr_32_expand(ptr %a, ptr %b) { -; CHECK-LABEL: whilewr_32_expand: +define <vscale x 8 x i1> @whilewr_32_split(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_32_split: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: index z0.d, #0, #1 -; CHECK-NEXT: subs x8, x1, x0 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: add x9, x8, #3 -; CHECK-NEXT: csel x8, x9, x8, mi -; CHECK-NEXT: asr x8, x8, #2 -; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: mov z2.d, z0.d -; CHECK-NEXT: mov z3.d, x8 -; CHECK-NEXT: incd z1.d -; CHECK-NEXT: incd z2.d, all, mul #2 -; CHECK-NEXT: cmphi p1.d, p0/z, z3.d, z0.d -; CHECK-NEXT: mov z4.d, z1.d -; CHECK-NEXT: cmphi p2.d, p0/z, z3.d, z1.d -; CHECK-NEXT: cmphi p3.d, p0/z, z3.d, z2.d -; CHECK-NEXT: incd z4.d, all, mul #2 -; CHECK-NEXT: uzp1 p1.s, p1.s, p2.s -; CHECK-NEXT: cmphi p0.d, p0/z, z3.d, z4.d -; CHECK-NEXT: cmp x8, #1 -; CHECK-NEXT: cset w8, lt -; CHECK-NEXT: sbfx x8, x8, #0, #1 -; CHECK-NEXT: uzp1 p0.s, p3.s, p0.s -; CHECK-NEXT: uzp1 p0.h, p1.h, p0.h -; CHECK-NEXT: whilelo p1.h, xzr, x8 -; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b +; CHECK-NEXT: whilewr p0.s, x0, x1 +; CHECK-NEXT: incb x1 +; CHECK-NEXT: incb x0 +; CHECK-NEXT: whilewr p1.s, x0, x1 +; CHECK-NEXT: uzp1 p0.h, p0.h, p1.h ; CHECK-NEXT: ret entry: %0 = call <vscale x 8 x i1> @llvm.loop.dependence.war.mask.nxv8i1(ptr %a, ptr %b, i64 4) ret <vscale x 8 x i1> %0 } -define <vscale x 16 x i1> @whilewr_32_expand2(ptr %a, ptr %b) { -; CHECK-LABEL: whilewr_32_expand2: +define <vscale x 16 x i1> @whilewr_32_split2(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_32_split2: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG -; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: index z0.d, #0, #1 -; CHECK-NEXT: subs x8, x1, x0 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: add x9, x8, #3 -; CHECK-NEXT: csel x8, x9, x8, mi -; CHECK-NEXT: asr x8, x8, #2 -; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: mov z4.d, z0.d -; CHECK-NEXT: mov z5.d, z0.d -; CHECK-NEXT: mov z2.d, x8 -; CHECK-NEXT: incd z1.d -; CHECK-NEXT: incd z4.d, all, mul #2 -; CHECK-NEXT: incd z5.d, all, mul #4 -; CHECK-NEXT: cmphi p2.d, p0/z, z2.d, z0.d -; CHECK-NEXT: mov z3.d, z1.d -; CHECK-NEXT: cmphi p1.d, p0/z, z2.d, z1.d -; CHECK-NEXT: incd z1.d, all, mul #4 -; CHECK-NEXT: cmphi p3.d, p0/z, z2.d, z4.d -; CHECK-NEXT: incd z4.d, all, mul #4 -; CHECK-NEXT: cmphi p4.d, p0/z, z2.d, z5.d -; CHECK-NEXT: incd z3.d, all, mul #2 -; CHECK-NEXT: cmphi p5.d, p0/z, z2.d, z1.d -; CHECK-NEXT: cmphi p7.d, p0/z, z2.d, z4.d -; CHECK-NEXT: uzp1 p1.s, p2.s, p1.s -; CHECK-NEXT: mov z0.d, z3.d -; CHECK-NEXT: cmphi p6.d, p0/z, z2.d, z3.d -; CHECK-NEXT: uzp1 p2.s, p4.s, p5.s -; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: incd z0.d, all, mul #4 -; CHECK-NEXT: uzp1 p3.s, p3.s, p6.s -; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: cmphi p0.d, p0/z, z2.d, z0.d -; CHECK-NEXT: uzp1 p1.h, p1.h, p3.h -; CHECK-NEXT: cmp x8, #1 -; CHECK-NEXT: cset w8, lt -; CHECK-NEXT: sbfx x8, x8, #0, #1 -; CHECK-NEXT: uzp1 p0.s, p7.s, p0.s -; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: uzp1 p0.h, p2.h, p0.h -; CHECK-NEXT: uzp1 p0.b, p1.b, p0.b -; CHECK-NEXT: whilelo p1.b, xzr, x8 -; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b -; CHECK-NEXT: addvl sp, sp, #1 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: whilewr p0.s, x0, x1 +; CHECK-NEXT: mov x10, x1 +; CHECK-NEXT: mov x11, x0 +; CHECK-NEXT: addvl x8, x1, #3 +; CHECK-NEXT: addvl x9, x0, #3 +; CHECK-NEXT: incb x10, all, mul #2 +; CHECK-NEXT: uzp1 p0.h, p0.h, p0.h +; CHECK-NEXT: incb x11, all, mul #2 +; CHECK-NEXT: incb x1 +; CHECK-NEXT: incb x0 +; CHECK-NEXT: whilewr p1.s, x9, x8 +; CHECK-NEXT: uzp1 p0.b, p0.b, p0.b +; CHECK-NEXT: whilewr p2.s, x11, x10 +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: whilewr p3.s, x0, x1 +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: uzp1 p1.h, p2.h, p1.h +; CHECK-NEXT: uzp1 p0.h, p0.h, p3.h +; CHECK-NEXT: uzp1 p0.b, p0.b, p1.b ; CHECK-NEXT: ret entry: %0 = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr %a, ptr %b, i64 4) ret <vscale x 16 x i1> %0 } -define <vscale x 32 x i1> @whilewr_32_expand3(ptr %a, ptr %b) { -; CHECK-LABEL: whilewr_32_expand3: +define <vscale x 32 x i1> @whilewr_32_split3(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_32_split3: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p10, [sp, #1, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str p9, [sp, #2, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: index z0.d, #0, #1 -; CHECK-NEXT: subs x8, x1, x0 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: add x9, x8, #3 +; CHECK-NEXT: whilewr p0.s, x0, x1 +; CHECK-NEXT: addvl x8, x0, #3 +; CHECK-NEXT: mov x9, x0 +; CHECK-NEXT: mov x10, x0 +; CHECK-NEXT: addvl x11, x0, #7 +; CHECK-NEXT: addvl x12, x0, #6 +; CHECK-NEXT: uzp1 p0.h, p0.h, p0.h +; CHECK-NEXT: addvl x13, x0, #5 ; CHECK-NEXT: incb x0, all, mul #4 -; CHECK-NEXT: csel x8, x9, x8, mi -; CHECK-NEXT: asr x8, x8, #2 -; CHECK-NEXT: mov z1.d, z0.d -; CHECK-NEXT: mov z2.d, z0.d -; CHECK-NEXT: mov z4.d, z0.d -; CHECK-NEXT: mov z5.d, x8 -; CHECK-NEXT: incd z1.d -; CHECK-NEXT: incd z2.d, all, mul #2 -; CHECK-NEXT: incd z4.d, all, mul #4 -; CHECK-NEXT: cmphi p5.d, p0/z, z5.d, z0.d -; CHECK-NEXT: mov z3.d, z1.d -; CHECK-NEXT: mov z6.d, z2.d -; CHECK-NEXT: mov z7.d, z1.d -; CHECK-NEXT: cmphi p2.d, p0/z, z5.d, z4.d -; CHECK-NEXT: cmphi p3.d, p0/z, z5.d, z2.d -; CHECK-NEXT: cmphi p4.d, p0/z, z5.d, z1.d -; CHECK-NEXT: incd z3.d, all, mul #2 -; CHECK-NEXT: incd z6.d, all, mul #4 -; CHECK-NEXT: incd z7.d, all, mul #4 -; CHECK-NEXT: uzp1 p4.s, p5.s, p4.s -; CHECK-NEXT: mov z24.d, z3.d -; CHECK-NEXT: cmphi p6.d, p0/z, z5.d, z6.d -; CHECK-NEXT: cmphi p7.d, p0/z, z5.d, z7.d -; CHECK-NEXT: cmphi p8.d, p0/z, z5.d, z3.d -; CHECK-NEXT: incd z24.d, all, mul #4 -; CHECK-NEXT: uzp1 p2.s, p2.s, p7.s -; CHECK-NEXT: uzp1 p3.s, p3.s, p8.s -; CHECK-NEXT: cmphi p9.d, p0/z, z5.d, z24.d -; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: mov x15, x1 +; CHECK-NEXT: incb x10 +; CHECK-NEXT: addvl x14, x1, #3 +; CHECK-NEXT: uzp1 p0.b, p0.b, p0.b +; CHECK-NEXT: incb x15 +; CHECK-NEXT: incb x9, all, mul #2 +; CHECK-NEXT: whilewr p2.s, x0, x1 +; CHECK-NEXT: incb x1, all, mul #2 +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: whilewr p3.s, x10, x15 +; CHECK-NEXT: uzp1 p2.h, p2.h, p0.h +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: whilewr p1.s, x8, x14 +; CHECK-NEXT: uzp1 p0.h, p0.h, p3.h +; CHECK-NEXT: whilewr p4.s, x9, x1 +; CHECK-NEXT: uzp1 p2.b, p2.b, p0.b +; CHECK-NEXT: uzp1 p1.h, p4.h, p1.h +; CHECK-NEXT: punpklo p2.h, p2.b +; CHECK-NEXT: whilewr p3.s, x11, x14 +; CHECK-NEXT: whilewr p4.s, x12, x1 +; CHECK-NEXT: whilewr p5.s, x13, x15 +; CHECK-NEXT: punpklo p2.h, p2.b ; CHECK-NEXT: uzp1 p3.h, p4.h, p3.h -; CHECK-NEXT: cset w8, lt -; CHECK-NEXT: sbfx x8, x8, #0, #1 -; CHECK-NEXT: uzp1 p6.s, p6.s, p9.s -; CHECK-NEXT: whilelo p1.b, xzr, x8 -; CHECK-NEXT: subs x8, x1, x0 -; CHECK-NEXT: uzp1 p2.h, p2.h, p6.h -; CHECK-NEXT: add x9, x8, #3 -; CHECK-NEXT: csel x8, x9, x8, mi -; CHECK-NEXT: uzp1 p2.b, p3.b, p2.b -; CHECK-NEXT: asr x8, x8, #2 -; CHECK-NEXT: mov z5.d, x8 -; CHECK-NEXT: cmphi p5.d, p0/z, z5.d, z24.d -; CHECK-NEXT: cmphi p7.d, p0/z, z5.d, z6.d -; CHECK-NEXT: cmphi p8.d, p0/z, z5.d, z7.d -; CHECK-NEXT: cmphi p9.d, p0/z, z5.d, z4.d -; CHECK-NEXT: cmphi p4.d, p0/z, z5.d, z3.d -; CHECK-NEXT: cmphi p10.d, p0/z, z5.d, z2.d -; CHECK-NEXT: cmphi p6.d, 
p0/z, z5.d, z1.d
-; CHECK-NEXT: cmphi p0.d, p0/z, z5.d, z0.d
-; CHECK-NEXT: cmp x8, #1
-; CHECK-NEXT: uzp1 p5.s, p7.s, p5.s
-; CHECK-NEXT: cset w8, lt
-; CHECK-NEXT: uzp1 p7.s, p9.s, p8.s
-; CHECK-NEXT: sbfx x8, x8, #0, #1
-; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: uzp1 p4.s, p10.s, p4.s
-; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: uzp1 p0.s, p0.s, p6.s
-; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: uzp1 p5.h, p7.h, p5.h
-; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: uzp1 p0.h, p0.h, p4.h
-; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: whilelo p4.b, xzr, x8
-; CHECK-NEXT: uzp1 p3.b, p0.b, p5.b
-; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: sel p0.b, p2, p2.b, p1.b
-; CHECK-NEXT: sel p1.b, p3, p3.b, p4.b
 ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: uzp1 p2.h, p2.h, p5.h
+; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: uzp1 p0.b, p0.b, p1.b
+; CHECK-NEXT: uzp1 p1.b, p2.b, p3.b
 ; CHECK-NEXT: addvl sp, sp, #1
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -455,120 +241,100 @@ entry:
   ret <vscale x 32 x i1> %0
 }
 
-define <vscale x 4 x i1> @whilewr_64_expand(ptr %a, ptr %b) {
-; CHECK-LABEL: whilewr_64_expand:
+define <vscale x 4 x i1> @whilewr_64_split(ptr %a, ptr %b) {
+; CHECK-LABEL: whilewr_64_split:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: index z0.d, #0, #1
-; CHECK-NEXT: subs x8, x1, x0
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: add x9, x8, #7
-; CHECK-NEXT: csel x8, x9, x8, mi
-; CHECK-NEXT: asr x8, x8, #3
-; CHECK-NEXT: mov z1.d, z0.d
-; CHECK-NEXT: mov z2.d, x8
-; CHECK-NEXT: incd z1.d
-; CHECK-NEXT: cmphi p1.d, p0/z, z2.d, z0.d
-; CHECK-NEXT: cmphi p0.d, p0/z, z2.d, z1.d
-; CHECK-NEXT: cmp x8, #1
-; CHECK-NEXT: cset w8, lt
-; CHECK-NEXT: sbfx x8, x8, #0, #1
-; CHECK-NEXT: uzp1 p0.s, p1.s, p0.s
-; CHECK-NEXT: whilelo p1.s, xzr, x8
-; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b
+; CHECK-NEXT: whilewr p0.d, x0, x1
+; CHECK-NEXT: incb x1
+; CHECK-NEXT: incb x0
+; CHECK-NEXT: whilewr p1.d, x0, x1
+; CHECK-NEXT: uzp1 p0.s, p0.s, p1.s
 ; CHECK-NEXT: ret
 entry:
   %0 = call <vscale x 4 x i1> @llvm.loop.dependence.war.mask.nxv4i1(ptr %a, ptr %b, i64 8)
   ret <vscale x 4 x i1> %0
 }
 
-define <vscale x 8 x i1> @whilewr_64_expand2(ptr %a, ptr %b) {
-; CHECK-LABEL: whilewr_64_expand2:
+define <vscale x 8 x i1> @whilewr_64_split2(ptr %a, ptr %b) {
+; CHECK-LABEL: whilewr_64_split2:
 ; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: index z0.d, #0, #1
-; CHECK-NEXT: subs x8, x1, x0
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: add x9, x8, #7
-; CHECK-NEXT: csel x8, x9, x8, mi
-; CHECK-NEXT: asr x8, x8, #3
-; CHECK-NEXT: mov z1.d, z0.d
-; CHECK-NEXT: mov z2.d, z0.d
-; CHECK-NEXT: mov z3.d, x8
-; CHECK-NEXT: incd z1.d
-; CHECK-NEXT: incd z2.d, all, mul #2
-; CHECK-NEXT: cmphi p1.d, p0/z, z3.d, z0.d
-; CHECK-NEXT: mov z4.d, z1.d
-; CHECK-NEXT: cmphi p2.d, p0/z, z3.d, z1.d
-; CHECK-NEXT: cmphi p3.d, p0/z, z3.d, z2.d
-; CHECK-NEXT: incd z4.d, all, mul #2
-; CHECK-NEXT: uzp1 p1.s, p1.s, p2.s
-; CHECK-NEXT: cmphi p0.d, p0/z, z3.d, z4.d
-; CHECK-NEXT: cmp x8, #1
-; CHECK-NEXT: cset w8, lt
-; CHECK-NEXT: sbfx x8, x8, #0, #1
-; CHECK-NEXT: uzp1 p0.s, p3.s, p0.s
-; CHECK-NEXT: uzp1 p0.h, p1.h, p0.h
-; CHECK-NEXT: whilelo p1.h, xzr, x8
-; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b
+; CHECK-NEXT: whilewr p0.d, x0, x1
+; CHECK-NEXT: mov x10, x1
+; CHECK-NEXT: mov x11, x0
+; CHECK-NEXT: addvl x8, x1, #3
+; CHECK-NEXT: addvl x9, x0, #3
+; CHECK-NEXT: incb x10, all, mul #2
+; CHECK-NEXT: uzp1 p0.s, p0.s, p0.s
+; CHECK-NEXT: incb x11, all, mul #2
+; CHECK-NEXT: incb x1
+; CHECK-NEXT: incb x0
+; CHECK-NEXT: whilewr p1.d, x9, x8
+; CHECK-NEXT: uzp1 p0.h, p0.h, p0.h
+; CHECK-NEXT: whilewr p2.d, x11, x10
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: whilewr p3.d, x0, x1
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: uzp1 p1.s, p2.s, p1.s
+; CHECK-NEXT: uzp1 p0.s, p0.s, p3.s
+; CHECK-NEXT: uzp1 p0.h, p0.h, p1.h
 ; CHECK-NEXT: ret
 entry:
   %0 = call <vscale x 8 x i1> @llvm.loop.dependence.war.mask.nxv8i1(ptr %a, ptr %b, i64 8)
   ret <vscale x 8 x i1> %0
 }
 
-define <vscale x 16 x i1> @whilewr_64_expand3(ptr %a, ptr %b) {
-; CHECK-LABEL: whilewr_64_expand3:
+define <vscale x 16 x i1> @whilewr_64_split3(ptr %a, ptr %b) {
+; CHECK-LABEL: whilewr_64_split3:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG
 ; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: index z0.d, #0, #1
-; CHECK-NEXT: subs x8, x1, x0
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: add x9, x8, #7
-; CHECK-NEXT: csel x8, x9, x8, mi
-; CHECK-NEXT: asr x8, x8, #3
-; CHECK-NEXT: mov z1.d, z0.d
-; CHECK-NEXT: mov z4.d, z0.d
-; CHECK-NEXT: mov z5.d, z0.d
-; CHECK-NEXT: mov z2.d, x8
-; CHECK-NEXT: incd z1.d
-; CHECK-NEXT: incd z4.d, all, mul #2
-; CHECK-NEXT: incd z5.d, all, mul #4
-; CHECK-NEXT: cmphi p2.d, p0/z, z2.d, z0.d
-; CHECK-NEXT: mov z3.d, z1.d
-; CHECK-NEXT: cmphi p1.d, p0/z, z2.d, z1.d
-; CHECK-NEXT: incd z1.d, all, mul #4
-; CHECK-NEXT: cmphi p3.d, p0/z, z2.d, z4.d
-; CHECK-NEXT: incd z4.d, all, mul #4
-; CHECK-NEXT: cmphi p4.d, p0/z, z2.d, z5.d
-; CHECK-NEXT: incd z3.d, all, mul #2
-; CHECK-NEXT: cmphi p5.d, p0/z, z2.d, z1.d
-; CHECK-NEXT: cmphi p7.d, p0/z, z2.d, z4.d
+; CHECK-NEXT: whilewr p0.d, x0, x1
+; CHECK-NEXT: mov x10, x1
+; CHECK-NEXT: mov x11, x0
+; CHECK-NEXT: mov x12, x1
+; CHECK-NEXT: mov x13, x0
+; CHECK-NEXT: incb x10, all, mul #2
+; CHECK-NEXT: incb x11, all, mul #2
+; CHECK-NEXT: incb x12
+; CHECK-NEXT: incb x13
+; CHECK-NEXT: uzp1 p0.s, p0.s, p0.s
+; CHECK-NEXT: addvl x8, x1, #3
+; CHECK-NEXT: addvl x9, x0, #3
+; CHECK-NEXT: whilewr p1.d, x9, x8
+; CHECK-NEXT: addvl x8, x1, #7
+; CHECK-NEXT: addvl x9, x0, #7
+; CHECK-NEXT: uzp1 p0.h, p0.h, p0.h
+; CHECK-NEXT: whilewr p2.d, x11, x10
+; CHECK-NEXT: addvl x10, x1, #6
+; CHECK-NEXT: addvl x11, x0, #6
+; CHECK-NEXT: whilewr p3.d, x13, x12
+; CHECK-NEXT: addvl x12, x1, #5
+; CHECK-NEXT: addvl x13, x0, #5
+; CHECK-NEXT: incb x1, all, mul #4
+; CHECK-NEXT: incb x0, all, mul #4
+; CHECK-NEXT: punpklo p0.h, p0.b
 ; CHECK-NEXT: uzp1 p1.s, p2.s, p1.s
-; CHECK-NEXT: mov z0.d, z3.d
-; CHECK-NEXT: cmphi p6.d, p0/z, z2.d, z3.d
-; CHECK-NEXT: uzp1 p2.s, p4.s, p5.s
-; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: whilewr p5.d, x0, x1
+; CHECK-NEXT: whilewr p4.d, x9, x8
+; CHECK-NEXT: uzp1 p2.s, p5.s, p0.s
+; CHECK-NEXT: uzp1 p0.s, p0.s, p3.s
+; CHECK-NEXT: whilewr p3.d, x11, x10
+; CHECK-NEXT: uzp1 p2.h, p2.h, p0.h
+; CHECK-NEXT: whilewr p5.d, x13, x12
+; CHECK-NEXT: punpklo p2.h, p2.b
+; CHECK-NEXT: uzp1 p3.s, p3.s, p4.s
 ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: incd z0.d, all, mul #4
-; CHECK-NEXT: uzp1 p3.s, p3.s, p6.s
-; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: cmphi p0.d, p0/z, z2.d, z0.d
-; CHECK-NEXT: uzp1 p1.h, p1.h, p3.h
-; CHECK-NEXT: cmp x8, #1
-; CHECK-NEXT: cset w8, lt
-; CHECK-NEXT: sbfx x8, x8, #0, #1
-; CHECK-NEXT: uzp1 p0.s, p7.s, p0.s
-; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: uzp1 p0.h, p2.h, p0.h
-; CHECK-NEXT: uzp1 p0.b, p1.b, p0.b
-; CHECK-NEXT: whilelo p1.b, xzr, x8
-; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b
+; CHECK-NEXT: punpklo p2.h, p2.b
+; CHECK-NEXT: uzp1 p0.h, p0.h, p1.h
+; CHECK-NEXT: uzp1 p2.s, p2.s, p5.s
+; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: uzp1 p1.h, p2.h, p3.h
+; CHECK-NEXT: uzp1 p0.b, p0.b, p1.b
 ; CHECK-NEXT: addvl sp, sp, #1
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
@@ -577,94 +343,95 @@ entry:
   ret <vscale x 16 x i1> %0
 }
 
-define <vscale x 32 x i1> @whilewr_64_expand4(ptr %a, ptr %b) {
-; CHECK-LABEL: whilewr_64_expand4:
+define <vscale x 32 x i1> @whilewr_64_split4(ptr %a, ptr %b) {
+; CHECK-LABEL: whilewr_64_split4:
 ; CHECK: // %bb.0: // %entry
 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
 ; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: str p10, [sp, #1, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: str p9, [sp, #2, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG
 ; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: index z0.d, #0, #1
-; CHECK-NEXT: subs x8, x1, x0
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: add x9, x8, #7
-; CHECK-NEXT: csel x8, x9, x8, mi
-; CHECK-NEXT: addvl x9, x0, #8
-; CHECK-NEXT: asr x8, x8, #3
-; CHECK-NEXT: mov z1.d, z0.d
-; CHECK-NEXT: mov z2.d, z0.d
-; CHECK-NEXT: mov z4.d, z0.d
-; CHECK-NEXT: mov z5.d, x8
-; CHECK-NEXT: incd z1.d
-; CHECK-NEXT: incd z2.d, all, mul #2
-; CHECK-NEXT: incd z4.d, all, mul #4
-; CHECK-NEXT: cmphi p5.d, p0/z, z5.d, z0.d
-; CHECK-NEXT: mov z3.d, z1.d
-; CHECK-NEXT: mov z6.d, z2.d
-; CHECK-NEXT: mov z7.d, z1.d
-; CHECK-NEXT: cmphi p2.d, p0/z, z5.d, z4.d
-; CHECK-NEXT: cmphi p3.d, p0/z, z5.d, z2.d
-; CHECK-NEXT: cmphi p4.d, p0/z, z5.d, z1.d
-; CHECK-NEXT: incd z3.d, all, mul #2
-; CHECK-NEXT: incd z6.d, all, mul #4
-; CHECK-NEXT: incd z7.d, all, mul #4
+; CHECK-NEXT: addvl x8, x1, #3
+; CHECK-NEXT: addvl x11, x0, #3
+; CHECK-NEXT: whilewr p1.d, x0, x1
+; CHECK-NEXT: whilewr p0.d, x11, x8
+; CHECK-NEXT: addvl x11, x0, #8
+; CHECK-NEXT: mov x9, x1
+; CHECK-NEXT: mov x10, x0
+; CHECK-NEXT: mov x14, x0
+; CHECK-NEXT: incb x9
+; CHECK-NEXT: uzp1 p2.s, p1.s, p0.s
+; CHECK-NEXT: incb x10
+; CHECK-NEXT: incb x14, all, mul #2
+; CHECK-NEXT: whilewr p1.d, x11, x1
+; CHECK-NEXT: mov x11, x1
+; CHECK-NEXT: mov x12, x1
+; CHECK-NEXT: uzp1 p2.h, p2.h, p0.h
+; CHECK-NEXT: incb x11, all, mul #2
+; CHECK-NEXT: mov x13, x0
+; CHECK-NEXT: incb x12, all, mul #4
+; CHECK-NEXT: incb x13, all, mul #4
+; CHECK-NEXT: whilewr p3.d, x10, x9
+; CHECK-NEXT: punpklo p2.h, p2.b
+; CHECK-NEXT: addvl x10, x1, #7
+; CHECK-NEXT: addvl x15, x0, #5
+; CHECK-NEXT: whilewr p5.d, x14, x11
+; CHECK-NEXT: addvl x14, x0, #6
+; CHECK-NEXT: punpklo p2.h, p2.b
+; CHECK-NEXT: whilewr p4.d, x13, x12
+; CHECK-NEXT: addvl x13, x0, #7
+; CHECK-NEXT: uzp1 p0.s, p5.s, p0.s
+; CHECK-NEXT: uzp1 p2.s, p2.s, p3.s
+; CHECK-NEXT: uzp1 p3.s, p4.s, p0.s
+; CHECK-NEXT: uzp1 p0.h, p2.h, p0.h
+; CHECK-NEXT: whilewr p2.d, x13, x10
+; CHECK-NEXT: addvl x13, x1, #6
+; CHECK-NEXT: uzp1 p3.h, p3.h, p0.h
+; CHECK-NEXT: whilewr p4.d, x14, x13
+; CHECK-NEXT: addvl x14, x1, #5
+; CHECK-NEXT: punpklo p3.h, p3.b
+; CHECK-NEXT: whilewr p5.d, x15, x14
+; CHECK-NEXT: addvl x15, x0, #12
+; CHECK-NEXT: punpklo p3.h, p3.b
+; CHECK-NEXT: uzp1 p2.s, p4.s, p2.s
+; CHECK-NEXT: uzp1 p3.s, p3.s, p5.s
+; CHECK-NEXT: whilewr p4.d, x15, x12
+; CHECK-NEXT: addvl x12, x0, #15
+; CHECK-NEXT: uzp1 p2.h, p3.h, p2.h
+; CHECK-NEXT: uzp1 p3.s, p4.s, p0.s
+; CHECK-NEXT: uzp1 p1.s, p1.s, p0.s
+; CHECK-NEXT: uzp1 p3.h, p3.h, p0.h
+; CHECK-NEXT: whilewr p4.d, x12, x10
+; CHECK-NEXT: addvl x10, x0, #14
+; CHECK-NEXT: punpklo p3.h, p3.b
+; CHECK-NEXT: whilewr p5.d, x10, x13
+; CHECK-NEXT: addvl x10, x0, #13
+; CHECK-NEXT: uzp1 p1.h, p1.h, p0.h
+; CHECK-NEXT: whilewr p6.d, x10, x14
+; CHECK-NEXT: addvl x10, x0, #11
+; CHECK-NEXT: punpklo p3.h, p3.b
 ; CHECK-NEXT: uzp1 p4.s, p5.s, p4.s
-; CHECK-NEXT: mov z24.d, z3.d
-; CHECK-NEXT: cmphi p6.d, p0/z, z5.d, z6.d
-; CHECK-NEXT: cmphi p7.d, p0/z, z5.d, z7.d
-; CHECK-NEXT: cmphi p8.d, p0/z, z5.d, z3.d
-; CHECK-NEXT: incd z24.d, all, mul #4
-; CHECK-NEXT: uzp1 p2.s, p2.s, p7.s
-; CHECK-NEXT: uzp1 p3.s, p3.s, p8.s
-; CHECK-NEXT: cmphi p9.d, p0/z, z5.d, z24.d
-; CHECK-NEXT: cmp x8, #1
-; CHECK-NEXT: uzp1 p3.h, p4.h, p3.h
-; CHECK-NEXT: cset w8, lt
-; CHECK-NEXT: sbfx x8, x8, #0, #1
-; CHECK-NEXT: uzp1 p6.s, p6.s, p9.s
-; CHECK-NEXT: whilelo p1.b, xzr, x8
-; CHECK-NEXT: subs x8, x1, x9
-; CHECK-NEXT: uzp1 p2.h, p2.h, p6.h
-; CHECK-NEXT: add x9, x8, #7
-; CHECK-NEXT: csel x8, x9, x8, mi
-; CHECK-NEXT: uzp1 p2.b, p3.b, p2.b
-; CHECK-NEXT: asr x8, x8, #3
-; CHECK-NEXT: mov z5.d, x8
-; CHECK-NEXT: cmphi p5.d, p0/z, z5.d, z24.d
-; CHECK-NEXT: cmphi p7.d, p0/z, z5.d, z6.d
-; CHECK-NEXT: cmphi p8.d, p0/z, z5.d, z7.d
-; CHECK-NEXT: cmphi p9.d, p0/z, z5.d, z4.d
-; CHECK-NEXT: cmphi p4.d, p0/z, z5.d, z3.d
-; CHECK-NEXT: cmphi p10.d, p0/z, z5.d, z2.d
-; CHECK-NEXT: cmphi p6.d, p0/z, z5.d, z1.d
-; CHECK-NEXT: cmphi p0.d, p0/z, z5.d, z0.d
-; CHECK-NEXT: cmp x8, #1
-; CHECK-NEXT: uzp1 p5.s, p7.s, p5.s
-; CHECK-NEXT: cset w8, lt
-; CHECK-NEXT: uzp1 p7.s, p9.s, p8.s
-; CHECK-NEXT: sbfx x8, x8, #0, #1
-; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: uzp1 p4.s, p10.s, p4.s
-; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: uzp1 p0.s, p0.s, p6.s
-; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: uzp1 p5.h, p7.h, p5.h
-; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: uzp1 p0.h, p0.h, p4.h
+; CHECK-NEXT: whilewr p5.d, x10, x8
+; CHECK-NEXT: addvl x8, x0, #10
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: uzp1 p3.s, p3.s, p6.s
+; CHECK-NEXT: whilewr p6.d, x8, x11
+; CHECK-NEXT: addvl x8, x0, #9
+; CHECK-NEXT: whilewr p7.d, x8, x9
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: uzp1 p5.s, p6.s, p5.s
 ; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: whilelo p4.b, xzr, x8
-; CHECK-NEXT: uzp1 p3.b, p0.b, p5.b
-; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: sel p0.b, p2, p2.b, p1.b
-; CHECK-NEXT: sel p1.b, p3, p3.b, p4.b
+; CHECK-NEXT: uzp1 p1.s, p1.s, p7.s
+; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: uzp1 p3.h, p3.h, p4.h
 ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: uzp1 p1.h, p1.h, p5.h
+; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: uzp1 p0.b, p0.b, p2.b
+; CHECK-NEXT: uzp1 p1.b, p1.b, p3.b
 ; CHECK-NEXT: addvl sp, sp, #1
 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret