llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-clang-codegen @llvm/pr-subscribers-backend-aarch64 Author: Kerry McLaughlin (kmclaughlin-arm) <details> <summary>Changes</summary> Builtins for reading the streaming vector length are canonicalised to use the aarch64.sme.cntsd intrinsic and a multiply, i.e. - cntsb -> cntsd * 8 - cntsh -> cntsd * 4 - cntsw -> cntsd * 2 This patch also removes the LLVM intrinsics for cnts[b,h,w], and adds patterns to improve codegen when cntsd is multiplied by a constant. --- Patch is 22.83 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/154761.diff 12 Files Affected: - (modified) clang/include/clang/Basic/arm_sme.td (+7-8) - (modified) clang/lib/CodeGen/TargetBuiltins/ARM.cpp (+28-2) - (modified) clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_cnt.c (+24-18) - (modified) llvm/include/llvm/IR/IntrinsicsAArch64.td (+2-7) - (modified) llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp (+20) - (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+3-20) - (modified) llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td (+25) - (modified) llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (+5-11) - (modified) llvm/test/CodeGen/AArch64/sme-intrinsics-rdsvl.ll (+64-18) - (modified) llvm/test/CodeGen/AArch64/sme-streaming-interface-remarks.ll (+2-2) - (modified) llvm/test/CodeGen/AArch64/sme-streaming-interface.ll (+4-3) - (modified) llvm/test/Transforms/InstCombine/AArch64/sme-intrinsic-opts-counting-elems.ll (-45) ``````````diff diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td index a4eb92e76968c..f853122994497 100644 --- a/clang/include/clang/Basic/arm_sme.td +++ b/clang/include/clang/Basic/arm_sme.td @@ -156,16 +156,15 @@ let SMETargetGuard = "sme2p1" in { //////////////////////////////////////////////////////////////////////////////// // SME - Counting elements in a streaming vector -multiclass ZACount<string n_suffix> { - def NAME : SInst<"sv" # 
n_suffix, "nv", "", MergeNone, - "aarch64_sme_" # n_suffix, - [IsOverloadNone, IsStreamingCompatible]>; +multiclass ZACount<string intr, string n_suffix> { + def NAME : SInst<"sv"#n_suffix, "nv", "", MergeNone, + intr, [IsOverloadNone, IsStreamingCompatible]>; } -defm SVCNTSB : ZACount<"cntsb">; -defm SVCNTSH : ZACount<"cntsh">; -defm SVCNTSW : ZACount<"cntsw">; -defm SVCNTSD : ZACount<"cntsd">; +defm SVCNTSB : ZACount<"", "cntsb">; +defm SVCNTSH : ZACount<"", "cntsh">; +defm SVCNTSW : ZACount<"", "cntsw">; +defm SVCNTSD : ZACount<"aarch64_sme_cntsd", "cntsd">; //////////////////////////////////////////////////////////////////////////////// // SME - ADDHA/ADDVA diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp index 60413e7b18e85..217232db44b6f 100644 --- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp @@ -4304,9 +4304,10 @@ Value *CodeGenFunction::EmitSMELd1St1(const SVETypeFlags &TypeFlags, // size in bytes. if (Ops.size() == 5) { Function *StreamingVectorLength = - CGM.getIntrinsic(Intrinsic::aarch64_sme_cntsb); + CGM.getIntrinsic(Intrinsic::aarch64_sme_cntsd); llvm::Value *StreamingVectorLengthCall = - Builder.CreateCall(StreamingVectorLength); + Builder.CreateMul(Builder.CreateCall(StreamingVectorLength), + llvm::ConstantInt::get(Int64Ty, 8), "svl"); llvm::Value *Mulvl = Builder.CreateMul(StreamingVectorLengthCall, Ops[4], "mulvl"); // The type of the ptr parameter is void *, so use Int8Ty here. 
@@ -4918,6 +4919,31 @@ Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID, // Handle builtins which require their multi-vector operands to be swapped swapCommutativeSMEOperands(BuiltinID, Ops); + auto isCntsBuiltin = [&](int64_t &Mul) { + switch (BuiltinID) { + default: + Mul = 0; + return false; + case SME::BI__builtin_sme_svcntsb: + Mul = 8; + return true; + case SME::BI__builtin_sme_svcntsh: + Mul = 4; + return true; + case SME::BI__builtin_sme_svcntsw: + Mul = 2; + return true; + } + }; + + int64_t Mul = 0; + if (isCntsBuiltin(Mul)) { + llvm::Value *Cntd = + Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_sme_cntsd)); + return Builder.CreateMul(Cntd, llvm::ConstantInt::get(Int64Ty, Mul), + "mulsvl", /* HasNUW */ true, /* HasNSW */ true); + } + // Should not happen! if (Builtin->LLVMIntrinsic == 0) return nullptr; diff --git a/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_cnt.c b/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_cnt.c index c0b3e1a06b0ff..049c1742e5a9d 100644 --- a/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_cnt.c +++ b/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_cnt.c @@ -6,49 +6,55 @@ #include <arm_sme.h> -// CHECK-C-LABEL: define dso_local i64 @test_svcntsb( +// CHECK-C-LABEL: define dso_local range(i64 0, -9223372036854775808) i64 @test_svcntsb( // CHECK-C-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-C-NEXT: entry: -// CHECK-C-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-C-NEXT: ret i64 [[TMP0]] +// CHECK-C-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsd() +// CHECK-C-NEXT: [[MULSVL:%.*]] = shl nuw nsw i64 [[TMP0]], 3 +// CHECK-C-NEXT: ret i64 [[MULSVL]] // -// CHECK-CXX-LABEL: define dso_local noundef i64 @_Z12test_svcntsbv( +// CHECK-CXX-LABEL: define dso_local noundef range(i64 0, -9223372036854775808) i64 @_Z12test_svcntsbv( // CHECK-CXX-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-CXX-NEXT: entry: -// CHECK-CXX-NEXT: [[TMP0:%.*]] = 
tail call i64 @llvm.aarch64.sme.cntsb() -// CHECK-CXX-NEXT: ret i64 [[TMP0]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsd() +// CHECK-CXX-NEXT: [[MULSVL:%.*]] = shl nuw nsw i64 [[TMP0]], 3 +// CHECK-CXX-NEXT: ret i64 [[MULSVL]] // uint64_t test_svcntsb() { return svcntsb(); } -// CHECK-C-LABEL: define dso_local i64 @test_svcntsh( +// CHECK-C-LABEL: define dso_local range(i64 0, -9223372036854775808) i64 @test_svcntsh( // CHECK-C-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-C-NEXT: entry: -// CHECK-C-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsh() -// CHECK-C-NEXT: ret i64 [[TMP0]] +// CHECK-C-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsd() +// CHECK-C-NEXT: [[MULSVL:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +// CHECK-C-NEXT: ret i64 [[MULSVL]] // -// CHECK-CXX-LABEL: define dso_local noundef i64 @_Z12test_svcntshv( +// CHECK-CXX-LABEL: define dso_local noundef range(i64 0, -9223372036854775808) i64 @_Z12test_svcntshv( // CHECK-CXX-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-CXX-NEXT: entry: -// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsh() -// CHECK-CXX-NEXT: ret i64 [[TMP0]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsd() +// CHECK-CXX-NEXT: [[MULSVL:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +// CHECK-CXX-NEXT: ret i64 [[MULSVL]] // uint64_t test_svcntsh() { return svcntsh(); } -// CHECK-C-LABEL: define dso_local i64 @test_svcntsw( +// CHECK-C-LABEL: define dso_local range(i64 0, -9223372036854775808) i64 @test_svcntsw( // CHECK-C-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-C-NEXT: entry: -// CHECK-C-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsw() -// CHECK-C-NEXT: ret i64 [[TMP0]] +// CHECK-C-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsd() +// CHECK-C-NEXT: [[MULSVL:%.*]] = shl nuw nsw i64 [[TMP0]], 1 +// CHECK-C-NEXT: ret i64 [[MULSVL]] // -// CHECK-CXX-LABEL: define dso_local noundef i64 @_Z12test_svcntswv( +// CHECK-CXX-LABEL: 
define dso_local noundef range(i64 0, -9223372036854775808) i64 @_Z12test_svcntswv( // CHECK-CXX-SAME: ) local_unnamed_addr #[[ATTR0]] { // CHECK-CXX-NEXT: entry: -// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsw() -// CHECK-CXX-NEXT: ret i64 [[TMP0]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call i64 @llvm.aarch64.sme.cntsd() +// CHECK-CXX-NEXT: [[MULSVL:%.*]] = shl nuw nsw i64 [[TMP0]], 1 +// CHECK-CXX-NEXT: ret i64 [[MULSVL]] // uint64_t test_svcntsw() { return svcntsw(); diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 6d53bf8b172d8..7c9aef52b3acf 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -3147,13 +3147,8 @@ let TargetPrefix = "aarch64" in { // Counting elements // - class AdvSIMD_SME_CNTSB_Intrinsic - : DefaultAttrsIntrinsic<[llvm_i64_ty], [], [IntrNoMem]>; - - def int_aarch64_sme_cntsb : AdvSIMD_SME_CNTSB_Intrinsic; - def int_aarch64_sme_cntsh : AdvSIMD_SME_CNTSB_Intrinsic; - def int_aarch64_sme_cntsw : AdvSIMD_SME_CNTSB_Intrinsic; - def int_aarch64_sme_cntsd : AdvSIMD_SME_CNTSB_Intrinsic; + def int_aarch64_sme_cntsd + : DefaultAttrsIntrinsic<[llvm_i64_ty], [], [IntrNoMem]>; // // PSTATE Functions diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index bc786f415b554..4e8255bab9437 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -71,6 +71,9 @@ class AArch64DAGToDAGISel : public SelectionDAGISel { template <signed Low, signed High, signed Scale> bool SelectRDVLImm(SDValue N, SDValue &Imm); + template <signed Low, signed High> + bool SelectRDSVLShiftImm(SDValue N, SDValue &Imm); + bool SelectArithExtendedRegister(SDValue N, SDValue &Reg, SDValue &Shift); bool SelectArithUXTXRegister(SDValue N, SDValue &Reg, SDValue &Shift); bool SelectArithImmed(SDValue N, SDValue &Val, SDValue &Shift); @@ 
-937,6 +940,23 @@ bool AArch64DAGToDAGISel::SelectRDVLImm(SDValue N, SDValue &Imm) { return false; } +template <signed Low, signed High> +bool AArch64DAGToDAGISel::SelectRDSVLShiftImm(SDValue N, SDValue &Imm) { + if (!isa<ConstantSDNode>(N)) + return false; + + int64_t ShlImm = cast<ConstantSDNode>(N)->getSExtValue(); + if (ShlImm >= 3) { + int64_t MulImm = 1 << (ShlImm - 3); + if (MulImm >= Low && MulImm <= High) { + Imm = CurDAG->getSignedTargetConstant(MulImm, SDLoc(N), MVT::i32); + return true; + } + } + + return false; +} + /// SelectArithExtendedRegister - Select a "extended register" operand. This /// operand folds in an extend followed by an optional left shift. bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg, diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 23328ed57fb36..08f0ae0b2f783 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -6266,26 +6266,6 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::aarch64_sve_clz: return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, DL, Op.getValueType(), Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); - case Intrinsic::aarch64_sme_cntsb: - return DAG.getNode(AArch64ISD::RDSVL, DL, Op.getValueType(), - DAG.getConstant(1, DL, MVT::i32)); - case Intrinsic::aarch64_sme_cntsh: { - SDValue One = DAG.getConstant(1, DL, MVT::i32); - SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, DL, Op.getValueType(), One); - return DAG.getNode(ISD::SRL, DL, Op.getValueType(), Bytes, One); - } - case Intrinsic::aarch64_sme_cntsw: { - SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, DL, Op.getValueType(), - DAG.getConstant(1, DL, MVT::i32)); - return DAG.getNode(ISD::SRL, DL, Op.getValueType(), Bytes, - DAG.getConstant(2, DL, MVT::i32)); - } - case Intrinsic::aarch64_sme_cntsd: { - SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, DL, 
Op.getValueType(), - DAG.getConstant(1, DL, MVT::i32)); - return DAG.getNode(ISD::SRL, DL, Op.getValueType(), Bytes, - DAG.getConstant(3, DL, MVT::i32)); - } case Intrinsic::aarch64_sve_cnt: { SDValue Data = Op.getOperand(3); // CTPOP only supports integer operands. @@ -19200,6 +19180,9 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, if (ConstValue.sge(1) && ConstValue.sle(16)) return SDValue(); + if (getIntrinsicID(N0.getNode()) == Intrinsic::aarch64_sme_cntsd) + return SDValue(); + // Multiplication of a power of two plus/minus one can be done more // cheaply as shift+add/sub. For now, this is true unilaterally. If // future CPUs have a cheaper MADD instruction, this may need to be diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index 0d8cb3a76d0be..3b27203d45585 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -127,12 +127,37 @@ def : Pat<(AArch64_requires_za_save), (RequiresZASavePseudo)>; def SDT_AArch64RDSVL : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>]>; def AArch64rdsvl : SDNode<"AArch64ISD::RDSVL", SDT_AArch64RDSVL>; +def sme_cntsb_mul_imm : ComplexPattern<i64, 1, "SelectRDVLImm<1, 31, 8>">; +def sme_cntsh_mul_imm : ComplexPattern<i64, 1, "SelectRDVLImm<1, 31, 4>">; +def sme_cntsw_mul_imm : ComplexPattern<i64, 1, "SelectRDVLImm<1, 31, 2>">; +def sme_cntsd_mul_imm : ComplexPattern<i64, 1, "SelectRDVLImm<1, 31, 1>">; + +def sme_cnts_shl_imm : ComplexPattern<i64, 1, "SelectRDSVLShiftImm<1, 31>">; + let Predicates = [HasSMEandIsNonStreamingSafe] in { def RDSVLI_XI : sve_int_read_vl_a<0b0, 0b11111, "rdsvl", /*streaming_sve=*/0b1>; def ADDSPL_XXI : sve_int_arith_vl<0b1, "addspl", /*streaming_sve=*/0b1>; def ADDSVL_XXI : sve_int_arith_vl<0b0, "addsvl", /*streaming_sve=*/0b1>; def : Pat<(AArch64rdsvl (i32 simm6_32b:$imm)), (RDSVLI_XI simm6_32b:$imm)>; + +// e.g. 
cntsb() * imm +def : Pat<(i64 (mul (int_aarch64_sme_cntsd), (sme_cntsb_mul_imm i64:$imm))), + (RDSVLI_XI (!cast<SDNodeXForm>("trunc_imm") $imm))>; +def : Pat<(i64 (mul (int_aarch64_sme_cntsd), (sme_cntsh_mul_imm i64:$imm))), + (UBFMXri (RDSVLI_XI (!cast<SDNodeXForm>("trunc_imm") $imm)), 1, 63)>; +def : Pat<(i64 (mul (int_aarch64_sme_cntsd), (sme_cntsw_mul_imm i64:$imm))), + (UBFMXri (RDSVLI_XI (!cast<SDNodeXForm>("trunc_imm") $imm)), 2, 63)>; +def : Pat<(i64 (mul (int_aarch64_sme_cntsd), (sme_cntsd_mul_imm i64:$imm))), + (UBFMXri (RDSVLI_XI (!cast<SDNodeXForm>("trunc_imm") $imm)), 3, 63)>; + +def : Pat<(i64 (shl (int_aarch64_sme_cntsd), (sme_cnts_shl_imm i64:$imm))), + (RDSVLI_XI (!cast<SDNodeXForm>("trunc_imm") $imm))>; + +// cntsh, cntsw, cntsd +def : Pat<(i64 (shl (int_aarch64_sme_cntsd), (i64 2))), (UBFMXri (RDSVLI_XI 1), 1, 63)>; +def : Pat<(i64 (shl (int_aarch64_sme_cntsd), (i64 1))), (UBFMXri (RDSVLI_XI 1), 2, 63)>; +def : Pat<(i64 (int_aarch64_sme_cntsd)), (UBFMXri (RDSVLI_XI 1), 3, 63)>; } let Predicates = [HasSME] in { diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 490f6391c15a0..38958796e2fe1 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -2102,15 +2102,15 @@ instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) { } static std::optional<Instruction *> -instCombineSMECntsElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts, +instCombineSMECntsElts(InstCombiner &IC, IntrinsicInst &II, const AArch64Subtarget *ST) { if (!ST->isStreaming()) return std::nullopt; - // In streaming-mode, aarch64_sme_cnts is equivalent to aarch64_sve_cnt + // In streaming-mode, aarch64_sme_cntds is equivalent to aarch64_sve_cntd // with SVEPredPattern::all - Value *Cnt = IC.Builder.CreateElementCount( - II.getType(), ElementCount::getScalable(NumElts)); + Value *Cnt = + 
IC.Builder.CreateElementCount(II.getType(), ElementCount::getScalable(2)); Cnt->takeName(&II); return IC.replaceInstUsesWith(II, Cnt); } @@ -2825,13 +2825,7 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, case Intrinsic::aarch64_sve_cntb: return instCombineSVECntElts(IC, II, 16); case Intrinsic::aarch64_sme_cntsd: - return instCombineSMECntsElts(IC, II, 2, ST); - case Intrinsic::aarch64_sme_cntsw: - return instCombineSMECntsElts(IC, II, 4, ST); - case Intrinsic::aarch64_sme_cntsh: - return instCombineSMECntsElts(IC, II, 8, ST); - case Intrinsic::aarch64_sme_cntsb: - return instCombineSMECntsElts(IC, II, 16, ST); + return instCombineSMECntsElts(IC, II, ST); case Intrinsic::aarch64_sve_ptest_any: case Intrinsic::aarch64_sve_ptest_first: case Intrinsic::aarch64_sve_ptest_last: diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-rdsvl.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-rdsvl.ll index 5d10d7e13da14..86d3e42deae09 100644 --- a/llvm/test/CodeGen/AArch64/sme-intrinsics-rdsvl.ll +++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-rdsvl.ll @@ -1,46 +1,92 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s -define i64 @sme_cntsb() { -; CHECK-LABEL: sme_cntsb: +define i64 @cntsb() { +; CHECK-LABEL: cntsb: ; CHECK: // %bb.0: ; CHECK-NEXT: rdsvl x0, #1 ; CHECK-NEXT: ret - %v = call i64 @llvm.aarch64.sme.cntsb() - ret i64 %v + %1 = call i64 @llvm.aarch64.sme.cntsd() + %res = shl nuw nsw i64 %1, 3 + ret i64 %res } -define i64 @sme_cntsh() { -; CHECK-LABEL: sme_cntsh: +define i64 @cntsh() { +; CHECK-LABEL: cntsh: ; CHECK: // %bb.0: ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: lsr x0, x8, #1 ; CHECK-NEXT: ret - %v = call i64 @llvm.aarch64.sme.cntsh() - ret i64 %v + %1 = call i64 @llvm.aarch64.sme.cntsd() + %res = shl nuw nsw i64 %1, 2 + ret i64 %res } -define i64 @sme_cntsw() { -; CHECK-LABEL: sme_cntsw: +define i64 @cntsw() { +; CHECK-LABEL: cntsw: 
; CHECK: // %bb.0: ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: lsr x0, x8, #2 ; CHECK-NEXT: ret - %v = call i64 @llvm.aarch64.sme.cntsw() - ret i64 %v + %1 = call i64 @llvm.aarch64.sme.cntsd() + %res = shl nuw nsw i64 %1, 1 + ret i64 %res } -define i64 @sme_cntsd() { -; CHECK-LABEL: sme_cntsd: +define i64 @cntsd() { +; CHECK-LABEL: cntsd: ; CHECK: // %bb.0: ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: lsr x0, x8, #3 +; CHECK-NEXT: ret + %res = call i64 @llvm.aarch64.sme.cntsd() + ret i64 %res +} + +define i64 @sme_cntsb_mul() { +; CHECK-LABEL: sme_cntsb_mul: +; CHECK: // %bb.0: +; CHECK-NEXT: rdsvl x0, #2 +; CHECK-NEXT: ret + %v = call i64 @llvm.aarch64.sme.cntsd() + %shl = shl nuw nsw i64 %v, 3 + %res = mul i64 %shl, 2 + ret i64 %res +} + +define i64 @sme_cntsh_mul() { +; CHECK-LABEL: sme_cntsh_mul: +; CHECK: // %bb.0: +; CHECK-NEXT: rdsvl x8, #5 +; CHECK-NEXT: lsr x0, x8, #1 +; CHECK-NEXT: ret + %v = call i64 @llvm.aarch64.sme.cntsd() + %shl = shl nuw nsw i64 %v, 2 + %res = mul i64 %shl, 5 + ret i64 %res +} + +define i64 @sme_cntsw_mul() { +; CHECK-LABEL: sme_cntsw_mul: +; CHECK: // %bb.0: +; CHECK-NEXT: rdsvl x8, #7 +; CHECK-NEXT: lsr x0, x8, #2 +; CHECK-NEXT: ret + %v = call i64 @llvm.aarch64.sme.cntsd() + %shl = shl nuw nsw i64 %v, 1 + %res = mul i64 %shl, 7 + ret i64 %res +} + +define i64 @sme_cntsd_mul() { +; CHECK-LABEL: sme_cntsd_mul: +; CHECK: // %bb.0: +; CHECK-NEXT: rdsvl x8, #3 +; CHECK-NEXT: lsr x0, x8, #1 ; CHECK-NEXT: ret %v = call i64 @llvm.aarch64.sme.cntsd() - ret i64 %v + %res = mul i64 %v, 12 + ret i64 %res } -declare i64 @llvm.aarch64.sme.cntsb() -declare i64 @llvm.aarch64.sme.cntsh() -declare i64 @llvm.aarch64.sme.cntsw() declare i64 @llvm.aarch64.sme.cntsd() diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-interface-remarks.ll b/llvm/test/CodeGen/AArch64/sme-streaming-interface-remarks.ll index e1a474d898233..2806f864c7b25 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-interface-remarks.ll +++ 
b/llvm/test/CodeGen/AArch64/sme-streaming-interface-remarks.ll @@ -76,14 +76,14 @@ entry: %Data1 = alloca <vscale x 16 x i8>, align 16 %Data2 = alloca <vscale x 16 x i8>, align 16 %Data3 = alloca <vscale x 16 x i8>, align 16 - %0 = tail call i64 @llvm.aarch64.sme.cntsb() + %0 = tail call i64 @llvm.aarch64.sme.cntsd() call void @foo(ptr noundef nonnull %Data1, ptr noundef nonnull %Data2, ptr noundef nonnull %Data3, i64 noundef %0) %1 = load <vscale x 16 x i8>, ptr %Data1, align 16 %vecext = extractelement <vscale x 16 x i8> %1, i64 0 ret i8 %vecext } -declare i64 @llvm.aarch64.sme.cntsb() +declare i64 @llvm.aarch64.sme.cntsd() declare void @foo(ptr noundef, ptr noundef, ptr noundef, i64 noundef) diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll index 8c4d57e244e03..505a40c16653b 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll @@ -366,9 +366,10 @@ define i8 @call_to_non_streaming_pass_sve_objects(ptr nocapture noundef readnone ; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill ; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-3 -; CHECK-NEXT: rdsvl x3, #1 +; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: addvl x0, sp, #2 ; CHECK-NEXT: addvl x1, sp, #1 +; CHECK-NEXT: lsr x3, x8, #3 ; CHECK-NEXT: mov x2, sp ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl foo @@ -386,7 +387,7 @@ entr... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/154761 _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
