Author: Caroline Concatto Date: 2021-01-19T11:54:16Z New Revision: 172f1f8952c977c0101ba19e6ecb9474aa3bdd4b
URL: https://github.com/llvm/llvm-project/commit/172f1f8952c977c0101ba19e6ecb9474aa3bdd4b DIFF: https://github.com/llvm/llvm-project/commit/172f1f8952c977c0101ba19e6ecb9474aa3bdd4b.diff LOG: [AArch64][SVE]Add cost model for vector reduce for scalable vector This patch computes the cost for vector.reduce<operand> for scalable vectors. The cost is split into two parts: the legalization cost and the horizontal reduction. Differential Revision: https://reviews.llvm.org/D93639 Added: llvm/test/Analysis/CostModel/AArch64/sve-getIntrinsicInstrCost-vector-reduce.ll Modified: llvm/include/llvm/CodeGen/BasicTTIImpl.h llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h Removed: ################################################################################ diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 9776c20400d6..3f016d85d8ed 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -1288,15 +1288,11 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { case Intrinsic::vector_reduce_fmin: case Intrinsic::vector_reduce_umax: case Intrinsic::vector_reduce_umin: { - if (isa<ScalableVectorType>(RetTy)) - return BaseT::getIntrinsicInstrCost(ICA, CostKind); IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, 1, I); return getTypeBasedIntrinsicInstrCost(Attrs, CostKind); } case Intrinsic::vector_reduce_fadd: case Intrinsic::vector_reduce_fmul: { - if (isa<ScalableVectorType>(RetTy)) - return BaseT::getIntrinsicInstrCost(ICA, CostKind); IntrinsicCostAttributes Attrs( IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, 1, I); return getTypeBasedIntrinsicInstrCost(Attrs, CostKind); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 68d382fb784b..ffa045846e59 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1096,11 +1096,70 @@ bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty, return false; } +int AArch64TTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, + bool IsPairwise, bool IsUnsigned, + TTI::TargetCostKind CostKind) { + if (!isa<ScalableVectorType>(Ty)) + return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned, + CostKind); + assert((isa<ScalableVectorType>(Ty) && isa<ScalableVectorType>(CondTy)) && + "Both vector needs to be scalable"); + + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); + int LegalizationCost = 0; + if (LT.first > 1) { + Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext()); + unsigned CmpOpcode = + Ty->isFPOrFPVectorTy() ? Instruction::FCmp : Instruction::ICmp; + LegalizationCost = + getCmpSelInstrCost(CmpOpcode, LegalVTy, LegalVTy, + CmpInst::BAD_ICMP_PREDICATE, CostKind) + + getCmpSelInstrCost(Instruction::Select, LegalVTy, LegalVTy, + CmpInst::BAD_ICMP_PREDICATE, CostKind); + LegalizationCost *= LT.first - 1; + } + + return LegalizationCost + /*Cost of horizontal reduction*/ 2; +} + +int AArch64TTIImpl::getArithmeticReductionCostSVE( + unsigned Opcode, VectorType *ValTy, bool IsPairwise, + TTI::TargetCostKind CostKind) { + assert(!IsPairwise && "Cannot be pair wise to continue"); + + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); + int LegalizationCost = 0; + if (LT.first > 1) { + Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext()); + LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind); + LegalizationCost *= LT.first - 1; + } + + int ISD = TLI->InstructionOpcodeToISD(Opcode); + assert(ISD && "Invalid opcode"); + // Add the final reduction cost for the legal horizontal reduction + switch (ISD) { + case ISD::ADD: + case ISD::AND: + case ISD::OR: + case ISD::XOR: + case ISD::FADD: + return LegalizationCost + 2; + default: + // TODO: Replace for invalid when InstructionCost is used + // cases not supported by SVE + return 16; + } +} + int AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, bool IsPairwiseForm, TTI::TargetCostKind CostKind) { + if (isa<ScalableVectorType>(ValTy)) + return getArithmeticReductionCostSVE(Opcode, ValTy, IsPairwiseForm, + CostKind); if (IsPairwiseForm) return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm, CostKind); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 7dded02b2a6f..7c9360ada92e 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -139,6 +139,14 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> { int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); + int getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, + bool IsPairwise, bool IsUnsigned, + TTI::TargetCostKind CostKind); + + int getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy, + bool IsPairwiseForm, + TTI::TargetCostKind CostKind); + int getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput, diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-getIntrinsicInstrCost-vector-reduce.ll b/llvm/test/Analysis/CostModel/AArch64/sve-getIntrinsicInstrCost-vector-reduce.ll new file mode 100644 index 000000000000..486e7aaac68a --- /dev/null +++ b/llvm/test/Analysis/CostModel/AArch64/sve-getIntrinsicInstrCost-vector-reduce.ll @@ -0,0 +1,251 @@ +; Check getIntrinsicInstrCost in BasicTTIImpl.h with SVE for vector.reduce.<operand> +; Checks legal and not legal vector size + +; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s + + +; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. +; WARN-NOT: warning + +define i32 @add.i32.nxv4i32(<vscale x 4 x i32> %v) { +; CHECK-LABEL: 'add.i32.nxv4i32' +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %v) +; CHECK-NEXT:Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r + + %r = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %v) + ret i32 %r +} + +define i64 @add.i64.nxv4i64(<vscale x 4 x i64> %v) { +; CHECK-LABEL: 'add.i64.nxv4i64' +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> %v) +; CHECK-NEXT:Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r + + %r = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> %v) + ret i64 %r +} + +define i32 @mul.i32.nxv4i32(<vscale x 4 x i32> %v) { +; CHECK-LABEL: 'mul.i32.nxv4i32' +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %r = call i32 @llvm.vector.reduce.mul.nxv4i32(<vscale x 4 x i32> %v) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r + + %r = call i32 @llvm.vector.reduce.mul.nxv4i32(<vscale x 4 x i32> %v) + ret i32 %r +} + +define i64 @mul.i64.nxv4i64(<vscale x 4 x i64> %v) { +; CHECK-LABEL: 'mul.i64.nxv4i64' +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %r = call i64 @llvm.vector.reduce.mul.nxv4i64(<vscale x 4 x i64> %v) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r + + %r = call i64 @llvm.vector.reduce.mul.nxv4i64(<vscale x 4 x i64> %v) + ret i64 %r +} + +define i32 @and.i32.nxv4i32(<vscale x 4 x i32> %v) { +; CHECK-LABEL: 'and.i32.nxv4i32' +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i32 @llvm.vector.reduce.and.nxv4i32(<vscale x 4 x i32> %v) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r + + %r = call i32 @llvm.vector.reduce.and.nxv4i32(<vscale x 4 x i32> %v) + ret i32 %r +} + +define i64 @and.i64.nxv4i64(<vscale x 4 x i64> %v) { +; CHECK-LABEL: 'and.i64.nxv4i64' +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = call i64 @llvm.vector.reduce.and.nxv4i64(<vscale x 4 x i64> %v) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r + + %r = call i64 @llvm.vector.reduce.and.nxv4i64(<vscale x 4 x i64> %v) + ret i64 %r +} + +define i32 @or.i32.nxv4i32(<vscale x 4 x i32> %v) { +; CHECK-LABEL: 'or.i32.nxv4i32' +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32> %v) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r + + %r = call i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32> %v) + ret i32 %r +} + +define i64 @or.i64.nxv4i64(<vscale x 4 x i64> %v) { +; CHECK-LABEL: 'or.i64.nxv4i64' +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = call i64 @llvm.vector.reduce.or.nxv4i64(<vscale x 4 x i64> %v) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r + + %r = call i64 @llvm.vector.reduce.or.nxv4i64(<vscale x 4 x i64> %v) + ret i64 %r +} + +define i32 @xor.i32.nxv4i32(<vscale x 4 x i32> %v) { +; CHECK-LABEL: 'xor.i32.nxv4i32' +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> %v) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r + + %r = call i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32> %v) + ret i32 %r +} + +define i64 @xor.i64.nxv4i64(<vscale x 4 x i64> %v) { +; CHECK-LABEL: 'xor.i64.nxv4i64' +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = call i64 @llvm.vector.reduce.xor.nxv4i64(<vscale x 4 x i64> %v) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r + + %r = call i64 @llvm.vector.reduce.xor.nxv4i64(<vscale x 4 x i64> %v) + ret i64 %r +} + +define i32 @umin.i32.nxv4i32(<vscale x 4 x i32> %v) { +; CHECK-LABEL: 'umin.i32.nxv4i32' +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i32 @llvm.vector.reduce.umin.nxv4i32(<vscale x 4 x i32> %v) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r + + %r = call i32 @llvm.vector.reduce.umin.nxv4i32(<vscale x 4 x i32> %v) + ret i32 %r +} + +define i64 @umin.i64.nxv4i64(<vscale x 4 x i64> %v) { +; CHECK-LABEL: 'umin.i64.nxv4i64' +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = call i64 @llvm.vector.reduce.umin.nxv4i64(<vscale x 4 x i64> %v) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r + + %r = call i64 @llvm.vector.reduce.umin.nxv4i64(<vscale x 4 x i64> %v) + ret i64 %r +} + +define float @fmax.f32.nxv4f32(<vscale x 4 x float> %v) { +; CHECK-LABEL: 'fmax.f32.nxv4f32' +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> %v) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r + + %r = call float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float> %v) + ret float %r +} + +define double @fmax.f64.nxv4f64(<vscale x 4 x double> %v) { +; CHECK-LABEL: 'fmax.f64.nxv4f64' +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = call double @llvm.vector.reduce.fmax.nxv4f64(<vscale x 4 x double> %v) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r + + %r = call double @llvm.vector.reduce.fmax.nxv4f64(<vscale x 4 x double> %v) + ret double %r +} + +define float @fmin.f32.nxv4f32(<vscale x 4 x float> %v) { +; CHECK-LABEL: 'fmin.f32.nxv4f32' +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> %v) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r + + %r = call float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float> %v) + ret float %r +} + +define double @fmin.f64.nxv4f64(<vscale x 4 x double> %v) { +; CHECK-LABEL: 'fmin.f64.nxv4f64' +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = call double @llvm.vector.reduce.fmin.nxv4f64(<vscale x 4 x double> %v) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r + + %r = call double @llvm.vector.reduce.fmin.nxv4f64(<vscale x 4 x double> %v) + ret double %r +} + +define i32 @umax.i32.nxv4i32(<vscale x 4 x i32> %v) { +; CHECK-LABEL: 'umax.i32.nxv4i32' +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> %v) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r + + %r = call i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32> %v) + ret i32 %r +} + +define i64 @umax.i64.nxv4i64(<vscale x 4 x i64> %v) { +; CHECK-LABEL: 'umax.i64.nxv4i64' +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = call i64 @llvm.vector.reduce.umax.nxv4i64(<vscale x 4 x i64> %v) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r + + %r = call i64 @llvm.vector.reduce.umax.nxv4i64(<vscale x 4 x i64> %v) + ret i64 %r +} + +define i32 @smin.i32.nxv4i32(<vscale x 4 x i32> %v) { +; CHECK-LABEL: 'smin.i32.nxv4i32' +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> %v) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r + + %r = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> %v) + ret i32 %r +} + +define i64 @smin.i64.nxv4i64(<vscale x 4 x i64> %v) { +; CHECK-LABEL: 'smin.i64.nxv4i64' +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = call i64 @llvm.vector.reduce.smin.nxv4i64(<vscale x 4 x i64> %v) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r + + %r = call i64 @llvm.vector.reduce.smin.nxv4i64(<vscale x 4 x i64> %v) + ret i64 %r +} + +define i32 @smax.i32.nxv4i32(<vscale x 4 x i32> %v) { +; CHECK-LABEL: 'smax.i32.nxv4i32' +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> %v) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r + + %r = call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> %v) + ret i32 %r +} + +define i64 @smax.i64.nxv4i64(<vscale x 4 x i64> %v) { +; CHECK-LABEL: 'smax.i64.nxv4i64' +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = call i64 @llvm.vector.reduce.smax.nxv4i64(<vscale x 4 x i64> %v) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r + + %r = call i64 @llvm.vector.reduce.smax.nxv4i64(<vscale x 4 x i64> %v) + ret i64 %r +} + +define float @fadda_nxv4f32(float %start, <vscale x 4 x float> %a) #0 { +; CHECK-LABEL: 'fadda_nxv4f32 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call float @llvm.vector.reduce.fadd.nxv4f32(float %start, <vscale x 4 x float> %a) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %res + + %res = call float @llvm.vector.reduce.fadd.nxv4f32(float %start, <vscale x 4 x float> %a) + ret float %res +} + +define double @fadda_nxv4f64(double %start, <vscale x 4 x double> %a) #0 { +; CHECK-LABEL: 'fadda_nxv4f64 +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %res = call double @llvm.vector.reduce.fadd.nxv4f64(double %start, <vscale x 4 x double> %a) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %res + + %res = call double @llvm.vector.reduce.fadd.nxv4f64(double %start, <vscale x 4 x double> %a) + ret double %res +} + + +declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>) +declare i32 @llvm.vector.reduce.mul.nxv4i32(<vscale x 4 x i32>) +declare i32 @llvm.vector.reduce.and.nxv4i32(<vscale x 4 x i32>) +declare i32 @llvm.vector.reduce.or.nxv4i32(<vscale x 4 x i32>) +declare i32 @llvm.vector.reduce.xor.nxv4i32(<vscale x 4 x i32>) +declare float @llvm.vector.reduce.fmax.nxv4f32(<vscale x 4 x float>) +declare float @llvm.vector.reduce.fmin.nxv4f32(<vscale x 4 x float>) +declare i32 @llvm.vector.reduce.fmin.nxv4i32(<vscale x 4 x i32>) +declare i32 @llvm.vector.reduce.umin.nxv4i32(<vscale x 4 x i32>) +declare i32 @llvm.vector.reduce.umax.nxv4i32(<vscale x 4 x i32>) +declare i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32>) +declare i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32>) +declare float @llvm.vector.reduce.fadd.nxv4f32(float, <vscale x 4 x float>) +declare i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64>) +declare i64 @llvm.vector.reduce.mul.nxv4i64(<vscale x 4 x i64>) +declare i64 @llvm.vector.reduce.and.nxv4i64(<vscale x 4 x i64>) +declare i64 @llvm.vector.reduce.or.nxv4i64(<vscale x 4 x i64>) +declare i64 @llvm.vector.reduce.xor.nxv4i64(<vscale x 4 x i64>) +declare double @llvm.vector.reduce.fmax.nxv4f64(<vscale x 4 x double>) +declare double @llvm.vector.reduce.fmin.nxv4f64(<vscale x 4 x double>) +declare i64 @llvm.vector.reduce.umin.nxv4i64(<vscale x 4 x i64>) +declare i64 @llvm.vector.reduce.umax.nxv4i64(<vscale x 4 x i64>) +declare i64 @llvm.vector.reduce.smin.nxv4i64(<vscale x 4 x i64>) +declare i64 @llvm.vector.reduce.smax.nxv4i64(<vscale x 4 x i64>) +declare double @llvm.vector.reduce.fadd.nxv4f64(double, <vscale x 4 x double>) _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits