Author: Caroline Concatto Date: 2021-01-04T13:59:58Z New Revision: 060cfd97954835c3be18e47c631d3efb3e374439
URL: https://github.com/llvm/llvm-project/commit/060cfd97954835c3be18e47c631d3efb3e374439 DIFF: https://github.com/llvm/llvm-project/commit/060cfd97954835c3be18e47c631d3efb3e374439.diff LOG: [AArch64][SVE] Add cost model for masked gather and scatter for scalable vectors. A new TTI interface, 'Optional<unsigned> getMaxVScale()', has been added that returns the maximum vscale for a given target. When the maximum vscale is known, getMaxVScale is used to compute the cost of masked gather and scatter operations on scalable vectors. Depends on D92094. Differential Revision: https://reviews.llvm.org/D93030 Added: llvm/test/Analysis/CostModel/AArch64/sve-getIntrinsicInstrCost-gather.ll llvm/test/Analysis/CostModel/AArch64/sve-getIntrinsicInstrCost-scatter.ll Modified: llvm/include/llvm/Analysis/TargetTransformInfo.h llvm/include/llvm/Analysis/TargetTransformInfoImpl.h llvm/include/llvm/CodeGen/BasicTTIImpl.h llvm/lib/Analysis/TargetTransformInfo.cpp llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h Removed: ################################################################################ diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 0953a3b3f451..d9d04429b181 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -928,6 +928,10 @@ class TargetTransformInfo { /// \return The width of the smallest vector register type. unsigned getMinVectorRegisterBitWidth() const; + /// \return The maximum value of vscale if the target specifies an + /// architectural maximum vector length, and None otherwise. + Optional<unsigned> getMaxVScale() const; + /// \return True if the vectorization factor should be chosen to /// make the vector of the smallest element type match the size of a /// vector register. 
For wider element types, this could result in @@ -1504,6 +1508,7 @@ class TargetTransformInfo::Concept { virtual const char *getRegisterClassName(unsigned ClassID) const = 0; virtual unsigned getRegisterBitWidth(bool Vector) const = 0; virtual unsigned getMinVectorRegisterBitWidth() = 0; + virtual Optional<unsigned> getMaxVScale() const = 0; virtual bool shouldMaximizeVectorBandwidth(bool OptSize) const = 0; virtual unsigned getMinimumVF(unsigned ElemWidth) const = 0; virtual unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const = 0; @@ -1921,6 +1926,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { unsigned getMinVectorRegisterBitWidth() override { return Impl.getMinVectorRegisterBitWidth(); } + Optional<unsigned> getMaxVScale() const override { + return Impl.getMaxVScale(); + } bool shouldMaximizeVectorBandwidth(bool OptSize) const override { return Impl.shouldMaximizeVectorBandwidth(OptSize); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 620bfb885b54..ef0653d0d9f4 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -369,6 +369,8 @@ class TargetTransformInfoImplBase { unsigned getMinVectorRegisterBitWidth() const { return 128; } + Optional<unsigned> getMaxVScale() const { return None; } + bool shouldMaximizeVectorBandwidth(bool OptSize) const { return false; } unsigned getMinimumVF(unsigned ElemWidth) const { return 0; } diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 02f1b73226fc..9776c20400d6 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -571,6 +571,8 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { unsigned getRegisterBitWidth(bool Vector) const { return 32; } + Optional<unsigned> getMaxVScale() const { return None; } + /// 
Estimate the overhead of scalarizing an instruction. Insert and Extract /// are set if the demanded result elements need to be inserted and/or /// extracted from vectors. @@ -1239,8 +1241,6 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { return thisT()->getMemcpyCost(ICA.getInst()); case Intrinsic::masked_scatter: { - if (isa<ScalableVectorType>(RetTy)) - return BaseT::getIntrinsicInstrCost(ICA, CostKind); assert(VF.isScalar() && "Can't vectorize types here."); const Value *Mask = Args[3]; bool VarMask = !isa<Constant>(Mask); @@ -1250,8 +1250,6 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { VarMask, Alignment, CostKind, I); } case Intrinsic::masked_gather: { - if (isa<ScalableVectorType>(RetTy)) - return BaseT::getIntrinsicInstrCost(ICA, CostKind); assert(VF.isScalar() && "Can't vectorize types here."); const Value *Mask = Args[2]; bool VarMask = !isa<Constant>(Mask); diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index becf74c64fd5..5100109959d6 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -627,6 +627,10 @@ unsigned TargetTransformInfo::getMinVectorRegisterBitWidth() const { return TTIImpl->getMinVectorRegisterBitWidth(); } +Optional<unsigned> TargetTransformInfo::getMaxVScale() const { + return TTIImpl->getMaxVScale(); +} + bool TargetTransformInfo::shouldMaximizeVectorBandwidth(bool OptSize) const { return TTIImpl->shouldMaximizeVectorBandwidth(OptSize); } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index d97570755291..aaf7371c7933 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -770,6 +770,26 @@ AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { return Options; } +unsigned AArch64TTIImpl::getGatherScatterOpCost( + 
unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, + Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) { + auto *VT = cast<VectorType>(DataTy); + auto LT = TLI->getTypeLegalizationCost(DL, DataTy); + ElementCount LegalVF = LT.second.getVectorElementCount(); + if (!LegalVF.isScalable()) + return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, + Alignment, CostKind, I); + + Optional<unsigned> MaxNumVScale = getMaxVScale(); + assert(MaxNumVScale && "Expected valid max vscale value"); + + unsigned MemOpCost = + getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I); + unsigned MaxNumElementsPerGather = + MaxNumVScale.getValue() * LegalVF.getKnownMinValue(); + return LT.first * MaxNumElementsPerGather * MemOpCost; +} + bool AArch64TTIImpl::useNeonVector(const Type *Ty) const { return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors(); } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index c8e721b1fb9f..7dded02b2a6f 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -115,8 +115,19 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> { return ST->getMinVectorRegisterBitWidth(); } + Optional<unsigned> getMaxVScale() const { + if (ST->hasSVE()) + return AArch64::SVEMaxBitsPerVector / AArch64::SVEBitsPerBlock; + return BaseT::getMaxVScale(); + } + unsigned getMaxInterleaveFactor(unsigned VF); + unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, + const Value *Ptr, bool VariableMask, + Align Alignment, TTI::TargetCostKind CostKind, + const Instruction *I = nullptr); + int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I = nullptr); diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-getIntrinsicInstrCost-gather.ll 
b/llvm/test/Analysis/CostModel/AArch64/sve-getIntrinsicInstrCost-gather.ll new file mode 100644 index 000000000000..38b41b731dd0 --- /dev/null +++ b/llvm/test/Analysis/CostModel/AArch64/sve-getIntrinsicInstrCost-gather.ll @@ -0,0 +1,37 @@ +; Check getIntrinsicInstrCost in BasicTTIImpl.h for masked gather + +; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s + +; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t + +; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. +; WARN-NOT: warning + +define <vscale x 4 x i32> @masked_gather_nxv4i32(<vscale x 4 x i32*> %ld, <vscale x 4 x i1> %masks, <vscale x 4 x i32> %passthru) { +; CHECK-LABEL: 'masked_gather_nxv4i32' +; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %res = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> %ld, i32 0, <vscale x 4 x i1> %masks, <vscale x 4 x i32> %passthru) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <vscale x 4 x i32> %res + %res = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*> %ld, i32 0, <vscale x 4 x i1> %masks, <vscale x 4 x i32> %passthru) + ret <vscale x 4 x i32> %res +} + +define <vscale x 8 x i32> @masked_gather_nxv8i32(<vscale x 8 x i32*> %ld, <vscale x 8 x i1> %masks, <vscale x 8 x i32> %passthru) { +; CHECK-LABEL: 'masked_gather_nxv8i32' +; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %res = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32.nxv8p0i32(<vscale x 8 x i32*> %ld, i32 0, <vscale x 8 x i1> %masks, <vscale x 8 x i32> %passthru) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <vscale x 8 x i32> %res + %res = call <vscale x 8 x i32> @llvm.masked.gather.nxv8i32(<vscale x 8 x i32*> %ld, i32 0, <vscale x 8 x i1> %masks, <vscale x 8 x i32> %passthru) + ret <vscale x 8 x i32> %res +} + +define <4 x i32> 
@masked_gather_v4i32(<4 x i32*> %ld, <4 x i1> %masks, <4 x i32> %passthru) { +; CHECK-LABEL: 'masked_gather_v4i32' +; CHECK-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ld, i32 0, <4 x i1> %masks, <4 x i32> %passthru) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %res + + %res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ld, i32 0, <4 x i1> %masks, <4 x i32> %passthru) + ret <4 x i32> %res +} + +declare <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*> %ptrs, i32 %align, <vscale x 4 x i1> %masks, <vscale x 4 x i32> %passthru) +declare <vscale x 8 x i32> @llvm.masked.gather.nxv8i32(<vscale x 8 x i32*> %ptrs, i32 %align, <vscale x 8 x i1> %masks, <vscale x 8 x i32> %passthru) +declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 %align, <4 x i1> %masks, <4 x i32> %passthru) diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-getIntrinsicInstrCost-scatter.ll b/llvm/test/Analysis/CostModel/AArch64/sve-getIntrinsicInstrCost-scatter.ll new file mode 100644 index 000000000000..4370922e4bf7 --- /dev/null +++ b/llvm/test/Analysis/CostModel/AArch64/sve-getIntrinsicInstrCost-scatter.ll @@ -0,0 +1,40 @@ +; Check getIntrinsicInstrCost in BasicTTIImpl.h with for masked scatter + +; RUN: opt -cost-model -analyze -mtriple=aarch64--linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s + +; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t + +; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it. 
+; WARN-NOT: warning + + +define void @masked_scatter_nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x i32*> %ptrs, <vscale x 4 x i1> %masks) { +; CHECK-LABEL: 'masked_scatter_nxv4i32' +; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> %data, <vscale x 4 x i32*> %ptrs, i32 0, <vscale x 4 x i1> %masks) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void + + call void @llvm.masked.scatter.nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x i32*> %ptrs, i32 0, <vscale x 4 x i1> %masks) + ret void +} + +define void @masked_scatter_nxv8i32(<vscale x 8 x i32> %data, <vscale x 8 x i32*> %ptrs, <vscale x 8 x i1> %masks) { +; CHECK-LABEL: 'masked_scatter_nxv8i32' +; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: call void @llvm.masked.scatter.nxv8i32.nxv8p0i32(<vscale x 8 x i32> %data, <vscale x 8 x i32*> %ptrs, i32 0, <vscale x 8 x i1> %masks) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void + + call void @llvm.masked.scatter.nxv8i32(<vscale x 8 x i32> %data, <vscale x 8 x i32*> %ptrs, i32 0, <vscale x 8 x i1> %masks) + ret void +} + +define void @masked_scatter_v4i32(<4 x i32> %data, <4 x i32*> %ptrs, <4 x i1> %masks) { +; CHECK-LABEL: 'masked_scatter_v4i32' +; CHECK-NEXT: Cost Model: Found an estimated cost of 29 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %data, <4 x i32*> %ptrs, i32 0, <4 x i1> %masks) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void + + call void @llvm.masked.scatter.v4i32(<4 x i32> %data, <4 x i32*> %ptrs, i32 0, <4 x i1> %masks) + ret void +} + +declare void @llvm.masked.scatter.nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x i32*> %ptrs, i32 %align, <vscale x 4 x i1> %masks) +declare void @llvm.masked.scatter.nxv8i32(<vscale x 8 x i32> %data, <vscale x 8 x i32*> %ptrs, i32 %align, <vscale x 8 x i1> %masks) +declare 
void @llvm.masked.scatter.v4i32(<4 x i32> %data, <4 x i32*> %ptrs, i32 %align, <4 x i1> %masks) _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits