llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-backend-aarch64 Author: Florian Hahn (fhahn) <details> <summary>Changes</summary> Back-port https://github.com/llvm/llvm-project/pull/146526 (https://github.com/llvm/llvm-project/commit/02d3738be92eac38cebfb7b670673abb9538ca76) for the 21.x release, just for Apple CPUs. As discussed during the review, the patch was landed just after the branch, to avoid regressions. We already did a careful performance analysis on Apple M series CPUs with this change and are seeing significant gains on a number of workloads, which we would like to enable for 21.x. Original message: getVectorInstrCostHelper would return costs of zero for vector inserts/extracts that move data between GPR and vector registers, if there was no 'real' use, i.e. there was no corresponding existing instruction. This meant that passes like LoopVectorize and SLPVectorizer, which likely are the main users of the interface, would underestimate the cost of insert/extracts that move data between GPR and vector registers, which have non-trivial costs. The patch removes the special case and only returns costs of zero for lane 0 if there is no need to transfer between integer and vector registers. This impacts a number of SLP tests, and most of them look like general improvements. I think the change should make things more accurate for any AArch64 target, but if not it could also just be Apple CPU specific. I am seeing +2% end-to-end improvements on SLP-heavy workloads. 
PR: https://github.com/llvm/llvm-project/pull/146526 --- Full diff: https://github.com/llvm/llvm-project/pull/149815.diff 3 Files Affected: - (modified) llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (+24-2) - (modified) llvm/test/Transforms/SLPVectorizer/AArch64/external-use-icmp.ll (+53-1) - (modified) llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll (+10-10) ``````````diff diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 07baf29ce7016..a0d71acce5b9d 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -3894,13 +3894,34 @@ InstructionCost AArch64TTIImpl::getVectorInstrCostHelper( : ST->getVectorInsertExtractBaseCost(); } +/// Returns true of \p ProcFamily is Apple M1-M4 or any of the aligned A series +/// CPUs. +static bool isAppleMCoreLike(unsigned ProcFamily) { + switch (ProcFamily) { + case AArch64Subtarget::AppleA14: + case AArch64Subtarget::AppleA15: + case AArch64Subtarget::AppleA16: + case AArch64Subtarget::AppleM4: + return true; + default: + return false; + }; +} + InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1) const { + bool HasRealUse = Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0); + if (isAppleMCoreLike(ST->getProcFamily())) { + if (Opcode == Instruction::InsertElement && Index == 0 && Op0 && + isa<PoisonValue>(Op0)) + return 0; + HasRealUse = true; + } return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, HasRealUse); } @@ -3908,8 +3929,9 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost( unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar, ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const { - return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, 
false, nullptr, - Scalar, ScalarUserAndIdx); + bool HasRealUse = isAppleMCoreLike(ST->getProcFamily()); + return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, HasRealUse, + nullptr, Scalar, ScalarUserAndIdx); } InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I, diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/external-use-icmp.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/external-use-icmp.ll index 2b5ee59aeb163..e3eff89f071c7 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/external-use-icmp.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/external-use-icmp.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S --passes=slp-vectorizer -mtriple=aarch64 -slp-threshold=-20 -slp-vectorize-hor=0 < %s | FileCheck %s +; RUN: opt -S --passes=slp-vectorizer -mtriple=aarch64 -slp-threshold=-20 -slp-vectorize-hor=0 < %s | FileCheck --check-prefix=DEFAULT %s +; RUN: opt -S --passes=slp-vectorizer -mtriple=aarch64 -slp-threshold=-20 -slp-vectorize-hor=0 -mcpu=apple-m1 < %s | FileCheck --check-prefix=APPLE-M1 %s define i16 @foo(i16 %in1, i16 %in2) { ; CHECK-LABEL: define i16 @foo( @@ -29,6 +30,57 @@ define i16 @foo(i16 %in1, i16 %in2) { ; CHECK-NEXT: [[ADD3:%.*]] = add nuw nsw i16 [[ADD2]], [[ZEXT3_2]] ; CHECK-NEXT: ret i16 [[ADD3]] ; +; DEFAULT-LABEL: define i16 @foo( +; DEFAULT-SAME: i16 [[IN1:%.*]], i16 [[IN2:%.*]]) { +; DEFAULT-NEXT: entry: +; DEFAULT-NEXT: [[TMP0:%.*]] = insertelement <2 x i16> poison, i16 [[IN1]], i32 0 +; DEFAULT-NEXT: [[TMP1:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <2 x i32> zeroinitializer +; DEFAULT-NEXT: [[TMP2:%.*]] = zext <2 x i16> [[TMP1]] to <2 x i64> +; DEFAULT-NEXT: [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[IN2]], i32 0 +; DEFAULT-NEXT: [[TMP4:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer +; DEFAULT-NEXT: [[TMP5:%.*]] = zext <2 x i16> [[TMP4]] to <2 x 
i64> +; DEFAULT-NEXT: [[TMP6:%.*]] = mul nuw nsw <2 x i64> [[TMP5]], [[TMP2]] +; DEFAULT-NEXT: [[TMP7:%.*]] = and <2 x i64> [[TMP6]], splat (i64 65535) +; DEFAULT-NEXT: [[TMP8:%.*]] = icmp ne <2 x i64> [[TMP7]], splat (i64 65533) +; DEFAULT-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; DEFAULT-NEXT: [[ZEXT3_1:%.*]] = zext i1 [[TMP9]] to i16 +; DEFAULT-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; DEFAULT-NEXT: [[CMP2_1:%.*]] = icmp ne i64 [[TMP10]], 196605 +; DEFAULT-NEXT: [[ZEXT4_1:%.*]] = zext i1 [[CMP2_1]] to i16 +; DEFAULT-NEXT: [[ADD1:%.*]] = add nuw nsw i16 [[ZEXT3_1]], [[ZEXT4_1]] +; DEFAULT-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; DEFAULT-NEXT: [[ZEXT3_2:%.*]] = zext i1 [[TMP11]] to i16 +; DEFAULT-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; DEFAULT-NEXT: [[CMP2_2:%.*]] = icmp ne i64 [[TMP12]], 196605 +; DEFAULT-NEXT: [[ZEXT4_2:%.*]] = zext i1 [[CMP2_2]] to i16 +; DEFAULT-NEXT: [[ADD2:%.*]] = add nuw nsw i16 [[ADD1]], [[ZEXT4_2]] +; DEFAULT-NEXT: [[ADD3:%.*]] = add nuw nsw i16 [[ADD2]], [[ZEXT3_2]] +; DEFAULT-NEXT: ret i16 [[ADD3]] +; +; APPLE-M1-LABEL: define i16 @foo( +; APPLE-M1-SAME: i16 [[IN1:%.*]], i16 [[IN2:%.*]]) #[[ATTR0:[0-9]+]] { +; APPLE-M1-NEXT: entry: +; APPLE-M1-NEXT: [[ZEXT1_1:%.*]] = zext i16 [[IN1]] to i64 +; APPLE-M1-NEXT: [[ZEXT2_1:%.*]] = zext i16 [[IN2]] to i64 +; APPLE-M1-NEXT: [[TMP10:%.*]] = mul nuw nsw i64 [[ZEXT2_1]], [[ZEXT1_1]] +; APPLE-M1-NEXT: [[AND1:%.*]] = and i64 [[TMP10]], 65535 +; APPLE-M1-NEXT: [[TMP9:%.*]] = icmp ne i64 [[AND1]], 65533 +; APPLE-M1-NEXT: [[ZEXT3_1:%.*]] = zext i1 [[TMP9]] to i16 +; APPLE-M1-NEXT: [[CMP2_1:%.*]] = icmp ne i64 [[TMP10]], 196605 +; APPLE-M1-NEXT: [[ZEXT4_1:%.*]] = zext i1 [[CMP2_1]] to i16 +; APPLE-M1-NEXT: [[ADD1:%.*]] = add nuw nsw i16 [[ZEXT3_1]], [[ZEXT4_1]] +; APPLE-M1-NEXT: [[ZEXT1_2:%.*]] = zext i16 [[IN1]] to i64 +; APPLE-M1-NEXT: [[ZEXT2_2:%.*]] = zext i16 [[IN2]] to i64 +; APPLE-M1-NEXT: 
[[TMP12:%.*]] = mul nuw nsw i64 [[ZEXT2_2]], [[ZEXT1_2]] +; APPLE-M1-NEXT: [[AND2:%.*]] = and i64 [[TMP12]], 65535 +; APPLE-M1-NEXT: [[TMP11:%.*]] = icmp ne i64 [[AND2]], 65533 +; APPLE-M1-NEXT: [[ZEXT3_2:%.*]] = zext i1 [[TMP11]] to i16 +; APPLE-M1-NEXT: [[CMP2_2:%.*]] = icmp ne i64 [[TMP12]], 196605 +; APPLE-M1-NEXT: [[ZEXT4_2:%.*]] = zext i1 [[CMP2_2]] to i16 +; APPLE-M1-NEXT: [[ADD2:%.*]] = add nuw nsw i16 [[ADD1]], [[ZEXT4_2]] +; APPLE-M1-NEXT: [[ADD3:%.*]] = add nuw nsw i16 [[ADD2]], [[ZEXT3_2]] +; APPLE-M1-NEXT: ret i16 [[ADD3]] +; entry: %zext1_1 = zext i16 %in1 to i64 %zext2_1 = zext i16 %in2 to i64 diff --git a/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll b/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll index e6e5f5196d3da..867c607e7b1d8 100644 --- a/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll +++ b/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=vector-combine -mtriple=arm64-apple-darwinos -S %s | FileCheck --check-prefixes=CHECK,LIMIT-DEFAULT %s -; RUN: opt -passes=vector-combine -mtriple=arm64-apple-darwinos -vector-combine-max-scan-instrs=2 -S %s | FileCheck --check-prefixes=CHECK,LIMIT2 %s +; RUN: opt -passes=vector-combine -mtriple=arm64-apple-darwinos -mcpu=apple-m1 -S %s | FileCheck --check-prefixes=CHECK,LIMIT-DEFAULT %s +; RUN: opt -passes=vector-combine -mtriple=arm64-apple-darwinos -mcpu=apple-m1 -vector-combine-max-scan-instrs=2 -S %s | FileCheck --check-prefixes=CHECK,LIMIT2 %s define i32 @load_extract_idx_0(ptr %x) { ; CHECK-LABEL: @load_extract_idx_0( @@ -669,10 +669,10 @@ define i1 @load_with_non_power_of_2_element_type_2(ptr %x) { ; Scalarizing the load for multiple constant indices may not be profitable. 
define i32 @load_multiple_extracts_with_constant_idx(ptr %x) { ; CHECK-LABEL: @load_multiple_extracts_with_constant_idx( -; CHECK-NEXT: [[LV:%.*]] = load <4 x i32>, ptr [[X:%.*]], align 16 -; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[LV]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison> -; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[LV]], [[SHIFT]] -; CHECK-NEXT: [[RES:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[E_0:%.*]] = load i32, ptr [[TMP1:%.*]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, ptr [[TMP1]], i32 0, i32 1 +; CHECK-NEXT: [[E_1:%.*]] = load i32, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[RES:%.*]] = add i32 [[E_0]], [[E_1]] ; CHECK-NEXT: ret i32 [[RES]] ; %lv = load <4 x i32>, ptr %x @@ -686,10 +686,10 @@ define i32 @load_multiple_extracts_with_constant_idx(ptr %x) { ; because the vector large vector requires 2 vector registers. define i32 @load_multiple_extracts_with_constant_idx_profitable(ptr %x) { ; CHECK-LABEL: @load_multiple_extracts_with_constant_idx_profitable( -; CHECK-NEXT: [[LV:%.*]] = load <8 x i32>, ptr [[X:%.*]], align 16 -; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <8 x i32> [[LV]], <8 x i32> poison, <8 x i32> <i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> -; CHECK-NEXT: [[TMP1:%.*]] = add <8 x i32> [[LV]], [[SHIFT]] -; CHECK-NEXT: [[RES:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[E_0:%.*]] = load i32, ptr [[TMP1:%.*]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <8 x i32>, ptr [[TMP1]], i32 0, i32 6 +; CHECK-NEXT: [[E_1:%.*]] = load i32, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[RES:%.*]] = add i32 [[E_0]], [[E_1]] ; CHECK-NEXT: ret i32 [[RES]] ; %lv = load <8 x i32>, ptr %x, align 16 `````````` </details> https://github.com/llvm/llvm-project/pull/149815 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org 
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits