The final loop in the PR s313 is fixed by removing the else branch that shouldn't
be there now that we look at a ratio of dups instead of a yes/no.
By setting m_loop_fully_scalar_dup to false, the else branch makes the final
costing think there isn't any dup at all in the inner loop.
This returns the codegen to what it was for GCC 15. However, as the PR mentions,
-O3 did improve with the broken costing. The reason for this was that it was
preventing unrolling at -O3.
The loop generates at -O3:
.L5:
add w2, w2, w7
ld1w z26.s, p7/z, [x1]
ld1w z29.s, p7/z, [x0]
ld1w z2.s, p7/z, [x1, #1, mul vl]
ld1w z27.s, p7/z, [x0, #1, mul vl]
ld1w z1.s, p7/z, [x1, #2, mul vl]
ld1w z30.s, p7/z, [x0, #2, mul vl]
ld1w z0.s, p7/z, [x1, #3, mul vl]
ld1w z28.s, p7/z, [x0, #3, mul vl]
fmul z29.s, z26.s, z29.s
fmul z27.s, z2.s, z27.s
fadda s31, p7, s31, z29.s
fmul z30.s, z1.s, z30.s
fadda s31, p7, s31, z27.s
fmul z28.s, z0.s, z28.s
fadda s31, p7, s31, z30.s
add x1, x1, x3
add x0, x0, x3
fadda s31, p7, s31, z28.s
cmp w2, w6
bls .L5
Which is silly due to the limited throughput of the instructions in this loop.
The left fold reduction is a single cycle reduction since the unrolling needs
the accumulation to happen in the same scalar.
However aarch64_force_single_cycle doesn't detect this and so the costing of
the loop is wrong. Even with that fixed, though, the cost model still thinks the
unrolling is beneficial because the throughput restrictions for the reductions
aren't modeled, and to get it to reject the unrolling the costs would have to be
increased unrealistically.
The better approach, I think, would be for us to model pipeline restrictions in
more detail, so that we have the ability to say that the above is a linear
reduction chain. However, since that's not a regression and needs quite a bit of
work, I have punted it to GCC 17.
Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
Pushed.
Thanks,
Tamar
gcc/ChangeLog:
PR target/121290
* config/aarch64/aarch64.cc (aarch64_vector_costs::add_stmt_cost):
Remove else.
gcc/testsuite/ChangeLog:
PR target/121290
* gcc.target/aarch64/pr121290_3.c: New test.
---
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index
0712739579091d4696463746a9c5f7b65f5e97ad..29d2400a6e9d6277ee5ed283ff8e11041d920435
100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -18614,8 +18614,6 @@ aarch64_vector_costs::add_stmt_cost (int count,
vect_cost_for_stmt kind,
|| SLP_TREE_DEF_TYPE (node) == vect_external_def)
&& !aarch64_possible_by_lane_insn_p (m_vinfo, stmt))
m_num_dup_stmts++;
- else
- m_loop_fully_scalar_dup = false;
}
/* Apply the heuristic described above m_stp_sequence_cost. */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr121290_3.c
b/gcc/testsuite/gcc.target/aarch64/pr121290_3.c
new file mode 100644
index
0000000000000000000000000000000000000000..1cb760ede4141adde839efa89dddce4bdb6eb244
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr121290_3.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -mcpu=neoverse-v2 -fdump-tree-vect-all
-std=c99" } */
+
+#define iterations 100000
+#define LEN_1D 32000
+
+float a[LEN_1D];
+float b[LEN_1D];
+
+int main()
+{
+ float dot;
+ for (int nl = 0; nl < iterations*5; nl++) {
+ dot = 0.0f;
+ for (int i = 0; i < LEN_1D; i++) {
+ dot += a[i] * b[i];
+ }
+ }
+
+ return dot;
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump-not "OUTER LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "low throughput of per iteration due to splats"
"vect" } } */
--
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 0712739579091d4696463746a9c5f7b65f5e97ad..29d2400a6e9d6277ee5ed283ff8e11041d920435 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -18614,8 +18614,6 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
|| SLP_TREE_DEF_TYPE (node) == vect_external_def)
&& !aarch64_possible_by_lane_insn_p (m_vinfo, stmt))
m_num_dup_stmts++;
- else
- m_loop_fully_scalar_dup = false;
}
/* Apply the heuristic described above m_stp_sequence_cost. */
diff --git a/gcc/testsuite/gcc.target/aarch64/pr121290_3.c b/gcc/testsuite/gcc.target/aarch64/pr121290_3.c
new file mode 100644
index 0000000000000000000000000000000000000000..1cb760ede4141adde839efa89dddce4bdb6eb244
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr121290_3.c
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -mcpu=neoverse-v2 -fdump-tree-vect-all -std=c99" } */
+
+#define iterations 100000
+#define LEN_1D 32000
+
+float a[LEN_1D];
+float b[LEN_1D];
+
+int main()
+{
+ float dot;
+ for (int nl = 0; nl < iterations*5; nl++) {
+ dot = 0.0f;
+ for (int i = 0; i < LEN_1D; i++) {
+ dot += a[i] * b[i];
+ }
+ }
+
+ return dot;
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump-not "OUTER LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "low throughput of per iteration due to splats" "vect" } } */