Consider the example:

void
f (int *restrict x, int *restrict y, int *restrict z, int n)
{
  for (int i = 0; i < 4; ++i)
    {
      int res = 0;
      for (int j = 0; j < 100; ++j)
        res += y[j] * z[i];
      x[i] = res;
    }
}
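To make the problem discussed below concrete: the vector code we currently
generate for this nest consumes only one new element of y per loop iteration
and applies that single scalar to every lane.  A hand-written scalar sketch of
that execution shape (illustration only, not compiler output; the function name
f_splat_shape is made up) is:

void
f_splat_shape (int *restrict x, int *restrict y, int *restrict z, int n)
{
  int acc[4] = { 0, 0, 0, 0 };        /* the accumulator vector (v30.4s below) */
  /* z[0..3] is loaded once before the loop (q31 below).  */
  for (int j = 0; j < 100; ++j)       /* one trip per vector iteration */
    for (int lane = 0; lane < 4; ++lane)
      acc[lane] += y[j] * z[lane];    /* every lane reuses the same y[j] */
  for (int i = 0; i < 4; ++i)
    x[i] = acc[i];                    /* str q30, [x0] below */
}

The sketch computes exactly what f computes; the problem is that the loop still
advances through y one element at a time, just like the scalar inner loop.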
we currently vectorize this as:

f:
        movi    v30.4s, 0
        ldr     q31, [x2]
        add     x2, x1, 400
.L2:
        ld1r    {v29.4s}, [x1], 4
        mla     v30.4s, v29.4s, v31.4s
        cmp     x2, x1
        bne     .L2
        str     q30, [x0]
        ret

This is not useful, because by doing inner-loop vectorization we perform less
work per iteration than we would have had we done outer-loop vectorization and
simply unrolled the inner loop.

This patch teaches the cost model that if all the leaves are invariant, the
loop body cost should be scaled by VF, since every vector iteration is really
only doing the work of one scalar iteration.

We now cost the loop as:

note:  Cost model analysis:
  Vector inside of loop cost: 2000
  Vector prologue cost: 4
  Vector epilogue cost: 0
  Scalar iteration cost: 300
  Scalar outside cost: 0
  Vector outside cost: 4
  prologue iterations: 0
  epilogue iterations: 0
missed:  cost model: the vector iteration cost = 2000 divided by the scalar
  iteration cost = 300 is greater or equal to the vectorization factor = 4.
missed:  not vectorized: vectorization not profitable.
missed:  not vectorized: vector version will never be profitable.
missed:  Loop costings may not be worthwhile.
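To spell out the arithmetic behind these numbers (my reading of the dump above,
not extra compiler output): the vectorization factor is 4, so the scaling this
patch introduces turns an unscaled body cost of 500 (implied by the dump) into
the reported 2000, which then fails the profitability check:

  unscaled vector body cost:  2000 / 4 = 500
  scaled vector body cost:    500 * VF (4) = 2000
  profitability check:        2000 / 300 ~= 6.7 >= VF (4)  => not profitable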
We subsequently generate:

.L5:
        add     w4, w4, w7
        ld1w    z24.s, p6/z, [x0, #1, mul vl]
        ld1w    z23.s, p6/z, [x0, #2, mul vl]
        ld1w    z22.s, p6/z, [x0, #3, mul vl]
        ld1w    z29.s, p6/z, [x0]
        mla     z26.s, p6/m, z24.s, z30.s
        add     x0, x0, x8
        mla     z27.s, p6/m, z23.s, z30.s
        mla     z28.s, p6/m, z22.s, z30.s
        mla     z25.s, p6/m, z29.s, z30.s
        cmp     w4, w6
        bls     .L5

which avoids the load and replicate when the compiler knows it has enough
vector pipes to do so.

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues; this also
fixes all the reported TSVC regressions.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	PR target/121290
	* config/aarch64/aarch64.cc (class aarch64_vector_costs): Add
	m_loop_fully_scalar_dup.
	(aarch64_vector_costs::add_stmt_cost): Detect invariant inner loops.
	(adjust_body_cost): Adjust final costing if m_loop_fully_scalar_dup.

gcc/testsuite/ChangeLog:

	PR target/121290
	* gcc.target/aarch64/pr121290.c: Update testcase.

---

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 1278d563d5efa08b6a9871f69b63f0786877bf99..12ef977854a14d92f94b8938bee0d28890b85e0d 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -17057,6 +17057,14 @@ private:
      or vector loop.  There is one entry for each tuning option of
      interest.  */
   auto_vec<aarch64_vec_op_count, 2> m_ops;
+
+  /* When doing inner-loop vectorization the constraints on the data-refs in the
+     outer-loop could limit the inner loop references.  i.e. the outerloop can
+     force the inner-loop to do a load and splat which will result in the loop
+     being entirely scalar as all lanes work on a duplicate.  This flag
+     indicates when such cases occur and the loop is working on fully splatted
+     lanes.  */
+  bool m_loop_fully_scalar_dup = false;
 };
 
 aarch64_vector_costs::aarch64_vector_costs (vec_info *vinfo,
@@ -18079,6 +18087,29 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
 	  analyze_loop_vinfo (loop_vinfo);
 
 	  m_analyzed_vinfo = true;
+	  if (in_inner_loop_p)
+	    m_loop_fully_scalar_dup = true;
+	}
+
+      /* Detect whether the loop is working on fully duplicated lanes.  This would
+	 only be possible with inner loop vectorization since otherwise we wouldn't
+	 try to vectorize.  */
+      if (in_inner_loop_p
+	  && node
+	  && m_loop_fully_scalar_dup
+	  && SLP_TREE_LANES (node) == 1
+	  && !SLP_TREE_CHILDREN (node).exists ())
+	{
+	  /* Check if load is a duplicate.  */
+	  if (gimple_vuse (stmt_info->stmt))
+	    m_loop_fully_scalar_dup
+	      = m_loop_fully_scalar_dup
+		&& SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_INVARIANT;
+	  else if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
+		   || SLP_TREE_DEF_TYPE (node) == vect_external_def)
+	    ;
+	  else
+	    m_loop_fully_scalar_dup = false;
 	}
 
       /* Apply the heuristic described above m_stp_sequence_cost.  */
@@ -18445,8 +18476,22 @@ adjust_body_cost (loop_vec_info loop_vinfo,
   if (m_vec_flags & VEC_ANY_SVE)
     threshold = CEIL (threshold, aarch64_estimated_sve_vq ());
 
-  if (m_num_vector_iterations >= 1
-      && m_num_vector_iterations < threshold)
+  /* Increase the cost of the vector code if it looks like the vector code is
+     just executing the scalar code but as vector.  i.e. has every lane
+     duplicated from scalar.  In this case throughput of the vector code is 1
+     scalar element per iteration and vectorization is not profitable.  Scale
+     the code by VF to ensure it never is.  */
+  if (m_loop_fully_scalar_dup)
+    {
+      body_cost *= estimated_vf;
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_NOTE, vect_location,
+			 "Increasing body cost to %d because vector code has"
+			 " throughput of 1 scalar element per iteration\n",
+			 body_cost);
+    }
+  else if (m_num_vector_iterations >= 1
+	   && m_num_vector_iterations < threshold)
     {
       if (dump_enabled_p ())
 	dump_printf_loc (MSG_NOTE, vect_location,
diff --git a/gcc/testsuite/gcc.target/aarch64/pr121290.c b/gcc/testsuite/gcc.target/aarch64/pr121290.c
index c88cb7e6979ef43ebaf14c3d3f3c4c561bff3e76..566ed0007f851e7f8831f487fa3f60a17aa5b19a 100644
--- a/gcc/testsuite/gcc.target/aarch64/pr121290.c
+++ b/gcc/testsuite/gcc.target/aarch64/pr121290.c
@@ -14,4 +14,6 @@ f (int *restrict x, int *restrict y, int *restrict z, int n)
 }
 
 /* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump-not "OUTER LOOP VECTORIZED" "vect" } } */
 /* { dg-final { scan-tree-dump-not "load operations = 0" "vect" } } */
+/* { dg-final { scan-tree-dump "throughput of 1 scalar element per iteration" "vect" } } */

--