> On 19 Dec 2024, at 11:14, Richard Sandiford <richard.sandif...@arm.com> wrote:
> 
> Jennifer Schmitz <jschm...@nvidia.com> writes:
>> @@ -8834,22 +8834,7 @@ vectorizable_store (vec_info *vinfo,
>>              {
>>                if (costing_p)
>>                  {
>> -                   /* Only need vector extracting when there are more
>> -                      than one stores.  */
>> -                   if (nstores > 1)
>> -                     inside_cost
>> -                       += record_stmt_cost (cost_vec, 1, vec_to_scalar,
>> -                                            stmt_info, slp_node,
>> -                                            0, vect_body);
>> -                   /* Take a single lane vector type store as scalar
>> -                      store to avoid ICE like 110776.  */
>> -                   if (VECTOR_TYPE_P (ltype)
>> -                       && known_ne (TYPE_VECTOR_SUBPARTS (ltype), 1U))
>> -                     n_adjacent_stores++;
>> -                   else
>> -                     inside_cost
>> -                       += record_stmt_cost (cost_vec, 1, scalar_store,
>> -                                            stmt_info, 0, vect_body);
>> +                   n_adjacent_stores++;
>>                    continue;
>>                  }
>>                tree newref, newoff;
>> @@ -8905,9 +8890,26 @@ vectorizable_store (vec_info *vinfo,
>>       if (costing_p)
>>      {
>>        if (n_adjacent_stores > 0)
>> -         vect_get_store_cost (vinfo, stmt_info, slp_node, n_adjacent_stores,
>> -                              alignment_support_scheme, misalignment,
>> -                              &inside_cost, cost_vec);
>> +         {
>> +           /* Take a single lane vector type store as scalar
>> +              store to avoid ICE like 110776.  */
>> +           if (VECTOR_TYPE_P (ltype)
>> +               && known_ne (TYPE_VECTOR_SUBPARTS (ltype), 1U))
> 
> Sorry to ask, since it's pre-existing, but could you change this to
> maybe_ne while you're there?  nunits==1+1X should be treated as a vector
> rather than a scalar.
Sure, I made the change (see patch below) and re-validated on aarch64.

It would also be good to check for performance regressions, now that we have a 
patch to test:
I will run SPEC2017 with -mcpu=generic and -mcpu=native on Grace, but we would 
appreciate help with benchmarking on other platforms.
Tamar, would you still be willing to test the patch on other platforms?

If there are no other changes necessary and assuming there are no performance 
regressions, I was planning to commit the patch in January after returning from 
Christmas break.

In the meantime I wish everyone happy holidays.
Jennifer

This patch removes the AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS tunable and
its use_new_vector_costs entry in aarch64-tuning-flags.def and makes the
AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS paths in the backend the default.
To that end, the function aarch64_use_new_vector_costs_p and its uses were
removed. To prevent costing vec_to_scalar operations with a cost of 0, as
described in
https://gcc.gnu.org/pipermail/gcc-patches/2024-October/665481.html,
we adjusted vectorizable_store such that the variable n_adjacent_stores
also covers vec_to_scalar operations. This way, vec_to_scalar operations
are no longer costed individually, but as a group.
As suggested by Richard Sandiford, the "known_ne" in the multi-lane check
was replaced by "maybe_ne" in order to treat nunits==1+1X as a vector
rather than a scalar.
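
For reference, here is a minimal, self-contained C++ sketch of the poly_int
semantics behind the known_ne -> maybe_ne change. It is only an illustration
with simplified, made-up types (poly1, known_ne_p, ...), not GCC's actual
poly-int.h: for nunits == 1 + 1X, known_ne (nunits, 1) is false because the
lane count equals 1 when X == 0, whereas maybe_ne (nunits, 1) is true, so
such a type is now costed as a vector rather than a scalar store.

#include <cassert>

/* Simplified model of a degree-1 polynomial value c0 + c1 * X, where X is
   a nonnegative runtime indeterminate (e.g. extra vector-length units).
   Illustration only: poly1 and the *_p helpers below are made-up names
   and do not mirror GCC's poly-int.h implementation.  */
struct poly1
{
  unsigned long long c0, c1;
};

/* True if A equals the constant B for every X >= 0.  */
static bool
known_eq_p (poly1 a, unsigned long long b)
{
  return a.c1 == 0 && a.c0 == b;
}

/* True if A can equal the constant B for some X >= 0.  */
static bool
maybe_eq_p (poly1 a, unsigned long long b)
{
  if (a.c1 == 0)
    return a.c0 == b;
  return b >= a.c0 && (b - a.c0) % a.c1 == 0;
}

/* known_ne: A and B are never equal; maybe_ne: A and B are not provably
   equal.  */
static bool
known_ne_p (poly1 a, unsigned long long b)
{
  return !maybe_eq_p (a, b);
}

static bool
maybe_ne_p (poly1 a, unsigned long long b)
{
  return !known_eq_p (a, b);
}

int
main ()
{
  poly1 nunits = { 1, 1 };  /* nunits == 1 + 1X: one lane per VL unit.  */

  /* Old check: known_ne (nunits, 1) is false, because the type has
     exactly one lane when X == 0, so the store took the scalar path.  */
  assert (!known_ne_p (nunits, 1));

  /* New check: maybe_ne (nunits, 1) is true, because the type can have
     more than one lane, so it is costed as a vector store.  */
  assert (maybe_ne_p (nunits, 1));
  return 0;
}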

Two tests were adjusted due to changes in codegen. In both cases, the
old code unrolled the loop once, but the new code does not:
Example from gcc.target/aarch64/sve/strided_load_2.c (compiled with
-O2 -ftree-vectorize -march=armv8.2-a+sve -mtune=generic -moverride=tune=none):
f_int64_t_32:
        cbz     w3, .L92
        mov     x4, 0
        uxtw    x3, w3
+       cntd    x5
+       whilelo p7.d, xzr, x3
+       mov     z29.s, w5
        mov     z31.s, w2
-       whilelo p6.d, xzr, x3
-       mov     x2, x3
-       index   z30.s, #0, #1
-       uqdecd  x2
-       ptrue   p5.b, all
-       whilelo p7.d, xzr, x2
+       index   z30.d, #0, #1
+       ptrue   p6.b, all
        .p2align 3,,7
 .L94:
-       ld1d    z27.d, p7/z, [x0, #1, mul vl]
-       ld1d    z28.d, p6/z, [x0]
-       movprfx z29, z31
-       mul     z29.s, p5/m, z29.s, z30.s
-       incw    x4
-       uunpklo z0.d, z29.s
-       uunpkhi z29.d, z29.s
-       ld1d    z25.d, p6/z, [x1, z0.d, lsl 3]
-       ld1d    z26.d, p7/z, [x1, z29.d, lsl 3]
-       add     z25.d, z28.d, z25.d
+       ld1d    z27.d, p7/z, [x0, x4, lsl 3]
+       movprfx z28, z31
+       mul     z28.s, p6/m, z28.s, z30.s
+       ld1d    z26.d, p7/z, [x1, z28.d, uxtw 3]
        add     z26.d, z27.d, z26.d
-       st1d    z26.d, p7, [x0, #1, mul vl]
-       whilelo p7.d, x4, x2
-       st1d    z25.d, p6, [x0]
-       incw    z30.s
-       incb    x0, all, mul #2
-       whilelo p6.d, x4, x3
+       st1d    z26.d, p7, [x0, x4, lsl 3]
+       add     z30.s, z30.s, z29.s
+       incd    x4
+       whilelo p7.d, x4, x3
        b.any   .L94
 .L92:
        ret

Example from gcc.target/aarch64/sve/strided_store_2.c (compiled with
-O2 -ftree-vectorize -march=armv8.2-a+sve -mtune=generic -moverride=tune=none):
f_int64_t_32:
        cbz     w3, .L84
-       addvl   x5, x1, #1
        mov     x4, 0
        uxtw    x3, w3
-       mov     z31.s, w2
+       cntd    x5
        whilelo p7.d, xzr, x3
-       mov     x2, x3
-       index   z30.s, #0, #1
-       uqdecd  x2
-       ptrue   p5.b, all
-       whilelo p6.d, xzr, x2
+       mov     z29.s, w5
+       mov     z31.s, w2
+       index   z30.d, #0, #1
+       ptrue   p6.b, all
        .p2align 3,,7
 .L86:
-       ld1d    z28.d, p7/z, [x1, x4, lsl 3]
-       ld1d    z27.d, p6/z, [x5, x4, lsl 3]
-       movprfx z29, z30
-       mul     z29.s, p5/m, z29.s, z31.s
-       add     z28.d, z28.d, #1
-       uunpklo z26.d, z29.s
-       st1d    z28.d, p7, [x0, z26.d, lsl 3]
-       incw    x4
-       uunpkhi z29.d, z29.s
+       ld1d    z27.d, p7/z, [x1, x4, lsl 3]
+       movprfx z28, z30
+       mul     z28.s, p6/m, z28.s, z31.s
        add     z27.d, z27.d, #1
-       whilelo p6.d, x4, x2
-       st1d    z27.d, p7, [x0, z29.d, lsl 3]
-       incw    z30.s
+       st1d    z27.d, p7, [x0, z28.d, uxtw 3]
+       incd    x4
+       add     z30.s, z30.s, z29.s
        whilelo p7.d, x4, x3
        b.any   .L86
 .L84:
        ret

The patch was bootstrapped and tested on aarch64-linux-gnu with no
regression.
OK for mainline?

Signed-off-by: Jennifer Schmitz <jschm...@nvidia.com>

gcc/
        * tree-vect-stmts.cc (vectorizable_store): Extend the use of
        n_adjacent_stores to also cover vec_to_scalar operations.
        * config/aarch64/aarch64-tuning-flags.def: Remove
        use_new_vector_costs as tuning option.
        * config/aarch64/aarch64.cc (aarch64_use_new_vector_costs_p):
        Remove.
        (aarch64_vector_costs::add_stmt_cost): Remove use of
        aarch64_use_new_vector_costs_p.
        (aarch64_vector_costs::finish_cost): Remove use of
        aarch64_use_new_vector_costs_p.
        * config/aarch64/tuning_models/cortexx925.h: Remove
        AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS.
        * config/aarch64/tuning_models/fujitsu_monaka.h: Likewise.
        * config/aarch64/tuning_models/generic_armv8_a.h: Likewise.
        * config/aarch64/tuning_models/generic_armv9_a.h: Likewise.
        * config/aarch64/tuning_models/neoverse512tvb.h: Likewise.
        * config/aarch64/tuning_models/neoversen2.h: Likewise.
        * config/aarch64/tuning_models/neoversen3.h: Likewise.
        * config/aarch64/tuning_models/neoversev1.h: Likewise.
        * config/aarch64/tuning_models/neoversev2.h: Likewise.
        * config/aarch64/tuning_models/neoversev3.h: Likewise.
        * config/aarch64/tuning_models/neoversev3ae.h: Likewise.

gcc/testsuite/
        * gcc.target/aarch64/sve/strided_load_2.c: Adjust expected outcome.
        * gcc.target/aarch64/sve/strided_store_2.c: Likewise.
---
 gcc/config/aarch64/aarch64-tuning-flags.def   |  2 -
 gcc/config/aarch64/aarch64.cc                 | 20 ++--------
 gcc/config/aarch64/tuning_models/cortexx925.h |  1 -
 .../aarch64/tuning_models/fujitsu_monaka.h    |  1 -
 .../aarch64/tuning_models/generic_armv8_a.h   |  1 -
 .../aarch64/tuning_models/generic_armv9_a.h   |  1 -
 .../aarch64/tuning_models/neoverse512tvb.h    |  1 -
 gcc/config/aarch64/tuning_models/neoversen2.h |  1 -
 gcc/config/aarch64/tuning_models/neoversen3.h |  1 -
 gcc/config/aarch64/tuning_models/neoversev1.h |  1 -
 gcc/config/aarch64/tuning_models/neoversev2.h |  1 -
 gcc/config/aarch64/tuning_models/neoversev3.h |  1 -
 .../aarch64/tuning_models/neoversev3ae.h      |  1 -
 .../gcc.target/aarch64/sve/strided_load_2.c   |  2 +-
 .../gcc.target/aarch64/sve/strided_store_2.c  |  2 +-
 gcc/tree-vect-stmts.cc                        | 40 ++++++++++---------
 16 files changed, 27 insertions(+), 50 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index ffbff20e29c..1de633c739b 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -38,8 +38,6 @@ AARCH64_EXTRA_TUNING_OPTION ("cheap_shift_extend", CHEAP_SHIFT_EXTEND)
 
 AARCH64_EXTRA_TUNING_OPTION ("cse_sve_vl_constants", CSE_SVE_VL_CONSTANTS)
 
-AARCH64_EXTRA_TUNING_OPTION ("use_new_vector_costs", USE_NEW_VECTOR_COSTS)
-
 AARCH64_EXTRA_TUNING_OPTION ("matched_vector_throughput", MATCHED_VECTOR_THROUGHPUT)
 
 AARCH64_EXTRA_TUNING_OPTION ("avoid_cross_loop_fma", AVOID_CROSS_LOOP_FMA)
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 77a2a6bfa3a..71fba9cc63b 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -16627,16 +16627,6 @@ aarch64_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
   return new aarch64_vector_costs (vinfo, costing_for_scalar);
 }
 
-/* Return true if the current CPU should use the new costs defined
-   in GCC 11.  This should be removed for GCC 12 and above, with the
-   costs applying to all CPUs instead.  */
-static bool
-aarch64_use_new_vector_costs_p ()
-{
-  return (aarch64_tune_params.extra_tuning_flags
-         & AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS);
-}
-
 /* Return the appropriate SIMD costs for vectors of type VECTYPE.  */
 static const simd_vec_cost *
 aarch64_simd_vec_costs (tree vectype)
@@ -17555,7 +17545,7 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
 
   /* Do one-time initialization based on the vinfo.  */
   loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
-  if (!m_analyzed_vinfo && aarch64_use_new_vector_costs_p ())
+  if (!m_analyzed_vinfo)
     {
       if (loop_vinfo)
        analyze_loop_vinfo (loop_vinfo);
@@ -17573,7 +17563,7 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
 
   /* Try to get a more accurate cost by looking at STMT_INFO instead
      of just looking at KIND.  */
-  if (stmt_info && aarch64_use_new_vector_costs_p ())
+  if (stmt_info)
     {
       /* If we scalarize a strided store, the vectorizer costs one
         vec_to_scalar for each element.  However, we can store the first
@@ -17638,7 +17628,7 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
   else
     m_num_last_promote_demote = 0;
 
-  if (stmt_info && aarch64_use_new_vector_costs_p ())
+  if (stmt_info)
     {
       /* Account for any extra "embedded" costs that apply additively
         to the base cost calculated above.  */
@@ -17999,9 +17989,7 @@ aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
 
   auto *scalar_costs
     = static_cast<const aarch64_vector_costs *> (uncast_scalar_costs);
-  if (loop_vinfo
-      && m_vec_flags
-      && aarch64_use_new_vector_costs_p ())
+  if (loop_vinfo && m_vec_flags)
     {
       m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
                                             m_costs[vect_body]);
diff --git a/gcc/config/aarch64/tuning_models/cortexx925.h b/gcc/config/aarch64/tuning_models/cortexx925.h
index 5ebaf66e986..74772f3e15f 100644
--- a/gcc/config/aarch64/tuning_models/cortexx925.h
+++ b/gcc/config/aarch64/tuning_models/cortexx925.h
@@ -221,7 +221,6 @@ static const struct tune_params cortexx925_tunings =
   tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_BASE
    | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
    | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW),       /* tune_flags.  */
   &generic_armv9a_prefetch_tune,
diff --git a/gcc/config/aarch64/tuning_models/fujitsu_monaka.h b/gcc/config/aarch64/tuning_models/fujitsu_monaka.h
index 2d704ecd110..a564528f43d 100644
--- a/gcc/config/aarch64/tuning_models/fujitsu_monaka.h
+++ b/gcc/config/aarch64/tuning_models/fujitsu_monaka.h
@@ -55,7 +55,6 @@ static const struct tune_params fujitsu_monaka_tunings =
   0,   /* max_case_values.  */
   tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_BASE
-   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),    /* tune_flags.  */
   &generic_prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
diff --git a/gcc/config/aarch64/tuning_models/generic_armv8_a.h b/gcc/config/aarch64/tuning_models/generic_armv8_a.h
index bdd309ab03d..f090d5cde50 100644
--- a/gcc/config/aarch64/tuning_models/generic_armv8_a.h
+++ b/gcc/config/aarch64/tuning_models/generic_armv8_a.h
@@ -183,7 +183,6 @@ static const struct tune_params generic_armv8_a_tunings =
   tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_BASE
    | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),    /* tune_flags.  */
   &generic_prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
diff --git a/gcc/config/aarch64/tuning_models/generic_armv9_a.h b/gcc/config/aarch64/tuning_models/generic_armv9_a.h
index 785e00946bc..7b5821183bc 100644
--- a/gcc/config/aarch64/tuning_models/generic_armv9_a.h
+++ b/gcc/config/aarch64/tuning_models/generic_armv9_a.h
@@ -251,7 +251,6 @@ static const struct tune_params generic_armv9_a_tunings =
   0,   /* max_case_values.  */
   tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_BASE
-   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),    /* tune_flags.  */
   &generic_armv9a_prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
diff --git a/gcc/config/aarch64/tuning_models/neoverse512tvb.h b/gcc/config/aarch64/tuning_models/neoverse512tvb.h
index 007f987154c..f7457df59e5 100644
--- a/gcc/config/aarch64/tuning_models/neoverse512tvb.h
+++ b/gcc/config/aarch64/tuning_models/neoverse512tvb.h
@@ -156,7 +156,6 @@ static const struct tune_params neoverse512tvb_tunings =
   0,   /* max_case_values.  */
   tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),    /* tune_flags.  */
   &generic_armv9a_prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
diff --git a/gcc/config/aarch64/tuning_models/neoversen2.h b/gcc/config/aarch64/tuning_models/neoversen2.h
index 32560d2f5f8..541b61c8179 100644
--- a/gcc/config/aarch64/tuning_models/neoversen2.h
+++ b/gcc/config/aarch64/tuning_models/neoversen2.h
@@ -219,7 +219,6 @@ static const struct tune_params neoversen2_tunings =
   tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_BASE
    | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
    | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW),       /* tune_flags.  */
   &generic_armv9a_prefetch_tune,
diff --git a/gcc/config/aarch64/tuning_models/neoversen3.h b/gcc/config/aarch64/tuning_models/neoversen3.h
index 2010bc4645b..eff668132a8 100644
--- a/gcc/config/aarch64/tuning_models/neoversen3.h
+++ b/gcc/config/aarch64/tuning_models/neoversen3.h
@@ -219,7 +219,6 @@ static const struct tune_params neoversen3_tunings =
   tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_BASE
    | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),    /* tune_flags.  */
   &generic_armv9a_prefetch_tune,
   AARCH64_LDP_STP_POLICY_ALWAYS,   /* ldp_policy_model.  */
diff --git a/gcc/config/aarch64/tuning_models/neoversev1.h b/gcc/config/aarch64/tuning_models/neoversev1.h
index c3751e32696..d11472b6e1e 100644
--- a/gcc/config/aarch64/tuning_models/neoversev1.h
+++ b/gcc/config/aarch64/tuning_models/neoversev1.h
@@ -228,7 +228,6 @@ static const struct tune_params neoversev1_tunings =
   tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_BASE
    | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
    | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW),       /* tune_flags.  */
   &generic_armv9a_prefetch_tune,
diff --git a/gcc/config/aarch64/tuning_models/neoversev2.h b/gcc/config/aarch64/tuning_models/neoversev2.h
index 80dbe5c806c..ee77ffdd3bc 100644
--- a/gcc/config/aarch64/tuning_models/neoversev2.h
+++ b/gcc/config/aarch64/tuning_models/neoversev2.h
@@ -219,7 +219,6 @@ static const struct tune_params neoversev2_tunings =
   tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_BASE
    | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
    | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW
    | AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA),  /* tune_flags.  */
diff --git a/gcc/config/aarch64/tuning_models/neoversev3.h b/gcc/config/aarch64/tuning_models/neoversev3.h
index efe09e16d1e..6ef143ef7d5 100644
--- a/gcc/config/aarch64/tuning_models/neoversev3.h
+++ b/gcc/config/aarch64/tuning_models/neoversev3.h
@@ -219,7 +219,6 @@ static const struct tune_params neoversev3_tunings =
   tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_BASE
    | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
    | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW),       /* tune_flags.  */
   &generic_armv9a_prefetch_tune,
diff --git a/gcc/config/aarch64/tuning_models/neoversev3ae.h b/gcc/config/aarch64/tuning_models/neoversev3ae.h
index 66849f30889..96bdbf971f1 100644
--- a/gcc/config/aarch64/tuning_models/neoversev3ae.h
+++ b/gcc/config/aarch64/tuning_models/neoversev3ae.h
@@ -219,7 +219,6 @@ static const struct tune_params neoversev3ae_tunings =
   tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
   (AARCH64_EXTRA_TUNE_BASE
    | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-   | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
    | AARCH64_EXTRA_TUNE_AVOID_PRED_RMW),       /* tune_flags.  */
   &generic_armv9a_prefetch_tune,
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/strided_load_2.c b/gcc/testsuite/gcc.target/aarch64/sve/strided_load_2.c
index 762805ff54b..c334b7a6875 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/strided_load_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/strided_load_2.c
@@ -15,4 +15,4 @@
    so we vectorize the offset calculation.  This means that the
    64-bit version needs two copies.  */
 /* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, uxtw 2\]\n} 3 } } */
-/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 15 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 9 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/strided_store_2.c b/gcc/testsuite/gcc.target/aarch64/sve/strided_store_2.c
index f0ea58e38e2..94cc63049bc 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/strided_store_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/strided_store_2.c
@@ -15,4 +15,4 @@
    so we vectorize the offset calculation.  This means that the
    64-bit version needs two copies.  */
 /* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, uxtw 2\]\n} 3 } } */
-/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 15 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 9 } } */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index be1139a423c..09c048ec00c 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -8834,22 +8834,7 @@ vectorizable_store (vec_info *vinfo,
                {
                  if (costing_p)
                    {
-                     /* Only need vector extracting when there are more
-                        than one stores.  */
-                     if (nstores > 1)
-                       inside_cost
-                         += record_stmt_cost (cost_vec, 1, vec_to_scalar,
-                                              stmt_info, slp_node,
-                                              0, vect_body);
-                     /* Take a single lane vector type store as scalar
-                        store to avoid ICE like 110776.  */
-                     if (VECTOR_TYPE_P (ltype)
-                         && known_ne (TYPE_VECTOR_SUBPARTS (ltype), 1U))
-                       n_adjacent_stores++;
-                     else
-                       inside_cost
-                         += record_stmt_cost (cost_vec, 1, scalar_store,
-                                              stmt_info, 0, vect_body);
+                     n_adjacent_stores++;
                      continue;
                    }
                  tree newref, newoff;
@@ -8905,9 +8890,26 @@ vectorizable_store (vec_info *vinfo,
       if (costing_p)
        {
          if (n_adjacent_stores > 0)
-           vect_get_store_cost (vinfo, stmt_info, slp_node, n_adjacent_stores,
-                                alignment_support_scheme, misalignment,
-                                &inside_cost, cost_vec);
+           {
+             /* Take a single lane vector type store as scalar
+                store to avoid ICE like 110776.  */
+             if (VECTOR_TYPE_P (ltype)
+                 && maybe_ne (TYPE_VECTOR_SUBPARTS (ltype), 1U))
+               vect_get_store_cost (vinfo, stmt_info, slp_node,
+                                    n_adjacent_stores, alignment_support_scheme,
+                                    misalignment, &inside_cost, cost_vec);
+             else
+               inside_cost
+                 += record_stmt_cost (cost_vec, n_adjacent_stores,
+                                      scalar_store, stmt_info, 0, vect_body);
+             /* Only need vector extracting when there are more
+                than one stores.  */
+             if (nstores > 1)
+               inside_cost
+                 += record_stmt_cost (cost_vec, n_adjacent_stores,
+                                      vec_to_scalar, stmt_info, slp_node,
+                                      0, vect_body);
+           }
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "vect_model_store_cost: inside_cost = %d, "
-- 
2.44.0

> 
> Thanks,
> Richard
> 
>> +             vect_get_store_cost (vinfo, stmt_info, slp_node,
>> +                                  n_adjacent_stores, alignment_support_scheme,
>> +                                  misalignment, &inside_cost, cost_vec);
>> +           else
>> +             inside_cost
>> +               += record_stmt_cost (cost_vec, n_adjacent_stores,
>> +                                    scalar_store, stmt_info, 0, vect_body);
>> +           /* Only need vector extracting when there are more
>> +              than one stores.  */
>> +           if (nstores > 1)
>> +             inside_cost
>> +               += record_stmt_cost (cost_vec, n_adjacent_stores,
>> +                                    vec_to_scalar, stmt_info, slp_node,
>> +                                    0, vect_body);
>> +         }
>>        if (dump_enabled_p ())
>>          dump_printf_loc (MSG_NOTE, vect_location,
>>                           "vect_model_store_cost: inside_cost = %d, "
