In some benchmarks, I noticed that STV fails because the conversion is judged unprofitable, even though the instruction gain (igain) comes from insns inside a loop while the SSE<->integer conversions are outside the loop. The current cost model does not take the execution frequency of those gains and costs into account. This patch weights them by basic-block frequency, just like LRA does.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. Ok for GCC 16? gcc/ChangeLog: * config/i386/i386-features.cc (scalar_chain::mark_dual_mode_def): Weight n_integer_to_sse and n_sse_to_integer by the frequency of the def's basic block. (general_scalar_chain::compute_convert_gain): Weight igain by the frequency of the insn's basic block. --- gcc/config/i386/i386-features.cc | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc index c35ac24fd8a..ae0844a70c2 100644 --- a/gcc/config/i386/i386-features.cc +++ b/gcc/config/i386/i386-features.cc @@ -337,18 +337,20 @@ scalar_chain::mark_dual_mode_def (df_ref def) /* Record the def/insn pair so we can later efficiently iterate over the defs to convert on insns not in the chain. */ bool reg_new = bitmap_set_bit (defs_conv, DF_REF_REGNO (def)); + unsigned frequency + = REG_FREQ_FROM_BB (BLOCK_FOR_INSN (DF_REF_INSN (def))); if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def))) { if (!bitmap_set_bit (insns_conv, DF_REF_INSN_UID (def)) && !reg_new) return; - n_integer_to_sse++; + n_integer_to_sse += frequency; } else { if (!reg_new) return; - n_sse_to_integer++; + n_sse_to_integer += frequency; } if (dump_file) @@ -556,6 +558,8 @@ general_scalar_chain::compute_convert_gain () rtx src = SET_SRC (def_set); rtx dst = SET_DEST (def_set); int igain = 0; + unsigned frequency + = REG_FREQ_FROM_BB (BLOCK_FOR_INSN (insn)); if (REG_P (src) && REG_P (dst)) igain += 2 * m - ix86_cost->xmm_move; @@ -755,6 +759,7 @@ general_scalar_chain::compute_convert_gain () } } + igain *= frequency; if (igain != 0 && dump_file) { fprintf (dump_file, " Instruction gain %d for ", igain); -- 2.34.1