> > I am generally trying to get rid of remaing uses of REG_FREQ since the > > 10000 based fixed point arithmetics iot always working that well. > > > > You can do the sums in profile_count type (doing something reasonable > > when count is uninitialized) and then convert it to sreal for the final > > heuristics. > > Changed, when count_max is initialized and function is not optimized to size, > use > bb_count.initialized_p > ? bb_count.probability_in (count_max).to_sreal () > : 0.1; > > Otherwise, just use 1. > > > n some benchmark, I notice stv failed due to cost unprofitable, but the igain > is inside the loop, but sse<->integer conversion is outside the loop, current > cost > model doesn't consider the frequency of those gain/cost. > The patch weights those cost with frequency. > > The patch regressed gcc.target/i386/minmax-6.c under -m32 Since the > place of integer<->sse is before the branch, and the conversion to > min/max is in the branch, with static profile, the cost model is not > profitable anymore which is exactly the patch try to do. > Considering the original testcase is to guard RA issue, so restrict > the testcase under ! ia32 should still be ok. > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. > Ok for trunk? > > gcc/ChangeLog: > > * config/i386/i386-features.cc (get_bb_probability): New function. > (scalar_chain::mark_dual_mode_def): Weight > n_integer_to_sse/n_sse_to_integer with bb frequency. > (general_scalar_chain::compute_convert_gain): Ditto, and > adjust function prototype to return true/false when cost model > is profitable or not. > (timode_scalar_chain::compute_convert_gain): Ditto. > (convert_scalars_to_vector): Adjust after the upper two > function prototype are changed. > * config/i386/i386-features.h (class scalar_chain): Change > n_integer_to_sse/n_sse_to_integer from int to sreal. > (class general_scalar_chain): Adjust prototype to return bool > intead of int. > (class timode_scalar_chain): Ditto. > > gcc/testsuite/ChangeLog: > * gcc.target/i386/minmax-6.c: Adjust testcase. > --- > gcc/config/i386/i386-features.cc | 83 +++++++++++++++++------- > gcc/config/i386/i386-features.h | 10 +-- > gcc/testsuite/gcc.target/i386/minmax-6.c | 2 +- > 3 files changed, 65 insertions(+), 30 deletions(-) > > diff --git a/gcc/config/i386/i386-features.cc > b/gcc/config/i386/i386-features.cc > index c35ac24fd8a..57dcd94c211 100644 > --- a/gcc/config/i386/i386-features.cc > +++ b/gcc/config/i386/i386-features.cc > @@ -326,6 +326,30 @@ scalar_chain::add_to_queue (unsigned insn_uid) > insn_uid, chain_id); > } > > +/* Return BB probablity in cfun->cfg->count_max, > + if count_max is uninitialized or optimized_for_size, return 1. > + else if bb count is uninitialized, return 0.1 as a rough estimation. > + else return bb_count.probablity_in (count_max). */ > +static sreal > +get_bb_probability (basic_block bb)
I think you want relative bb frequency that you can get by: sreal_freq = bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (fun)->count) The value returned will be 1 if BB is executed once per invocation of functions and >1 if it is in loop etc. Also if BB count is uninitialized it will return 1 that is probably reasonable deafult. > +{ > + profile_count bb_count = bb->count; > + profile_count count_max = cfun->cfg->count_max; > + sreal sreal_count = 1; > + if (count_max.initialized_p () > + && !optimize_function_for_size_p (cfun)) > + { > + if (bb_count.initialized_p ()) > + sreal_count = bb_count.probability_in (count_max).to_sreal (); > + else > + /* For unintialized BB, weight it with 0.1, > + ??? just like REG_FREQ_MAX / BB_FREQ_MAX. */ > + sreal_count = (sreal) 1 / (sreal) 10; > + } > + > + return sreal_count; > +} > + > /* For DImode conversion, mark register defined by DEF as requiring > conversion. */ > > @@ -337,18 +361,20 @@ scalar_chain::mark_dual_mode_def (df_ref def) > /* Record the def/insn pair so we can later efficiently iterate over > the defs to convert on insns not in the chain. */ > bool reg_new = bitmap_set_bit (defs_conv, DF_REF_REGNO (def)); > + sreal sreal_count = get_bb_probability (BLOCK_FOR_INSN (DF_REF_INSN > (def))); > + > if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def))) > { > if (!bitmap_set_bit (insns_conv, DF_REF_INSN_UID (def)) > && !reg_new) > return; > - n_integer_to_sse++; > + n_integer_to_sse += sreal_count; > } > else > { > if (!reg_new) > return; > - n_sse_to_integer++; > + n_sse_to_integer += sreal_count; Generaly I am trying to sum counts in profile_count data type and only later convert them to relative frequencies. This will work too, but + of sreal is bit more expensive. > } > > if (dump_file) > @@ -529,15 +555,15 @@ general_scalar_chain::vector_const_cost (rtx exp) > return ix86_cost->sse_load[smode == DImode ? 1 : 0]; > } > > -/* Compute a gain for chain conversion. */ > +/* Return true if it's cost profitable for chain conversion. */ > > -int > +bool > general_scalar_chain::compute_convert_gain () > { > bitmap_iterator bi; > unsigned insn_uid; > - int gain = 0; > - int cost = 0; > + sreal gain = 0; > + sreal cost = 0; so gain is the difference of runtime of integer variant compared to vector vairant and cost are the extra int->see and sse->int conversions needed? If you scale everything by a BB frequency, you will get a weird behaviour if chain happens to consist only of instructions in BBs with 0 frequency (that commonly happens in FDO). I think you want to look at the chain first and see if it contains a hot instruction. If it does, use performance metrics (ix86_cost and scaling by BB frequencies) if it does not use size metrics (ix86_size_cost and all scales 1) Honza > > if (dump_file) > fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id); > @@ -556,6 +582,7 @@ general_scalar_chain::compute_convert_gain () > rtx src = SET_SRC (def_set); > rtx dst = SET_DEST (def_set); > int igain = 0; > + sreal sreal_count = get_bb_probability (BLOCK_FOR_INSN (insn)); > > if (REG_P (src) && REG_P (dst)) > igain += 2 * m - ix86_cost->xmm_move; > @@ -755,16 +782,19 @@ general_scalar_chain::compute_convert_gain () > } > } > > + sreal real_gain = sreal_count * igain; > if (igain != 0 && dump_file) > { > - fprintf (dump_file, " Instruction gain %d for ", igain); > + fprintf (dump_file, " Instruction gain %.2f for ", > + real_gain.to_double ()); > dump_insn_slim (dump_file, insn); > } > - gain += igain; > + gain += real_gain; > } > > if (dump_file) > - fprintf (dump_file, " Instruction conversion gain: %d\n", gain); > + fprintf (dump_file, " Instruction conversion gain: %.2f\n", > + gain.to_double ()); > > /* Cost the integer to sse and sse to integer moves. */ > if (!optimize_function_for_size_p (cfun)) > @@ -796,14 +826,14 @@ general_scalar_chain::compute_convert_gain () > } > > if (dump_file) > - fprintf (dump_file, " Registers conversion cost: %d\n", cost); > - > - gain -= cost; > + fprintf (dump_file, " Registers conversion cost: %.2f\n", > + cost.to_double ()); > > if (dump_file) > - fprintf (dump_file, " Total gain: %d\n", gain); > + fprintf (dump_file, " Total gain: %.2f\n, total cost: %.2f", > + gain.to_double (), cost.to_double ()); > > - return gain; > + return gain > cost ; > } > > /* Insert generated conversion instruction sequence INSNS > @@ -1520,21 +1550,22 @@ timode_immed_const_gain (rtx cst) > return 0; > } > > -/* Compute a gain for chain conversion. */ > +/* Return true it's cost profitable for for chain conversion. */ > > -int > +bool > timode_scalar_chain::compute_convert_gain () > { > /* Assume that if we have to move TImode values between units, > then transforming this chain isn't worth it. */ > - if (n_sse_to_integer || n_integer_to_sse) > - return -1; > + if (n_sse_to_integer.to_int () > + || n_integer_to_sse.to_int ()) > + return false; > > bitmap_iterator bi; > unsigned insn_uid; > > /* Split ties to prefer V1TImode when not optimizing for size. */ > - int gain = optimize_size ? 0 : 1; > + sreal gain = optimize_size ? 0 : 1; > > if (dump_file) > fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id); > @@ -1548,6 +1579,7 @@ timode_scalar_chain::compute_convert_gain () > HOST_WIDE_INT op1val; > int scost, vcost; > int igain = 0; > + sreal sreal_count = get_bb_probability (BLOCK_FOR_INSN (insn)); > > switch (GET_CODE (src)) > { > @@ -1754,18 +1786,21 @@ timode_scalar_chain::compute_convert_gain () > break; > } > > + sreal real_gain = sreal_count * igain; > if (igain != 0 && dump_file) > { > - fprintf (dump_file, " Instruction gain %d for ", igain); > + > + fprintf (dump_file, " Instruction gain %.2f for ", > real_gain.to_double ()); > dump_insn_slim (dump_file, insn); > } > - gain += igain; > + > + gain += real_gain; > } > > if (dump_file) > - fprintf (dump_file, " Total gain: %d\n", gain); > + fprintf (dump_file, " Total gain: %.2f\n", gain.to_double ()); > > - return gain; > + return gain > (sreal) 0; > } > > /* Fix uses of converted REG in debug insns. */ > @@ -2561,7 +2596,7 @@ convert_scalars_to_vector (bool timode_p) > conversions. */ > if (chain->build (&candidates[i], uid, disallowed)) > { > - if (chain->compute_convert_gain () > 0) > + if (chain->compute_convert_gain ()) > converted_insns += chain->convert (); > else if (dump_file) > fprintf (dump_file, "Chain #%d conversion is not profitable\n", > diff --git a/gcc/config/i386/i386-features.h b/gcc/config/i386/i386-features.h > index 24b0c4ed0cd..cc40c5c9841 100644 > --- a/gcc/config/i386/i386-features.h > +++ b/gcc/config/i386/i386-features.h > @@ -153,12 +153,12 @@ class scalar_chain > > bitmap insns_conv; > hash_map<rtx, rtx> defs_map; > - unsigned n_sse_to_integer; > - unsigned n_integer_to_sse; > + sreal n_sse_to_integer; > + sreal n_integer_to_sse; > auto_vec<rtx_insn *> control_flow_insns; > > bool build (bitmap candidates, unsigned insn_uid, bitmap disallowed); > - virtual int compute_convert_gain () = 0; > + virtual bool compute_convert_gain () = 0; > int convert (); > > protected: > @@ -184,7 +184,7 @@ class general_scalar_chain : public scalar_chain > public: > general_scalar_chain (enum machine_mode smode_, enum machine_mode vmode_) > : scalar_chain (smode_, vmode_) {} > - int compute_convert_gain () final override; > + bool compute_convert_gain () final override; > > private: > void convert_insn (rtx_insn *insn) final override; > @@ -196,7 +196,7 @@ class timode_scalar_chain : public scalar_chain > { > public: > timode_scalar_chain () : scalar_chain (TImode, V1TImode) {} > - int compute_convert_gain () final override; > + bool compute_convert_gain () final override; > > private: > void fix_debug_reg_uses (rtx reg); > diff --git a/gcc/testsuite/gcc.target/i386/minmax-6.c > b/gcc/testsuite/gcc.target/i386/minmax-6.c > index 615f919ba0a..b5b4363f931 100644 > --- a/gcc/testsuite/gcc.target/i386/minmax-6.c > +++ b/gcc/testsuite/gcc.target/i386/minmax-6.c > @@ -15,4 +15,4 @@ UMVLine16Y_11 (short unsigned int * Pic, int y, int width) > /* We do not want the RA to spill %esi for it's dual-use but using > pmaxsd is OK. */ > /* { dg-final { scan-assembler-not "rsp" { target { ! { ia32 } } } } } */ > -/* { dg-final { scan-assembler "pmaxsd" } } */ > +/* { dg-final { scan-assembler "pmaxsd" { target { ! ia32 } } } } */ > -- > 2.34.1 >