Update in V3 > > > + cost_sse_integer = 0; > > > + weighted_cost_sse_integer = 0 ; > Extra space here. Changed.
> > > + : ix86_size_cost.sse_to_integer; > > Please be sure to not revert the changes from my patch adding > COSTS_N_INSNS (...) / 2 > here and some other places. Yes, keep the original cost, and weights it with bb_freq. > > I think with your change the size metrics will get less accurate. Old > code had hard-wired sizes which depends on ISA we use: > > else if (TARGET_64BIT || smode == SImode) > { > cost += n_sse_to_integer * COSTS_N_BYTES (4); > cost += n_integer_to_sse * COSTS_N_BYTES (4); > } > else if (TARGET_SSE4_1) > { > /* vmovd (4 bytes) + vpextrd (6 bytes). */ > cost += n_sse_to_integer * COSTS_N_BYTES (10); > /* vmovd (4 bytes) + vpinsrd (6 bytes). */ > cost += n_integer_to_sse * COSTS_N_BYTES (10); > } > else > { > /* movd (4 bytes) + psrlq (5 bytes) + movd (4 bytes). */ > cost += n_sse_to_integer * COSTS_N_BYTES (13); > /* movd (4 bytes) + movd (4 bytes) + unpckldq (4 bytes). */ > cost += n_integer_to_sse * COSTS_N_BYTES (12); > } > > While you now use common size_costs->sse_to_integer which will not be > able to distinguish these. > > Perhaps we want to have a function returning correct sizes depending on > ISA and use it here? (In future it may be useful to get correct size > costs for vectorizer too.) Also keep the hard-wired sizes. > > > } > > > > > > + if (speed_p) > > > + weighted_cost_sse_integer += bb_freq * cost; > > You use bb_freq just once, so it may be easier to just call the > function. Changed > > > - if (optimize_insn_for_size_p ()) > > > + if (!speed_p) > > If the size tables are right, we should be able to eliminate some of > these tests since same computation as for speed_p should work for size. > However sizes are somewhat approximate, so this needs to be done > carefully, so it is probably better done incremeentally. Yes, still use ix86_cost, and just refactor optimize_insn_for_size_p () with !speed_p. > Patch is OK with these changes. Thansk for review. Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. Ok for trunk? 
In some benchmark, I notice stv failed due to cost unprofitable, but the igain is inside the loop, but sse<->integer conversion is outside the loop, current cost model doesn't consider the frequency of those gain/cost. The patch weights those cost with frequency. gcc/ChangeLog: * config/i386/i386-features.cc (scalar_chain::mark_dual_mode_def): Weight cost of integer<->sse move with bb frequency when it's optimized_for_speed_p. (general_scalar_chain::compute_convert_gain): Ditto, and adjust function prototype to return true/false when cost model is profitable or not. (timode_scalar_chain::compute_convert_gain): Ditto. (convert_scalars_to_vector): Adjust after the upper two function prototype are changed. * config/i386/i386-features.h (class scalar_chain): Change n_integer_to_sse/n_sse_to_integer to cost_sse_integer, and add weighted_cost_sse_integer. (class general_scalar_chain): Adjust prototype to return bool instead of int. (class timode_scalar_chain): Ditto. --- gcc/config/i386/i386-features.cc | 177 +++++++++++++++++-------- gcc/config/i386/i386-features.h | 11 +- 2 files changed, 105 insertions(+), 83 deletions(-) diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc index cc8313bd292..6491c6b063f 100644 --- a/gcc/config/i386/i386-features.cc +++ b/gcc/config/i386/i386-features.cc @@ -296,9 +296,8 @@ scalar_chain::scalar_chain (enum machine_mode smode_, enum machine_mode vmode_) insns_conv = BITMAP_ALLOC (NULL); queue = NULL; - n_sse_to_integer = 0; - n_integer_to_sse = 0; - + cost_sse_integer = 0; + weighted_cost_sse_integer = 0; max_visits = x86_stv_max_visits; } @@ -337,20 +336,52 @@ scalar_chain::mark_dual_mode_def (df_ref def) /* Record the def/insn pair so we can later efficiently iterate over the defs to convert on insns not in the chain.
*/ bool reg_new = bitmap_set_bit (defs_conv, DF_REF_REGNO (def)); + basic_block bb = BLOCK_FOR_INSN (DF_REF_INSN (def)); + profile_count entry_count = ENTRY_BLOCK_PTR_FOR_FN (cfun)->count; + bool speed_p = optimize_bb_for_speed_p (bb); + int cost = 0; + if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def))) { if (!bitmap_set_bit (insns_conv, DF_REF_INSN_UID (def)) && !reg_new) return; - n_integer_to_sse++; + + /* Cost integer to sse moves. */ + if (speed_p) + cost = COSTS_N_INSNS (ix86_cost->integer_to_sse) / 2; + else if (TARGET_64BIT || smode == SImode) + cost = COSTS_N_BYTES (4); + /* vmovd (4 bytes) + vpinsrd (6 bytes). */ + else if (TARGET_SSE4_1) + cost = COSTS_N_BYTES (10); + /* movd (4 bytes) + movd (4 bytes) + unpckldq (4 bytes). */ + else + cost = COSTS_N_BYTES (12); } else { if (!reg_new) return; - n_sse_to_integer++; + + /* Cost sse to integer moves. */ + if (speed_p) + cost = COSTS_N_INSNS (ix86_cost->sse_to_integer) / 2; + else if (TARGET_64BIT || smode == SImode) + cost = COSTS_N_BYTES (4); + /* vmovd (4 bytes) + vpextrd (6 bytes). */ + else if (TARGET_SSE4_1) + cost = COSTS_N_BYTES (10); + /* movd (4 bytes) + psrlq (5 bytes) + movd (4 bytes). */ + else + cost = COSTS_N_BYTES (13); } + if (speed_p) + weighted_cost_sse_integer += bb->count.to_sreal_scale (entry_count) * cost; + + cost_sse_integer += cost; + if (dump_file) fprintf (dump_file, " Mark r%d def in insn %d as requiring both modes in chain #%d\n", @@ -531,15 +562,15 @@ general_scalar_chain::vector_const_cost (rtx exp, basic_block bb) return COSTS_N_INSNS (ix86_cost->sse_load[smode == DImode ? 1 : 0]) / 2; } -/* Compute a gain for chain conversion. */ +/* Return true if it's cost profitable for chain conversion. 
*/ -int +bool general_scalar_chain::compute_convert_gain () { bitmap_iterator bi; unsigned insn_uid; int gain = 0; - int cost = 0; + sreal weighted_gain = 0; if (dump_file) fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id); @@ -559,10 +590,13 @@ general_scalar_chain::compute_convert_gain () rtx dst = SET_DEST (def_set); basic_block bb = BLOCK_FOR_INSN (insn); int igain = 0; + profile_count entry_count = ENTRY_BLOCK_PTR_FOR_FN (cfun)->count; + bool speed_p = optimize_bb_for_speed_p (bb); + sreal bb_freq = bb->count.to_sreal_scale (entry_count); if (REG_P (src) && REG_P (dst)) { - if (optimize_bb_for_size_p (bb)) + if (!speed_p) /* reg-reg move is 2 bytes, while SSE 3. */ igain += COSTS_N_BYTES (2 * m - 3); else @@ -571,7 +605,7 @@ general_scalar_chain::compute_convert_gain () } else if (REG_P (src) && MEM_P (dst)) { - if (optimize_bb_for_size_p (bb)) + if (!speed_p) /* Integer load/store is 3+ bytes and SSE 4+. */ igain += COSTS_N_BYTES (3 * m - 4); else @@ -581,7 +615,7 @@ general_scalar_chain::compute_convert_gain () } else if (MEM_P (src) && REG_P (dst)) { - if (optimize_bb_for_size_p (bb)) + if (!speed_p) igain += COSTS_N_BYTES (3 * m - 4); else igain += COSTS_N_INSNS (m * ix86_cost->int_load[2] @@ -593,7 +627,7 @@ general_scalar_chain::compute_convert_gain () of explicit load and store instructions. */ if (MEM_P (dst)) { - if (optimize_bb_for_size_p (bb)) + if (!speed_p) /* ??? This probably should account size difference of SSE and integer load rather than full SSE load. */ igain -= COSTS_N_BYTES (8); @@ -667,7 +701,7 @@ general_scalar_chain::compute_convert_gain () igain -= vector_const_cost (XEXP (src, 1), bb); if (MEM_P (XEXP (src, 1))) { - if (optimize_bb_for_size_p (bb)) + if (!speed_p) igain -= COSTS_N_BYTES (m == 2 ? 3 : 5); else igain += COSTS_N_INSNS @@ -730,7 +764,7 @@ general_scalar_chain::compute_convert_gain () case CONST_INT: if (REG_P (dst)) { - if (optimize_bb_for_size_p (bb)) + if (!speed_p) { /* xor (2 bytes) vs. 
xorps (3 bytes). */ if (src == const0_rtx) igain += COSTS_N_BYTES (1); @@ -769,14 +803,14 @@ general_scalar_chain::compute_convert_gain () if (XVECEXP (XEXP (src, 1), 0, 0) == const0_rtx) { // movd (4 bytes) replaced with movdqa (4 bytes). - if (!optimize_bb_for_size_p (bb)) + if (speed_p) igain += COSTS_N_INSNS (ix86_cost->sse_to_integer - ix86_cost->xmm_move) / 2; } else { // pshufd; movd replaced with pshufd. - if (optimize_bb_for_size_p (bb)) + if (!speed_p) igain += COSTS_N_BYTES (4); else igain += ix86_cost->sse_to_integer; @@ -788,55 +822,34 @@ } } + if (speed_p) + weighted_gain += bb_freq * igain; + gain += igain; + if (igain != 0 && dump_file) { - fprintf (dump_file, " Instruction gain %d for ", igain); + fprintf (dump_file, " Instruction gain %d with bb_freq %.2f for ", + igain, bb_freq.to_double ()); dump_insn_slim (dump_file, insn); } - gain += igain; } if (dump_file) - fprintf (dump_file, " Instruction conversion gain: %d\n", gain); - - /* Cost the integer to sse and sse to integer moves. */ - if (!optimize_function_for_size_p (cfun)) { - cost += n_sse_to_integer * COSTS_N_INSNS (ix86_cost->sse_to_integer) / 2; - /* ??? integer_to_sse but we only have that in the RA cost table. - Assume sse_to_integer/integer_to_sse are the same which they - are at the moment. */ - cost += n_integer_to_sse * COSTS_N_INSNS (ix86_cost->integer_to_sse) / 2; + fprintf (dump_file, " Instruction conversion gain: %d\n", + gain); + fprintf (dump_file, " Registers conversion cost: %d\n", + cost_sse_integer); + fprintf (dump_file, " Weighted instruction conversion gain: %.2f\n", + weighted_gain.to_double ()); + fprintf (dump_file, " Weighted registers conversion cost: %.2f\n", + weighted_cost_sse_integer.to_double ()); } - else if (TARGET_64BIT || smode == SImode) - { - cost += n_sse_to_integer * COSTS_N_BYTES (4); - cost += n_integer_to_sse * COSTS_N_BYTES (4); - } - else if (TARGET_SSE4_1) - { - /* vmovd (4 bytes) + vpextrd (6 bytes).
*/ - cost += n_sse_to_integer * COSTS_N_BYTES (10); - /* vmovd (4 bytes) + vpinsrd (6 bytes). */ - cost += n_integer_to_sse * COSTS_N_BYTES (10); - } - else - { - /* movd (4 bytes) + psrlq (5 bytes) + movd (4 bytes). */ - cost += n_sse_to_integer * COSTS_N_BYTES (13); - /* movd (4 bytes) + movd (4 bytes) + unpckldq (4 bytes). */ - cost += n_integer_to_sse * COSTS_N_BYTES (12); - } - - if (dump_file) - fprintf (dump_file, " Registers conversion cost: %d\n", cost); - - gain -= cost; - if (dump_file) - fprintf (dump_file, " Total gain: %d\n", gain); - - return gain; + if (weighted_gain != weighted_cost_sse_integer) + return weighted_gain > weighted_cost_sse_integer; + else + return gain > cost_sse_integer; } /* Insert generated conversion instruction sequence INSNS @@ -1553,21 +1566,22 @@ timode_immed_const_gain (rtx cst, basic_block bb) return 0; } -/* Compute a gain for chain conversion. */ +/* Return true if it's cost profitable for chain conversion. */ -int +bool timode_scalar_chain::compute_convert_gain () { /* Assume that if we have to move TImode values between units, then transforming this chain isn't worth it. */ - if (n_sse_to_integer || n_integer_to_sse) - return -1; + if (cost_sse_integer) + return false; bitmap_iterator bi; unsigned insn_uid; /* Split ties to prefer V1TImode when not optimizing for size. */ int gain = optimize_size ? 0 : 1; + sreal weighted_gain = 0; if (dump_file) fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id); @@ -1582,32 +1596,33 @@ timode_scalar_chain::compute_convert_gain () basic_block bb = BLOCK_FOR_INSN (insn); int scost, vcost; int igain = 0; + profile_count entry_count = ENTRY_BLOCK_PTR_FOR_FN (cfun)->count; + bool speed_p = optimize_bb_for_speed_p (bb); + sreal bb_freq = bb->count.to_sreal_scale (entry_count); switch (GET_CODE (src)) { case REG: - if (optimize_bb_for_size_p (bb)) + if (!speed_p) igain = MEM_P (dst) ?
COSTS_N_BYTES (6) : COSTS_N_BYTES (3); else igain = COSTS_N_INSNS (1); break; case MEM: - igain = optimize_bb_for_size_p (bb) ? COSTS_N_BYTES (7) - : COSTS_N_INSNS (1); + igain = !speed_p ? COSTS_N_BYTES (7) : COSTS_N_INSNS (1); break; case CONST_INT: if (MEM_P (dst) && standard_sse_constant_p (src, V1TImode)) - igain = optimize_bb_for_size_p (bb) ? COSTS_N_BYTES (11) : 1; + igain = !speed_p ? COSTS_N_BYTES (11) : 1; break; case CONST_WIDE_INT: /* 2 x mov vs. vmovdqa. */ if (MEM_P (dst)) - igain = optimize_bb_for_size_p (bb) ? COSTS_N_BYTES (3) - : COSTS_N_INSNS (1); + igain = !speed_p ? COSTS_N_BYTES (3) : COSTS_N_INSNS (1); break; case NOT: @@ -1628,7 +1643,7 @@ timode_scalar_chain::compute_convert_gain () case LSHIFTRT: /* See ix86_expand_v1ti_shift. */ op1val = INTVAL (XEXP (src, 1)); - if (optimize_bb_for_size_p (bb)) + if (!speed_p) { if (op1val == 64 || op1val == 65) scost = COSTS_N_BYTES (5); @@ -1662,7 +1677,7 @@ timode_scalar_chain::compute_convert_gain () case ASHIFTRT: /* See ix86_expand_v1ti_ashiftrt. */ op1val = INTVAL (XEXP (src, 1)); - if (optimize_bb_for_size_p (bb)) + if (!speed_p) { if (op1val == 64 || op1val == 127) scost = COSTS_N_BYTES (7); @@ -1740,7 +1755,7 @@ timode_scalar_chain::compute_convert_gain () case ROTATERT: /* See ix86_expand_v1ti_rotate. */ op1val = INTVAL (XEXP (src, 1)); - if (optimize_bb_for_size_p (bb)) + if (!speed_p) { scost = COSTS_N_BYTES (13); if ((op1val & 31) == 0) @@ -1772,34 +1787,40 @@ timode_scalar_chain::compute_convert_gain () { if (GET_CODE (XEXP (src, 0)) == AND) /* and;and;or (9 bytes) vs. ptest (5 bytes). */ - igain = optimize_bb_for_size_p (bb) ? COSTS_N_BYTES (4) - : COSTS_N_INSNS (2); + igain = !speed_p ? COSTS_N_BYTES (4) : COSTS_N_INSNS (2); /* or (3 bytes) vs. ptest (5 bytes). */ - else if (optimize_bb_for_size_p (bb)) + else if (!speed_p) igain = -COSTS_N_BYTES (2); } else if (XEXP (src, 1) == const1_rtx) /* and;cmp -1 (7 bytes) vs. pcmpeqd;pxor;ptest (13 bytes). 
*/ - igain = optimize_bb_for_size_p (bb) ? -COSTS_N_BYTES (6) - : -COSTS_N_INSNS (1); + igain = !speed_p ? -COSTS_N_BYTES (6) : -COSTS_N_INSNS (1); break; default: break; } + gain += igain; + if (speed_p) + weighted_gain += bb_freq * igain; + if (igain != 0 && dump_file) { - fprintf (dump_file, " Instruction gain %d for ", igain); + fprintf (dump_file, " Instruction gain %d with bb_freq %.2f for ", + igain, bb_freq.to_double ()); dump_insn_slim (dump_file, insn); } - gain += igain; } if (dump_file) - fprintf (dump_file, " Total gain: %d\n", gain); + fprintf (dump_file, " Total gain: %d, weighted gain %.2f\n", + gain, weighted_gain.to_double ()); - return gain; + if (weighted_gain > (sreal) 0) + return true; + else + return gain > 0; } /* Fix uses of converted REG in debug insns. */ @@ -2595,7 +2616,7 @@ convert_scalars_to_vector (bool timode_p) conversions. */ if (chain->build (&candidates[i], uid, disallowed)) { - if (chain->compute_convert_gain () > 0) + if (chain->compute_convert_gain ()) converted_insns += chain->convert (); else if (dump_file) fprintf (dump_file, "Chain #%d conversion is not profitable\n", diff --git a/gcc/config/i386/i386-features.h b/gcc/config/i386/i386-features.h index 7f7c0f78c96..e3719b31529 100644 --- a/gcc/config/i386/i386-features.h +++ b/gcc/config/i386/i386-features.h @@ -153,12 +153,13 @@ class scalar_chain bitmap insns_conv; hash_map<rtx, rtx> defs_map; - unsigned n_sse_to_integer; - unsigned n_integer_to_sse; + /* Cost of inserted conversion between integer and sse.
*/ + int cost_sse_integer; + sreal weighted_cost_sse_integer; auto_vec<rtx_insn *> control_flow_insns; bool build (bitmap candidates, unsigned insn_uid, bitmap disallowed); - virtual int compute_convert_gain () = 0; + virtual bool compute_convert_gain () = 0; int convert (); protected: @@ -184,7 +185,7 @@ class general_scalar_chain : public scalar_chain public: general_scalar_chain (enum machine_mode smode_, enum machine_mode vmode_) : scalar_chain (smode_, vmode_) {} - int compute_convert_gain () final override; + bool compute_convert_gain () final override; private: void convert_insn (rtx_insn *insn) final override; @@ -196,7 +197,7 @@ class timode_scalar_chain : public scalar_chain { public: timode_scalar_chain () : scalar_chain (TImode, V1TImode) {} - int compute_convert_gain () final override; + bool compute_convert_gain () final override; private: void fix_debug_reg_uses (rtx reg); -- 2.34.1