On Tue, Nov 5, 2019 at 3:29 PM Richard Sandiford <richard.sandif...@arm.com> wrote: > > This patch adds a mode in which the vectoriser tries each available > base vector mode and picks the one with the lowest cost. For now > the behaviour is behind a default-off --param, but a later patch > enables it by default for SVE. > > The patch keeps the current behaviour of preferring a VF of > loop->simdlen over any larger or smaller VF, regardless of costs > or target preferences.
Can you avoid using a --param for this? Instead I'd suggest to amend the vectorize_modes target hook to return some flags like VECT_FIRST_MODE_WINS. We'd eventually want to make the target able to say do-not-vectorize-epiloges-of-MODE (I think we may not want to vectorize SSE vectorized loop epilogues with MMX-with-SSE or GPRs for example). I guess for the latter we'd use a new target hook. Otherwise looks reasonable. Richard. > > 2019-11-05 Richard Sandiford <richard.sandif...@arm.com> > > gcc/ > * params.def (vect-compare-loop-costs): New param. > * doc/invoke.texi: Document it. > * tree-vectorizer.h (_loop_vec_info::vec_outside_cost) > (_loop_vec_info::vec_inside_cost): New member variables. > * tree-vect-loop.c (_loop_vec_info::_loop_vec_info): Initialize them. > (vect_better_loop_vinfo_p, vect_joust_loop_vinfos): New functions. > (vect_analyze_loop): When the new parameter allows, try vectorizing > the loop with each available vector mode and picking the one with > the lowest cost. > (vect_estimate_min_profitable_iters): Record the computed costs > in the loop_vec_info. > > Index: gcc/params.def > =================================================================== > --- gcc/params.def 2019-10-31 17:15:25.470517368 +0000 > +++ gcc/params.def 2019-11-05 14:19:58.781197820 +0000 > @@ -661,6 +661,13 @@ DEFPARAM(PARAM_VECT_MAX_PEELING_FOR_ALIG > "Maximum number of loop peels to enhance alignment of data > references in a loop.", > -1, -1, 64) > > +DEFPARAM(PARAM_VECT_COMPARE_LOOP_COSTS, > + "vect-compare-loop-costs", > + "Whether to try vectorizing a loop using each supported" > + " combination of vector types and picking the version with the" > + " lowest cost.", > + 0, 0, 1) > + > DEFPARAM(PARAM_MAX_CSELIB_MEMORY_LOCATIONS, > "max-cselib-memory-locations", > "The maximum memory locations recorded by cselib.", > Index: gcc/doc/invoke.texi > =================================================================== > --- gcc/doc/invoke.texi 2019-11-04 21:13:57.611756365 +0000 > +++ gcc/doc/invoke.texi 2019-11-05 14:19:58.777197850 +0000 > @@ -11563,6 +11563,12 @@ doing loop versioning for alias in the v > The maximum number of loop peels to enhance access alignment > for vectorizer. Value -1 means no limit. > > +@item vect-compare-loop-costs > +Whether to try vectorizing a loop using each supported combination of > +vector types and picking the version with the lowest cost. This parameter > +has no effect when @option{-fno-vect-cost-model} or > +@option{-fvect-cost-model=unlimited} are used. > + > @item max-iterations-to-track > The maximum number of iterations of a loop the brute-force algorithm > for analysis of the number of iterations of the loop tries to evaluate. > Index: gcc/tree-vectorizer.h > =================================================================== > --- gcc/tree-vectorizer.h 2019-11-05 14:19:33.829371745 +0000 > +++ gcc/tree-vectorizer.h 2019-11-05 14:19:58.781197820 +0000 > @@ -601,6 +601,13 @@ typedef class _loop_vec_info : public ve > /* Cost of a single scalar iteration. */ > int single_scalar_iteration_cost; > > + /* The cost of the vector prologue and epilogue, including peeled > + iterations and set-up code. */ > + int vec_outside_cost; > + > + /* The cost of the vector loop body. */ > + int vec_inside_cost; > + > /* Is the loop vectorizable? */ > bool vectorizable; > > Index: gcc/tree-vect-loop.c > =================================================================== > --- gcc/tree-vect-loop.c 2019-11-05 14:19:33.829371745 +0000 > +++ gcc/tree-vect-loop.c 2019-11-05 14:19:58.781197820 +0000 > @@ -830,6 +830,8 @@ _loop_vec_info::_loop_vec_info (class lo > scan_map (NULL), > slp_unrolling_factor (1), > single_scalar_iteration_cost (0), > + vec_outside_cost (0), > + vec_inside_cost (0), > vectorizable (false), > can_fully_mask_p (true), > fully_masked_p (false), > @@ -2373,6 +2375,80 @@ vect_analyze_loop_2 (loop_vec_info loop_ > goto start_over; > } > > +/* Return true if vectorizing a loop using NEW_LOOP_VINFO appears > + to be better than vectorizing it using OLD_LOOP_VINFO. Assume that > + OLD_LOOP_VINFO is better unless something specifically indicates > + otherwise. > + > + Note that this deliberately isn't a partial order. */ > + > +static bool > +vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo, > + loop_vec_info old_loop_vinfo) > +{ > + struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo); > + gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop); > + > + poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo); > + poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo); > + > + /* Always prefer a VF of loop->simdlen over any other VF. */ > + if (loop->simdlen) > + { > + bool new_simdlen_p = known_eq (new_vf, loop->simdlen); > + bool old_simdlen_p = known_eq (old_vf, loop->simdlen); > + if (new_simdlen_p != old_simdlen_p) > + return new_simdlen_p; > + } > + > + /* Limit the VFs to what is likely to be the maximum number of iterations, > + to handle cases in which at least one loop_vinfo is fully-masked. */ > + HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop); > + if (estimated_max_niter != -1) > + { > + if (known_le (estimated_max_niter, new_vf)) > + new_vf = estimated_max_niter; > + if (known_le (estimated_max_niter, old_vf)) > + old_vf = estimated_max_niter; > + } > + > + /* Check whether the (fractional) cost per scalar iteration is lower > + or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */ > + poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost > + * poly_widest_int (old_vf)); > + poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost > + * poly_widest_int (new_vf)); > + if (maybe_lt (rel_old, rel_new)) > + return false; > + if (known_lt (rel_new, rel_old)) > + return true; > + > + /* If there's nothing to choose between the loop bodies, see whether > + there's a difference in the prologue and epilogue costs. */ > + if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost) > + return new_loop_vinfo->vec_outside_cost < > old_loop_vinfo->vec_outside_cost; > + > + return false; > +} > + > +/* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return > + true if we should. */ > + > +static bool > +vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo, > + loop_vec_info old_loop_vinfo) > +{ > + if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo)) > + return false; > + > + if (dump_enabled_p ()) > + dump_printf_loc (MSG_NOTE, vect_location, > + "***** Preferring vector mode %s to vector mode %s\n", > + GET_MODE_NAME (new_loop_vinfo->vector_mode), > + GET_MODE_NAME (old_loop_vinfo->vector_mode)); > + return true; > +} > + > /* Function vect_analyze_loop. > > Apply a set of analyses on LOOP, and create a loop_vec_info struct > @@ -2408,6 +2484,8 @@ vect_analyze_loop (class loop *loop, vec > machine_mode next_vector_mode = VOIDmode; > poly_uint64 lowest_th = 0; > unsigned vectorized_loops = 0; > + bool pick_lowest_cost_p = (PARAM_VALUE (PARAM_VECT_COMPARE_LOOP_COSTS) > + && !unlimited_cost_model (loop)); > > bool vect_epilogues = false; > opt_result res = opt_result::success (); > @@ -2428,6 +2506,34 @@ vect_analyze_loop (class loop *loop, vec > > bool fatal = false; > > + /* When pick_lowest_cost_p is true, we should in principle iterate > + over all the loop_vec_infos that LOOP_VINFO could replace and > + try to vectorize LOOP_VINFO under the same conditions. > + E.g. when trying to replace an epilogue loop, we should vectorize > + LOOP_VINFO as an epilogue loop with the same VF limit. When trying > + to replace the main loop, we should vectorize LOOP_VINFO as a main > + loop too. > + > + However, autovectorize_vector_modes is usually sorted as follows: > + > + - Modes that naturally produce lower VFs usually follow modes that > + naturally produce higher VFs. > + > + - When modes naturally produce the same VF, maskable modes > + usually follow unmaskable ones, so that the maskable mode > + can be used to vectorize the epilogue of the unmaskable mode. > + > + This order is preferred because it leads to the maximum > + epilogue vectorization opportunities. Targets should only use > + a different order if they want to make wide modes available while > + disparaging them relative to earlier, smaller modes. The assumption > + in that case is that the wider modes are more expensive in some > + way that isn't reflected directly in the costs. > + > + There should therefore be few interesting cases in which > + LOOP_VINFO fails when treated as an epilogue loop, succeeds when > + treated as a standalone loop, and ends up being genuinely cheaper > + than FIRST_LOOP_VINFO. */ > if (vect_epilogues) > LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo; > > @@ -2475,13 +2581,34 @@ vect_analyze_loop (class loop *loop, vec > LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL; > simdlen = 0; > } > + else if (pick_lowest_cost_p && first_loop_vinfo) > + { > + /* Keep trying to roll back vectorization attempts while the > + loop_vec_infos they produced were worse than this one. */ > + vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos; > + while (!vinfos.is_empty () > + && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ())) > + { > + gcc_assert (vect_epilogues); > + delete vinfos.pop (); > + } > + if (vinfos.is_empty () > + && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo)) > + { > + delete first_loop_vinfo; > + first_loop_vinfo = opt_loop_vec_info::success (NULL); > + LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL; > + } > + } > > if (first_loop_vinfo == NULL) > { > first_loop_vinfo = loop_vinfo; > lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo); > } > - else if (vect_epilogues) > + else if (vect_epilogues > + /* For now only allow one epilogue loop. */ > + && first_loop_vinfo->epilogue_vinfos.is_empty ()) > { > first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo); > poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo); > @@ -2501,12 +2628,14 @@ vect_analyze_loop (class loop *loop, vec > && loop->inner == NULL > && PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK) > && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo) > - /* For now only allow one epilogue loop. */ > - && first_loop_vinfo->epilogue_vinfos.is_empty ()); > + /* For now only allow one epilogue loop, but allow > + pick_lowest_cost_p to replace it. */ > + && (first_loop_vinfo->epilogue_vinfos.is_empty () > + || pick_lowest_cost_p)); > > /* Commit to first_loop_vinfo if we have no reason to try > alternatives. */ > - if (!simdlen && !vect_epilogues) > + if (!simdlen && !vect_epilogues && !pick_lowest_cost_p) > break; > } > else > @@ -3454,7 +3583,11 @@ vect_estimate_min_profitable_iters (loop > &vec_inside_cost, &vec_epilogue_cost); > > vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost); > - > + > + /* Stash the costs so that we can compare two loop_vec_infos. */ > + loop_vinfo->vec_inside_cost = vec_inside_cost; > + loop_vinfo->vec_outside_cost = vec_outside_cost; > + > if (dump_enabled_p ()) > { > dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");