Hi,
Rebased on top of the epilogue mode patch.
OK for trunk?
gcc/ChangeLog:
* tree-vect-loop.c (vect_estimate_min_profitable_iters): Pass
new argument
suggested_unroll_factor.
(vect_analyze_loop_costing): Likewise.
(_loop_vec_info::_loop_vec_info): Initialize new member
suggested_unroll_factor.
(vect_determine_partial_vectors_and_peeling): Make epilogue of
unrolled
main loop use partial vectors.
(vect_analyze_loop_2): Pass and use new argument
suggested_unroll_factor.
(vect_analyze_loop_1): Likewise.
(vect_analyze_loop): Change to intialize local
suggested_unroll_factor and use it.
(vectorizable_reduction): Don't use single_defuse_cycle when
unrolling.
* tree-vectorizer.h (_loop_vec_info::_loop_vec_info): Add new
member suggested_unroll_factor.
(vector_costs::vector_costs): Add new member
m_suggested_unroll_factor.
(vector_costs::suggested_unroll_factor): New getter function.
(finish_cost): Set return argument suggested_unroll_factor.
Regards,
Andre
On 07/12/2021 11:27, Andre Vieira (lists) via Gcc-patches wrote:
Hi,
I've split this particular part off, since it's not only relevant to
unrolling. The new test shows how this is useful for existing
(non-unrolling) cases. I also had to fix the costing function, the
main_vf / epilogue_vf calculations for old and new didn't take into
consideration that the main_vf could be lower, nor did it take into
consideration that they were not necessarily always a multiple of each
other. So using CEIL here is the correct approach.
Bootstrapped and regression tested on aarch64-none-linux-gnu.
OK for trunk?
gcc/ChangeLog:
* tree-vect-loop.c (vect_better_loop_vinfo_p): Round factors
up for epilogue costing.
(vect_analyze_loop): Re-analyze all modes for epilogues.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/masked_epilogue.c: New test.
On 30/11/2021 13:56, Richard Biener wrote:
On Tue, 30 Nov 2021, Andre Vieira (lists) wrote:
On 25/11/2021 12:46, Richard Biener wrote:
Oops, my fault, yes, it does. I would suggest to refactor things so
that the mode_i = first_loop_i case is there only once. I also wonder
if all the argument about starting at 0 doesn't apply to the
not unrolled LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P as well? So
what's the reason to differ here? So in the end I'd just change
the existing
if (LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (first_loop_vinfo))
{
to
if (LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (first_loop_vinfo)
|| first_loop_vinfo->suggested_unroll_factor > 1)
{
and maybe revisit this when we have an actual testcase showing that
doing sth else has a positive effect?
Thanks,
Richard.
So I had a quick chat with Richard Sandiford and he is suggesting
resetting
mode_i to 0 for all cases.
He pointed out that for some tunings the SVE mode might come after
the NEON
mode, which means that even for not-unrolled loop_vinfos we could
end up with
a suboptimal choice of mode for the epilogue. I.e. it could be that
we pick
V16QI for main vectorization, but that's VNx16QI + 1 in the array,
so we'd not
try VNx16QI for the epilogue.
This would simplify the mode selecting cases, by just simply
restarting at
mode_i in all epilogue cases. Is that something you'd be OK?
Works for me with an updated comment. Even better with showing a
testcase exercising such tuning.
Richard.
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index
17b090170d4a5dad22097a727bc25a63e230e278..29cf14c83ac02e2f1372e56f7f96427dafcd4d11
100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -153,7 +153,8 @@ along with GCC; see the file COPYING3. If not see
http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/
-static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
+static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
+ unsigned *);
static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
bool *, bool *);
@@ -828,6 +829,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in,
vec_info_shared *shared)
skip_main_loop_edge (nullptr),
skip_this_loop_edge (nullptr),
reusable_accumulators (),
+ suggested_unroll_factor (1),
max_vectorization_factor (0),
mask_skip_niters (NULL_TREE),
rgroup_compare_type (NULL_TREE),
@@ -1811,7 +1813,8 @@ vect_known_niters_smaller_than_vf (loop_vec_info
loop_vinfo)
definitely no, or -1 if it's worth retrying. */
static int
-vect_analyze_loop_costing (loop_vec_info loop_vinfo)
+vect_analyze_loop_costing (loop_vec_info loop_vinfo,
+ unsigned *suggested_unroll_factor)
{
class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
@@ -1845,7 +1848,8 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo)
int min_profitable_iters, min_profitable_estimate;
vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
- &min_profitable_estimate);
+ &min_profitable_estimate,
+ suggested_unroll_factor);
if (min_profitable_iters < 0)
{
@@ -2129,10 +2133,16 @@ vect_determine_partial_vectors_and_peeling
(loop_vec_info loop_vinfo,
vectors to the epilogue, with the main loop continuing to operate
on full vectors.
+ If we are unrolling we also do not want to use partial vectors. This
+ is to avoid the overhead of generating multiple masks and also to
+ avoid having to execute entire iterations of FALSE masked instructions
+ when dealing with one or less full iterations.
+
??? We could then end up failing to use partial vectors if we
decide to peel iterations into a prologue, and if the main loop
then ends up processing fewer than VF iterations. */
- if (param_vect_partial_vector_usage == 1
+ if ((param_vect_partial_vector_usage == 1
+ || loop_vinfo->suggested_unroll_factor > 1)
&& !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
&& !vect_known_niters_smaller_than_vf (loop_vinfo))
LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
@@ -2199,7 +2209,8 @@ vect_determine_partial_vectors_and_peeling (loop_vec_info
loop_vinfo,
for it. The different analyses will record information in the
loop_vec_info struct. */
static opt_result
-vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
+vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
+ unsigned *suggested_unroll_factor)
{
opt_result ok = opt_result::success ();
int res;
@@ -2359,6 +2370,12 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool
&fatal)
set of rgroups. */
gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
+ /* Apply the suggested unrolling factor, this was determined by the backend
+ during finish_cost the first time we ran the analyzis for this
+ vector mode. */
+ if (loop_vinfo->suggested_unroll_factor > 1)
+ LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
+
/* This is the point where we can re-start analysis with SLP forced off. */
start_over:
@@ -2550,7 +2567,7 @@ start_over:
return ok;
/* Check the costings of the loop make vectorizing worthwhile. */
- res = vect_analyze_loop_costing (loop_vinfo);
+ res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
if (res < 0)
{
ok = opt_result::failure_at (vect_location,
@@ -2953,7 +2970,7 @@ vect_analyze_loop_1 (class loop *loop, vec_info_shared
*shared,
loop_vec_info main_loop_vinfo,
const vector_modes &vector_modes, unsigned &mode_i,
machine_mode &autodetected_vector_mode,
- bool &fatal)
+ bool &fatal, unsigned int *suggested_unroll_factor = NULL)
{
loop_vec_info loop_vinfo
= vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
@@ -2961,8 +2978,18 @@ vect_analyze_loop_1 (class loop *loop, vec_info_shared
*shared,
machine_mode vector_mode = vector_modes[mode_i];
loop_vinfo->vector_mode = vector_mode;
+ /* Don't ask for a suggested unroll factor for an already unrolled loop
+ vinfo and reset the value for the next analysis. */
+ if (suggested_unroll_factor && *suggested_unroll_factor > 1)
+ {
+ loop_vinfo->suggested_unroll_factor = *suggested_unroll_factor;
+ *suggested_unroll_factor = 1;
+ suggested_unroll_factor = NULL;
+ }
+
/* Run the main analysis. */
- opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal);
+ opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
+ suggested_unroll_factor);
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"***** Analysis %s with vector mode %s\n",
@@ -3072,6 +3099,7 @@ vect_analyze_loop (class loop *loop, vec_info_shared
*shared)
opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
unsigned int mode_i = 0;
unsigned HOST_WIDE_INT simdlen = loop->simdlen;
+ unsigned int suggested_unroll_factor = 1;
/* First determine the main loop vectorization mode, either the first
one that works, starting with auto-detecting the vector mode and then
@@ -3083,7 +3111,8 @@ vect_analyze_loop (class loop *loop, vec_info_shared
*shared)
opt_loop_vec_info loop_vinfo
= vect_analyze_loop_1 (loop, shared, &loop_form_info,
NULL, vector_modes, mode_i,
- autodetected_vector_mode, fatal);
+ autodetected_vector_mode, fatal,
+ &suggested_unroll_factor);
if (fatal)
break;
@@ -3107,7 +3136,18 @@ vect_analyze_loop (class loop *loop, vec_info_shared
*shared)
first_loop_vinfo = opt_loop_vec_info::success (NULL);
}
if (first_loop_vinfo == NULL)
- first_loop_vinfo = loop_vinfo;
+ {
+ first_loop_vinfo = loop_vinfo;
+ if (suggested_unroll_factor > 1)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "***** Re-trying analysis for unrolling"
+ " with unroll factor %d.\n",
+ suggested_unroll_factor);
+ continue;
+ }
+ }
else
{
delete loop_vinfo;
@@ -3169,6 +3209,7 @@ vect_analyze_loop (class loop *loop, vec_info_shared
*shared)
/* ??? If first_loop_vinfo was using VOIDmode then we probably
want to instead search for the corresponding mode in vector_modes[]. */
+ poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
while (1)
{
bool fatal;
@@ -3180,6 +3221,22 @@ vect_analyze_loop (class loop *loop, vec_info_shared
*shared)
if (fatal)
break;
+ if (loop_vinfo
+ && known_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), first_vinfo_vf)
+ && !(LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
+ && maybe_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
+ first_vinfo_vf)))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "***** Will not use %s mode as an"
+ " epilogue, since it leads to an higher"
+ " vectorization factor than main loop\n",
+ GET_MODE_NAME (loop_vinfo->vector_mode));
+ delete loop_vinfo;
+ loop_vinfo = opt_loop_vec_info::success (NULL);
+ }
+
if (loop_vinfo)
{
if (pick_lowest_cost_p)
@@ -3892,7 +3949,8 @@ vect_get_known_peeling_cost (loop_vec_info loop_vinfo,
int peel_iters_prologue,
static void
vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
int *ret_min_profitable_niters,
- int *ret_min_profitable_estimate)
+ int *ret_min_profitable_estimate,
+ unsigned *suggested_unroll_factor)
{
int min_profitable_iters;
int min_profitable_estimate;
@@ -4252,8 +4310,23 @@ vect_estimate_min_profitable_iters (loop_vec_info
loop_vinfo,
}
/* Complete the target-specific cost calculations. */
- finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
- &vec_inside_cost, &vec_epilogue_cost);
+ finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo),
+ &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
+ suggested_unroll_factor);
+
+ if (suggested_unroll_factor && *suggested_unroll_factor > 1
+ && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
+ && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
+ *suggested_unroll_factor,
+ LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "can't unroll as unrolled vectorization factor larger"
+ " than maximum vectorization factor: %d\n",
+ LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
+ *suggested_unroll_factor = 1;
+ }
vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
@@ -7242,7 +7315,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
participating. */
if (ncopies > 1
&& (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
- && reduc_chain_length == 1)
+ && reduc_chain_length == 1
+ && loop_vinfo->suggested_unroll_factor == 1)
single_defuse_cycle = true;
if (single_defuse_cycle || lane_reduc_code_p)
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index
b552e9dccce5bce6a3bbcf5d531e7ccefa719b9a..238e0b4bf21871c518da73a79320f34e55c9201c
100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -624,6 +624,13 @@ public:
about the reductions that generated them. */
hash_map<tree, vect_reusable_accumulator> reusable_accumulators;
+ /* The number of times that the target suggested we unroll the vector loop
+ in order to promote more ILP. This value will be used to re-analyze the
+ loop for vectorization and if successful the value will be folded into
+ vectorization_factor (and therefore exactly divides
+ vectorization_factor). */
+ unsigned int suggested_unroll_factor;
+
/* Maximum runtime vectorization factor, or MAX_VECTORIZATION_FACTOR
if there is no particular limit. */
unsigned HOST_WIDE_INT max_vectorization_factor;
@@ -1430,6 +1437,7 @@ public:
unsigned int prologue_cost () const;
unsigned int body_cost () const;
unsigned int epilogue_cost () const;
+ unsigned int suggested_unroll_factor () const;
protected:
unsigned int record_stmt_cost (stmt_vec_info, vect_cost_model_location,
@@ -1447,6 +1455,9 @@ protected:
/* The costs of the three regions, indexed by vect_cost_model_location. */
unsigned int m_costs[3];
+ /* The suggested unrolling factor determined at finish_cost. */
+ unsigned int m_suggested_unroll_factor;
+
/* True if finish_cost has been called. */
bool m_finished;
};
@@ -1459,6 +1470,7 @@ vector_costs::vector_costs (vec_info *vinfo, bool
costing_for_scalar)
: m_vinfo (vinfo),
m_costing_for_scalar (costing_for_scalar),
m_costs (),
+ m_suggested_unroll_factor(1),
m_finished (false)
{
}
@@ -1490,6 +1502,15 @@ vector_costs::epilogue_cost () const
return m_costs[vect_epilogue];
}
+/* Return the suggested unroll factor. */
+
+inline unsigned int
+vector_costs::suggested_unroll_factor () const
+{
+ gcc_checking_assert (m_finished);
+ return m_suggested_unroll_factor;
+}
+
#define VECT_MAX_COST 1000
/* The maximum number of intermediate steps required in multi-step type
@@ -1665,12 +1686,15 @@ add_stmt_cost (vector_costs *costs, stmt_info_for_cost
*i)
static inline void
finish_cost (vector_costs *costs, unsigned *prologue_cost,
- unsigned *body_cost, unsigned *epilogue_cost)
+ unsigned *body_cost, unsigned *epilogue_cost,
+ unsigned *suggested_unroll_factor = NULL)
{
costs->finish_cost ();
*prologue_cost = costs->prologue_cost ();
*body_cost = costs->body_cost ();
*epilogue_cost = costs->epilogue_cost ();
+ if (suggested_unroll_factor)
+ *suggested_unroll_factor = costs->suggested_unroll_factor ();
}
inline void