Currently SLP of invariant loads is only supported for the case of
a single load that is splat, as a side effect of supporting this case
even for non-invariant loads.  The following extends this to any
set of invariant loads.  The way load permutations are represented
for these makes this a bit awkward, hence the adjustments in that area.
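
For illustration, a reduced sketch (made-up function, not the included
testcase) of the kind of loop this now covers: the four distinct
invariant loads from c[] become a single SLP load node instead of
requiring the splat special-case.

  void foo (float * __restrict out, const float * __restrict c, long n)
  {
    for (long i = 0; i < n; ++i)
      {
        /* Four distinct invariant loads (zero DR_STEP); before this
           change only the all-same-lane (splat) case was handled.  */
        out[4*i + 0] = c[0];
        out[4*i + 1] = c[1];
        out[4*i + 2] = c[2];
        out[4*i + 3] = c[3];
      }
  }
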
Bootstrapped and tested on x86_64-unknown-linux-gnu, will push later.
PR tree-optimization/122573
* tree-vect-slp.cc (vect_build_slp_tree_1): Support
groups of invariant loads.
(vect_build_slp_tree_2): Likewise.
(vect_transform_slp_perm_load_1): Likewise.
* tree-vect-stmts.cc (vectorizable_load): Handle non-splat
SLP for invariant loads.
* gcc.dg/vect/slp-58.c: New testcase.
---
gcc/testsuite/gcc.dg/vect/slp-58.c | 28 ++++++++
gcc/tree-vect-slp.cc | 22 ++++--
gcc/tree-vect-stmts.cc | 104 +++++++++++++++++++++--------
3 files changed, 124 insertions(+), 30 deletions(-)
create mode 100644 gcc/testsuite/gcc.dg/vect/slp-58.c
diff --git a/gcc/testsuite/gcc.dg/vect/slp-58.c b/gcc/testsuite/gcc.dg/vect/slp-58.c
new file mode 100644
index 00000000000..e03cfa3f115
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-58.c
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_float } */
+
+struct S {
+ float m_col1[4];
+ float m_col2[4];
+ float m_col3[4];
+ float m_col4[4];
+};
+
+void apply(struct S *s, const float *in, float *out, long numPixels)
+{
+ for (long idx = 0; idx < numPixels; ++idx)
+ {
+ const float r = in[0];
+ const float g = in[1];
+ const float b = in[2];
+ const float a = in[3];
+ out[0] = r*s->m_col1[0] + g*s->m_col2[0] + b*s->m_col3[0] + a*s->m_col4[0];
+ out[1] = r*s->m_col1[1] + g*s->m_col2[1] + b*s->m_col3[1] + a*s->m_col4[1];
+ out[2] = r*s->m_col1[2] + g*s->m_col2[2] + b*s->m_col3[2] + a*s->m_col4[2];
+ out[3] = r*s->m_col1[3] + g*s->m_col2[3] + b*s->m_col3[3] + a*s->m_col4[3];
+ in += 4;
+ out += 4;
+ }
+}
+
+/* { dg-final { scan-tree-dump "vectorization factor = 1" "vect" { target { ! vect_load_lanes } } } } */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index c0436ac20dc..07e22ea7ccf 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -1539,9 +1539,13 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
&& !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
/* Not grouped loads are handled as externals for BB
vectorization. For loop vectorization we can handle
- splats the same we handle single element interleaving. */
+ splats the same we handle single element interleaving.
+ Likewise we can handle a collection of invariant refs. */
&& (is_a <bb_vec_info> (vinfo)
- || stmt_info != first_stmt_info))
+ || (stmt_info != first_stmt_info
+ && !(integer_zerop (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)))
+ && integer_zerop (DR_STEP (STMT_VINFO_DATA_REF
+ (first_stmt_info)))))))
{
/* Not grouped load. */
if (dump_enabled_p ())
@@ -2094,7 +2098,10 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
load_place = vect_get_place_in_interleaving_chain
(load_info, first_stmt_info);
else
- load_place = 0;
+ /* Recognize the splat case as { 0, 0, ... } but make
+ sure to use the appropriate refs for collections
+ of invariant refs. */
+ load_place = (load_info == stmt_info) ? 0 : j;
gcc_assert (load_place != -1);
any_permute |= load_place != j;
load_permutation.quick_push (load_place);
@@ -10975,7 +10982,14 @@ vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
machine_mode mode;

if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
- dr_group_size = 1;
+ {
+ /* We have both splats of the same non-grouped load and groups
+ of distinct invariant loads entering here. */
+ unsigned max_idx = 0;
+ for (auto idx : perm)
+ max_idx = idx > max_idx ? idx : max_idx;
+ dr_group_size = max_idx + 1;
+ }
else
{
stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index d08f5f19fd4..2bc68e25b90 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -9903,10 +9903,24 @@ vectorizable_load (vec_info *vinfo,
once at analysis time, remembered and used in the
transform time. */
bool hoist_p = (LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo)
- && !nested_in_vect_loop
- && hoist_defs_of_uses (stmt_info->stmt, loop, false));
+ && !nested_in_vect_loop);
+ bool uniform_p = true;
+ for (stmt_vec_info sinfo : SLP_TREE_SCALAR_STMTS (slp_node))
+ {
+ hoist_p = hoist_p && hoist_defs_of_uses (sinfo->stmt, loop, false);
+ if (sinfo != SLP_TREE_SCALAR_STMTS (slp_node)[0])
+ uniform_p = false;
+ }
if (costing_p)
{
+ if (!uniform_p && (!hoist_p || !vf.is_constant ()))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "not vectorizing non-uniform invariant "
+ "load\n");
+ return false;
+ }
enum vect_cost_model_location cost_loc
= hoist_p ? vect_prologue : vect_body;
unsigned int cost = record_stmt_cost (cost_vec, 1, scalar_load,
@@ -9924,39 +9938,77 @@ vectorizable_load (vec_info *vinfo,
}
if (hoist_p)
{
- gassign *stmt = as_a <gassign *> (stmt_info->stmt);
- if (dump_enabled_p ())
- dump_printf_loc (MSG_NOTE, vect_location,
- "hoisting out of the vectorized loop: %G",
- (gimple *) stmt);
- scalar_dest = copy_ssa_name (scalar_dest);
- tree rhs = unshare_expr (gimple_assign_rhs1 (stmt));
- edge pe = loop_preheader_edge (loop);
- gphi *vphi = get_virtual_phi (loop->header);
- tree vuse;
- if (vphi)
- vuse = PHI_ARG_DEF_FROM_EDGE (vphi, pe);
- else
- vuse = gimple_vuse (gsi_stmt (*gsi));
- gimple *new_stmt = gimple_build_assign (scalar_dest, rhs);
- gimple_set_vuse (new_stmt, vuse);
- gsi_insert_on_edge_immediate (pe, new_stmt);
- hoist_defs_of_uses (new_stmt, loop, true);
+ /* ??? For non-uniform lanes there could be still duplicates.
+ We're leaving those to post-vectorizer CSE for the moment. */
+ auto_vec<tree> scalar_defs (SLP_TREE_LANES (slp_node));
+ for (stmt_vec_info sinfo : SLP_TREE_SCALAR_STMTS (slp_node))
+ {
+ gassign *stmt = as_a <gassign *> (sinfo->stmt);
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "hoisting out of the vectorized loop: %G",
+ (gimple *) stmt);
+ scalar_dest = copy_ssa_name (gimple_assign_lhs (stmt));
+ tree rhs = unshare_expr (gimple_assign_rhs1 (stmt));
+ edge pe = loop_preheader_edge (loop);
+ gphi *vphi = get_virtual_phi (loop->header);
+ tree vuse;
+ if (vphi)
+ vuse = PHI_ARG_DEF_FROM_EDGE (vphi, pe);
+ else
+ vuse = gimple_vuse (gsi_stmt (*gsi));
+ gimple *new_stmt = gimple_build_assign (scalar_dest, rhs);
+ gimple_set_vuse (new_stmt, vuse);
+ gsi_insert_on_edge_immediate (pe, new_stmt);
+ hoist_defs_of_uses (new_stmt, loop, true);
+ if (!useless_type_conversion_p (TREE_TYPE (vectype),
+ TREE_TYPE (scalar_dest)))
+ {
+ tree tem = make_ssa_name (TREE_TYPE (vectype));
+ new_stmt = gimple_build_assign (tem,
+ NOP_EXPR, scalar_dest);
+ gsi_insert_on_edge_immediate (pe, new_stmt);
+ scalar_dest = tem;
+ }
+ scalar_defs.quick_push (scalar_dest);
+ if (uniform_p)
+ break;
+ }
+ if (!uniform_p)
+ {
+ unsigned const_nunits
+ = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
+ for (j = 0; j < (int) vec_num; ++j)
+ {
+ vec<constructor_elt, va_gc> *v = NULL;
+ vec_safe_reserve (v, const_nunits, true);
+ for (unsigned i = 0; i < const_nunits; ++i)
+ {
+ unsigned def_idx
+ = (j * const_nunits + i) % SLP_TREE_LANES (slp_node);
+ CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
+ scalar_defs[def_idx]);
+ }
+ scalar_dest = build_constructor (vectype, v);
+ new_temp = vect_init_vector (vinfo, stmt_info, scalar_dest,
+ vectype, NULL);
+ slp_node->push_vec_def (new_temp);
+ }
+ return true;
+ }
+ new_temp = vect_init_vector (vinfo, stmt_info, scalar_dest,
+ vectype, NULL);
}
- /* These copies are all equivalent. */
- if (hoist_p)
- new_temp = vect_init_vector (vinfo, stmt_info, scalar_dest,
- vectype, NULL);
else
{
+ gcc_assert (uniform_p);
gimple_stmt_iterator gsi2 = *gsi;
gsi_next (&gsi2);
new_temp = vect_init_vector (vinfo, stmt_info, scalar_dest,
vectype, &gsi2);
}
- gimple *new_stmt = SSA_NAME_DEF_STMT (new_temp);
for (j = 0; j < (int) vec_num; ++j)
- slp_node->push_vec_def (new_stmt);
+ slp_node->push_vec_def (new_temp);
return true;
}

--
2.51.0