https://gcc.gnu.org/g:a645e903e8c3940f521262ff7ccb7a46e72e3604
commit r16-5275-ga645e903e8c3940f521262ff7ccb7a46e72e3604
Author: Richard Biener <[email protected]>
Date:   Thu Nov 6 11:49:31 2025 +0100

    tree-optimization/122573 - enhance SLP of invariant loads

    Currently SLP of invariant loads is only supported for the case of a
    single load that is splat, as a side effect of supporting this case
    even for non-invariant loads.  The following extends this to any set
    of invariant loads.  The way we have load permutations for these
    makes it a bit awkward, thus the adjustments in that area.

            PR tree-optimization/122573
            * tree-vect-slp.cc (vect_build_slp_tree_1): Support groups
            of invariant loads.
            (vect_build_slp_tree_2): Likewise.
            (vect_transform_slp_perm_load_1): Likewise.
            * tree-vect-stmts.cc (vectorizable_load): Handle non-splat
            SLP for invariant loads.

            * gcc.dg/vect/slp-58.c: New testcase.
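
To illustrate the distinction, here is a minimal sketch (not part of the
commit; the function names splat and group are illustrative).  In splat
every SLP lane reads the same loop-invariant scalar, the case that was
already handled; in group the four lanes read distinct loop-invariant
elements, the case this change enables:

    /* Splat: every lane of the SLP group reads the same loop-invariant
       scalar k[0].  This case was already vectorized (a single load
       broadcast to all lanes).  */
    void splat (float *restrict out, const float *k, long n)
    {
      for (long i = 0; i < n; ++i)
        {
          out[4*i+0] = k[0];
          out[4*i+1] = k[0];
          out[4*i+2] = k[0];
          out[4*i+3] = k[0];
        }
    }

    /* Group of distinct invariant loads: each lane reads a different
       element whose address does not vary with i (DR_STEP is zero).
       With this change the four loads can be hoisted to the loop
       preheader and combined into one vector.  */
    void group (float *restrict out, const float *k, long n)
    {
      for (long i = 0; i < n; ++i)
        {
          out[4*i+0] = k[0];
          out[4*i+1] = k[1];
          out[4*i+2] = k[2];
          out[4*i+3] = k[3];
        }
    }

The new testcase below exercises the group case with a 4x4 matrix
applied to a stream of pixels.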
Diff:
---
 gcc/testsuite/gcc.dg/vect/slp-58.c |  28 ++++++++++
 gcc/tree-vect-slp.cc               |  22 ++++++--
 gcc/tree-vect-stmts.cc             | 104 +++++++++++++++++++++++++++----------
 3 files changed, 124 insertions(+), 30 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/slp-58.c b/gcc/testsuite/gcc.dg/vect/slp-58.c
new file mode 100644
index 000000000000..e03cfa3f115f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-58.c
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_float } */
+
+struct S {
+  float m_col1[4];
+  float m_col2[4];
+  float m_col3[4];
+  float m_col4[4];
+};
+
+void apply(struct S *s, const float *in, float *out, long numPixels)
+{
+  for (long idx = 0; idx < numPixels; ++idx)
+    {
+      const float r = in[0];
+      const float g = in[1];
+      const float b = in[2];
+      const float a = in[3];
+      out[0] = r*s->m_col1[0] + g*s->m_col2[0] + b*s->m_col3[0] + a*s->m_col4[0];
+      out[1] = r*s->m_col1[1] + g*s->m_col2[1] + b*s->m_col3[1] + a*s->m_col4[1];
+      out[2] = r*s->m_col1[2] + g*s->m_col2[2] + b*s->m_col3[2] + a*s->m_col4[2];
+      out[3] = r*s->m_col1[3] + g*s->m_col2[3] + b*s->m_col3[3] + a*s->m_col4[3];
+      in += 4;
+      out += 4;
+    }
+}
+
+/* { dg-final { scan-tree-dump "vectorization factor = 1" "vect" { target { ! vect_load_lanes } } } } */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index c0436ac20dc1..07e22ea7ccfa 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -1539,9 +1539,13 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
                && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
                /* Not grouped loads are handled as externals for BB
                   vectorization.  For loop vectorization we can handle
-                  splats the same we handle single element interleaving.  */
+                  splats the same we handle single element interleaving.
+                  Likewise we can handle a collection of invariant refs.  */
                && (is_a <bb_vec_info> (vinfo)
-                   || stmt_info != first_stmt_info))
+                   || (stmt_info != first_stmt_info
+                       && !(integer_zerop (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)))
+                            && integer_zerop (DR_STEP (STMT_VINFO_DATA_REF
+                                                         (first_stmt_info)))))))
         {
           /* Not grouped load.  */
           if (dump_enabled_p ())
@@ -2094,7 +2098,10 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
                     load_place = vect_get_place_in_interleaving_chain
                                    (load_info, first_stmt_info);
                   else
-                    load_place = 0;
+                    /* Recognize the splat case as { 0, 0, ... } but make
+                       sure to use the appropriate refs for collections
+                       of invariant refs.  */
+                    load_place = (load_info == stmt_info) ? 0 : j;
                   gcc_assert (load_place != -1);
                   any_permute |= load_place != j;
                   load_permutation.quick_push (load_place);
@@ -10975,7 +10982,14 @@ vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
   machine_mode mode;

   if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
-    dr_group_size = 1;
+    {
+      /* We have both splats of the same non-grouped load and groups
+         of distinct invariant loads entering here.  */
+      unsigned max_idx = 0;
+      for (auto idx : perm)
+        max_idx = idx > max_idx ? idx : max_idx;
+      dr_group_size = max_idx + 1;
+    }
   else
     {
       stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index d08f5f19fd44..2bc68e25b903 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -9903,10 +9903,24 @@ vectorizable_load (vec_info *vinfo,
          once at analysis time, remembered and used in the transform time.  */
       bool hoist_p = (LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo)
-                      && !nested_in_vect_loop
-                      && hoist_defs_of_uses (stmt_info->stmt, loop, false));
+                      && !nested_in_vect_loop);
+      bool uniform_p = true;
+      for (stmt_vec_info sinfo : SLP_TREE_SCALAR_STMTS (slp_node))
+        {
+          hoist_p = hoist_p && hoist_defs_of_uses (sinfo->stmt, loop, false);
+          if (sinfo != SLP_TREE_SCALAR_STMTS (slp_node)[0])
+            uniform_p = false;
+        }
       if (costing_p)
         {
+          if (!uniform_p && (!hoist_p || !vf.is_constant ()))
+            {
+              if (dump_enabled_p ())
+                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                 "not vectorizing non-uniform invariant "
+                                 "load\n");
+              return false;
+            }
           enum vect_cost_model_location cost_loc
             = hoist_p ? vect_prologue : vect_body;
           unsigned int cost = record_stmt_cost (cost_vec, 1, scalar_load,
@@ -9924,39 +9938,77 @@ vectorizable_load (vec_info *vinfo,
         }
       if (hoist_p)
         {
-          gassign *stmt = as_a <gassign *> (stmt_info->stmt);
-          if (dump_enabled_p ())
-            dump_printf_loc (MSG_NOTE, vect_location,
-                             "hoisting out of the vectorized loop: %G",
-                             (gimple *) stmt);
-          scalar_dest = copy_ssa_name (scalar_dest);
-          tree rhs = unshare_expr (gimple_assign_rhs1 (stmt));
-          edge pe = loop_preheader_edge (loop);
-          gphi *vphi = get_virtual_phi (loop->header);
-          tree vuse;
-          if (vphi)
-            vuse = PHI_ARG_DEF_FROM_EDGE (vphi, pe);
-          else
-            vuse = gimple_vuse (gsi_stmt (*gsi));
-          gimple *new_stmt = gimple_build_assign (scalar_dest, rhs);
-          gimple_set_vuse (new_stmt, vuse);
-          gsi_insert_on_edge_immediate (pe, new_stmt);
-          hoist_defs_of_uses (new_stmt, loop, true);
+          /* ??? For non-uniform lanes there could be still duplicates.
+             We're leaving those to post-vectorizer CSE for the moment.  */
+          auto_vec<tree> scalar_defs (SLP_TREE_LANES (slp_node));
+          for (stmt_vec_info sinfo : SLP_TREE_SCALAR_STMTS (slp_node))
+            {
+              gassign *stmt = as_a <gassign *> (sinfo->stmt);
+              if (dump_enabled_p ())
+                dump_printf_loc (MSG_NOTE, vect_location,
+                                 "hoisting out of the vectorized loop: %G",
+                                 (gimple *) stmt);
+              scalar_dest = copy_ssa_name (gimple_assign_lhs (stmt));
+              tree rhs = unshare_expr (gimple_assign_rhs1 (stmt));
+              edge pe = loop_preheader_edge (loop);
+              gphi *vphi = get_virtual_phi (loop->header);
+              tree vuse;
+              if (vphi)
+                vuse = PHI_ARG_DEF_FROM_EDGE (vphi, pe);
+              else
+                vuse = gimple_vuse (gsi_stmt (*gsi));
+              gimple *new_stmt = gimple_build_assign (scalar_dest, rhs);
+              gimple_set_vuse (new_stmt, vuse);
+              gsi_insert_on_edge_immediate (pe, new_stmt);
+              hoist_defs_of_uses (new_stmt, loop, true);
+              if (!useless_type_conversion_p (TREE_TYPE (vectype),
+                                              TREE_TYPE (scalar_dest)))
+                {
+                  tree tem = make_ssa_name (TREE_TYPE (vectype));
+                  new_stmt = gimple_build_assign (tem,
+                                                  NOP_EXPR, scalar_dest);
+                  gsi_insert_on_edge_immediate (pe, new_stmt);
+                  scalar_dest = tem;
+                }
+              scalar_defs.quick_push (scalar_dest);
+              if (uniform_p)
+                break;
+            }
+          if (!uniform_p)
+            {
+              unsigned const_nunits
+                = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
+              for (j = 0; j < (int) vec_num; ++j)
+                {
+                  vec<constructor_elt, va_gc> *v = NULL;
+                  vec_safe_reserve (v, const_nunits, true);
+                  for (unsigned i = 0; i < const_nunits; ++i)
+                    {
+                      unsigned def_idx
+                        = (j * const_nunits + i) % SLP_TREE_LANES (slp_node);
+                      CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
+                                              scalar_defs[def_idx]);
+                    }
+                  scalar_dest = build_constructor (vectype, v);
+                  new_temp = vect_init_vector (vinfo, stmt_info, scalar_dest,
+                                               vectype, NULL);
+                  slp_node->push_vec_def (new_temp);
+                }
+              return true;
+            }
+          new_temp = vect_init_vector (vinfo, stmt_info, scalar_dest,
+                                       vectype, NULL);
         }
-      /* These copies are all equivalent.  */
-      if (hoist_p)
-        new_temp = vect_init_vector (vinfo, stmt_info, scalar_dest,
-                                     vectype, NULL);
       else
         {
+          gcc_assert (uniform_p);
           gimple_stmt_iterator gsi2 = *gsi;
           gsi_next (&gsi2);
           new_temp = vect_init_vector (vinfo, stmt_info, scalar_dest,
                                        vectype, &gsi2);
         }
-      gimple *new_stmt = SSA_NAME_DEF_STMT (new_temp);
       for (j = 0; j < (int) vec_num; ++j)
-        slp_node->push_vec_def (new_stmt);
+        slp_node->push_vec_def (new_temp);
       return true;
     }
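
The lane mapping used when building the vector constructors in the
non-uniform branch above can be modeled standalone.  The following is a
sketch of that index arithmetic only; the parameter values (4 lanes, an
8-element vector, one vector def) are illustrative assumptions, not
taken from the commit:

    #include <stdio.h>

    int main (void)
    {
      /* Assumed values: a 4-lane SLP node of invariant loads
         (SLP_TREE_LANES) built into one 8-element vector
         (TYPE_VECTOR_SUBPARTS).  */
      const unsigned lanes = 4;
      const unsigned const_nunits = 8;
      const unsigned vec_num = 1;

      for (unsigned j = 0; j < vec_num; ++j)
        {
          printf ("vector %u: {", j);
          for (unsigned i = 0; i < const_nunits; ++i)
            /* Same index arithmetic as the constructor-building loop:
               the hoisted scalar defs repeat when the vector is wider
               than the SLP group.  */
            printf (" s%u", (j * const_nunits + i) % lanes);
          printf (" }\n");
        }
      return 0;
    }

This prints "vector 0: { s0 s1 s2 s3 s0 s1 s2 s3 }", i.e. the four
hoisted invariant loads wrap around to fill the wider vector, which is
why a constant vectorization factor is required in the non-uniform case.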

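Similarly, the new dr_group_size computation in
vect_transform_slp_perm_load_1 can be restated as a small helper.  This
is a sketch of the logic, not the GCC code: per the load-permutation
encoding from vect_build_slp_tree_2, a splat carries { 0, 0, ... } and
thus yields group size 1, while a group of four distinct invariant loads
carries { 0, 1, 2, 3 } and yields group size 4.

    /* Group size for a non-grouped access is one more than the largest
       index appearing in the load permutation.  */
    unsigned
    group_size_from_perm (const unsigned *perm, unsigned n)
    {
      unsigned max_idx = 0;
      for (unsigned k = 0; k < n; ++k)
        max_idx = perm[k] > max_idx ? perm[k] : max_idx;
      return max_idx + 1;
    }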