On Thu, Sep 18, 2025 at 1:21 PM Robin Dapp <[email protected]> wrote:
>
> Hi,
>
> This patch adds vect_analyze_slp_perm_load, an explicit variant of
> vect_transform_slp_perm_load that performs only the analysis part.
>
> I find it slightly clearer to indicate "analysis" in the
> function name already rather than having to pass "analyze_only = true"
> and set two other params to NULL.
>
> One call of vect_analyze_slp_perm_load is equivalent to
>
> return vect_transform_slp_perm_load_1 (vinfo, node,
> SLP_TREE_LOAD_PERMUTATION (node),
> vNULL, nullptr, vf, true,
> dump_enabled_p (), n_perms,
> punning_vectype);
>
> The patch also introduces an override for the vectype and moves the
> load-perm analysis from before get_load_store_type into the function.
>
> This is mostly a small refactoring in preparation for the
> "grouped gather" patch.  Should we have more guardrails on the overriding
> vectype, e.g. asserting that it has the same size?
For the permutation to still reflect reality, the overriding vectype
would need to have the same element size as the original vectype,
wouldn't it?  So I doubt that overriding the vectype is what we want.
>
> Bootstrapped and regtested on x86 and power10. Regtested on rv64gcv_zvl512b,
> still running on aarch64.
>
> Regards
> Robin
>
> gcc/ChangeLog:
>
> * tree-vect-slp.cc (vect_transform_slp_perm_load_1): Add
> type-punning argument.
> (vect_transform_slp_perm_load): Ditto.
> (vect_analyze_slp_perm_load): New function.
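> * tree-vect-stmts.cc (get_load_store_type): Change perm_ok
> argument to a pointer and perform the load-permutation analysis here.
> (vectorizable_store): Adjust call to get_load_store_type.
> (vectorizable_load): Ditto.  Use ls.n_perms instead of a local n_perms.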
> * tree-vectorizer.h (vect_transform_slp_perm_load): Add
> argument.
> (vect_analyze_slp_perm_load): New declaration.
> ---
> gcc/tree-vect-slp.cc | 35 ++++++++++++++++++++++++++++++-----
> gcc/tree-vect-stmts.cc | 38 +++++++++++++++++++-------------------
> gcc/tree-vectorizer.h | 5 +++++
> 3 files changed, 54 insertions(+), 24 deletions(-)
>
> diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> index 895fb88ab7f..ead9b558131 100644
> --- a/gcc/tree-vect-slp.cc
> +++ b/gcc/tree-vect-slp.cc
> @@ -59,6 +59,7 @@ static bool vect_transform_slp_perm_load_1 (vec_info *,
> slp_tree,
> gimple_stmt_iterator *,
> poly_uint64, bool, bool,
> unsigned *,
> + tree = NULL_TREE,
> unsigned * = nullptr,
> bool = false);
> static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator
> *,
> @@ -10595,12 +10596,16 @@ vect_transform_slp_perm_load_1 (vec_info *vinfo,
> slp_tree node,
> const vec<tree> &dr_chain,
> gimple_stmt_iterator *gsi, poly_uint64 vf,
> bool analyze_only, bool dump_p,
> - unsigned *n_perms, unsigned int *n_loads,
> + unsigned *n_perms,
> + tree punning_vectype,
> + unsigned int *n_loads,
> bool dce_chain)
> {
> stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
> int vec_index = 0;
> - tree vectype = SLP_TREE_VECTYPE (node);
> + tree vectype = punning_vectype;
> + if (!vectype)
> + vectype = SLP_TREE_VECTYPE (node);
> unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
> unsigned int mask_element;
> unsigned dr_group_size;
> @@ -10868,22 +10873,42 @@ vect_transform_slp_perm_load_1 (vec_info *vinfo,
> slp_tree node,
> permute statements for the SLP node NODE. Store the number of vector
> permute instructions in *N_PERMS and the number of vector load
> instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
> - that were not needed. */
> + that were not needed.
> + When PUNNING_VECTYPE is passed, use that one instead of NODE's vectype
> + for calculating the permutations. This can be used when performing the
> + load with a different ("punning") vectype and we want to know whether
> + the load permutation would be a nop with the punning vectype. */
>
> bool
> vect_transform_slp_perm_load (vec_info *vinfo,
> slp_tree node, const vec<tree> &dr_chain,
> gimple_stmt_iterator *gsi, poly_uint64 vf,
> bool analyze_only, unsigned *n_perms,
> - unsigned int *n_loads, bool dce_chain)
> + tree punning_vectype, unsigned int *n_loads,
> + bool dce_chain)
> {
> return vect_transform_slp_perm_load_1 (vinfo, node,
> SLP_TREE_LOAD_PERMUTATION (node),
> dr_chain, gsi, vf, analyze_only,
> - dump_enabled_p (), n_perms, n_loads,
> + dump_enabled_p (), n_perms,
> + punning_vectype, n_loads,
> dce_chain);
> }
>
> +/* Similar to vect_transform_slp_perm_load but only perform analysis
> + without changing anything. */
> +
> +bool
> +vect_analyze_slp_perm_load (vec_info *vinfo, slp_tree node, poly_uint64 vf,
> + unsigned *n_perms, tree punning_vectype)
> +{
> + return vect_transform_slp_perm_load_1 (vinfo, node,
> + SLP_TREE_LOAD_PERMUTATION (node),
> + vNULL, nullptr, vf, true,
> + dump_enabled_p (), n_perms,
> + punning_vectype);
> +}
> +
> /* Produce the next vector result for SLP permutation NODE by adding a vector
> statement at GSI. If MASK_VEC is nonnull, add:
>
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 6274956e2a5..f3cc54b6c4c 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -1972,7 +1972,7 @@ static bool
> get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
> tree vectype, slp_tree slp_node,
> bool masked_p, vec_load_store_type vls_type,
> - bool perm_ok, vect_load_store_data *ls)
> + bool *perm_ok, vect_load_store_data *ls)
> {
> vect_memory_access_type *memory_access_type = &ls->memory_access_type;
> poly_int64 *poffset = &ls->poffset;
> @@ -1989,6 +1989,8 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info
> stmt_info,
> unsigned HOST_WIDE_INT gap;
> bool single_element_p;
> poly_int64 neg_ldst_offset = 0;
> + poly_int64 vf = loop_vinfo ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
> + unsigned int *n_perms = &ls->n_perms;
>
> *misalignment = DR_MISALIGNMENT_UNKNOWN;
> *poffset = 0;
> @@ -2030,6 +2032,9 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info
> stmt_info,
> if (! SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
> first_dr_info = STMT_VINFO_DR_INFO (SLP_TREE_SCALAR_STMTS (slp_node)[0]);
>
> + if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
> + *perm_ok = vect_analyze_slp_perm_load (vinfo, slp_node, vf, n_perms);
> +
> if (STMT_VINFO_STRIDED_P (first_stmt_info))
> /* Try to use consecutive accesses of as many elements as possible,
> separated by the stride, until we have a complete vector.
> @@ -2162,7 +2167,7 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info
> stmt_info,
> && (*memory_access_type == VMAT_CONTIGUOUS
> || *memory_access_type == VMAT_CONTIGUOUS_REVERSE)
> && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
> - && !perm_ok)
> + && !*perm_ok)
> {
> *memory_access_type = VMAT_ELEMENTWISE;
> if (dump_enabled_p ())
> @@ -7878,11 +7883,13 @@ vectorizable_store (vec_info *vinfo,
> if (!STMT_VINFO_DATA_REF (stmt_info))
> return false;
>
> + bool perm_ok_tmp;
> +
> vect_load_store_data _ls_data{};
> vect_load_store_data &ls = slp_node->get_data (_ls_data);
> if (cost_vec
> && !get_load_store_type (vinfo, stmt_info, vectype, slp_node,
> mask_node,
> - vls_type, false, &_ls_data))
> + vls_type, &perm_ok_tmp, &_ls_data))
> return false;
> /* Temporary aliases to analysis data, should not be modified through
> these. */
> @@ -9449,16 +9456,12 @@ vectorizable_load (vec_info *vinfo,
> group_size = 1;
>
> bool perm_ok = true;
> - unsigned n_perms = -1U;
> - if (cost_vec && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
> - perm_ok = vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL, vf,
> - true, &n_perms);
>
> vect_load_store_data _ls_data{};
> vect_load_store_data &ls = slp_node->get_data (_ls_data);
> if (cost_vec
> && !get_load_store_type (vinfo, stmt_info, vectype, slp_node,
> mask_node,
> - VLS_LOAD, perm_ok, &ls))
> + VLS_LOAD, &perm_ok, &ls))
> return false;
> /* Temporary aliases to analysis data, should not be modified through
> these. */
> @@ -9523,10 +9526,7 @@ vectorizable_load (vec_info *vinfo,
> "unsupported load permutation\n");
> return false;
> }
> - ls.n_perms = n_perms;
> }
> - else
> - n_perms = ls.n_perms;
> }
>
> if (slp_node->ldst_lanes
> @@ -9999,8 +9999,8 @@ vectorizable_load (vec_info *vinfo,
> {
> if (costing_p)
> {
> - gcc_assert (n_perms != -1U);
> - inside_cost += record_stmt_cost (cost_vec, n_perms, vec_perm,
> + gcc_assert (ls.n_perms != -1U);
> + inside_cost += record_stmt_cost (cost_vec, ls.n_perms, vec_perm,
> slp_node, 0, vect_body);
> }
> else
> @@ -10008,7 +10008,7 @@ vectorizable_load (vec_info *vinfo,
> unsigned n_perms2;
> vect_transform_slp_perm_load (vinfo, slp_node, dr_chain, gsi,
> vf,
> false, &n_perms2);
> - gcc_assert (n_perms == n_perms2);
> + gcc_assert (ls.n_perms == n_perms2);
> }
> }
>
> @@ -11393,9 +11393,9 @@ vectorizable_load (vec_info *vinfo,
> in PR101120 and friends. */
> if (costing_p)
> {
> - gcc_assert (n_perms != -1U);
> - if (n_perms != 0)
> - inside_cost = record_stmt_cost (cost_vec, n_perms, vec_perm,
> + gcc_assert (ls.n_perms != -1U);
> + if (ls.n_perms != 0)
> + inside_cost = record_stmt_cost (cost_vec, ls.n_perms, vec_perm,
> slp_node, 0, vect_body);
> }
> else
> @@ -11403,8 +11403,8 @@ vectorizable_load (vec_info *vinfo,
> unsigned n_perms2;
> bool ok = vect_transform_slp_perm_load (vinfo, slp_node, dr_chain,
> gsi, vf, false, &n_perms2,
> - nullptr, true);
> - gcc_assert (ok && n_perms == n_perms2);
> + NULL_TREE, nullptr, true);
> + gcc_assert (ok && ls.n_perms == n_perms2);
> }
> dr_chain.release ();
> }
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index b7c2188ab3d..8f0e99b1457 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -2755,7 +2755,12 @@ extern void vect_free_slp_instance (slp_instance);
> extern bool vect_transform_slp_perm_load (vec_info *, slp_tree, const
> vec<tree> &,
> gimple_stmt_iterator *, poly_uint64,
> bool, unsigned *,
> + tree = NULL_TREE,
> unsigned * = nullptr, bool = false);
> +extern bool vect_analyze_slp_perm_load (vec_info *, slp_tree,
> + poly_uint64,
> + unsigned *,
> + tree = NULL_TREE);
> extern bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
> slp_tree, stmt_vector_for_cost *);
> extern bool vect_slp_analyze_operations (vec_info *);
> --
> 2.51.0
>