Hi,

This patch adds vect_analyze_slp_perm_load, an explicit variant of
vect_transform_slp_perm_load that performs only the analysis part.

I find it slightly clearer to indicate "analysis" in the
function name already rather than having to pass "analyze_only = true"
and set two other params to NULL.

A call to vect_analyze_slp_perm_load (vinfo, node, vf, n_perms,
punning_vectype) is equivalent to

   return vect_transform_slp_perm_load_1 (vinfo, node,
                                          SLP_TREE_LOAD_PERMUTATION (node),
                                          vNULL, nullptr, vf, true,
                                          dump_enabled_p (), n_perms,
                                          punning_vectype);

The patch also introduces an optional override for the vectype
(PUNNING_VECTYPE) and moves the load-permutation analysis from
vectorizable_load, where it ran before the call to get_load_store_type,
into get_load_store_type itself.

This is mostly a small refactoring in preparation for the "grouped gather"
patch.  Should we have more guardrails on the overriding vectype, such as
asserting that the sizes match?

Bootstrapped and regtested on x86 and power10.  Regtested on rv64gcv_zvl512b;
the aarch64 regtest is still running.

Regards
 Robin

gcc/ChangeLog:

        * tree-vect-slp.cc (vect_transform_slp_perm_load_1): Add
        type-punning argument.
        (vect_transform_slp_perm_load): Ditto.
        (vect_analyze_slp_perm_load): New function.
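        * tree-vect-stmts.cc (get_load_store_type): Change perm_ok
        argument to a pointer and perform the load-permutation
        analysis here.
        (vectorizable_store): Adjust call.
        (vectorizable_load): Ditto.  Use ls.n_perms instead of a
        local n_perms.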
        * tree-vectorizer.h (vect_transform_slp_perm_load): Add
        argument.
        (vect_analyze_slp_perm_load): Declare.
---
 gcc/tree-vect-slp.cc   | 35 ++++++++++++++++++++++++++++++-----
 gcc/tree-vect-stmts.cc | 38 +++++++++++++++++++-------------------
 gcc/tree-vectorizer.h  |  5 +++++
 3 files changed, 54 insertions(+), 24 deletions(-)

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 895fb88ab7f..ead9b558131 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -59,6 +59,7 @@ static bool vect_transform_slp_perm_load_1 (vec_info *, 
slp_tree,
                                            gimple_stmt_iterator *,
                                            poly_uint64, bool, bool,
                                            unsigned *,
+                                           tree = NULL_TREE,
                                            unsigned * = nullptr,
                                            bool = false);
 static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
@@ -10595,12 +10596,16 @@ vect_transform_slp_perm_load_1 (vec_info *vinfo, 
slp_tree node,
                                const vec<tree> &dr_chain,
                                gimple_stmt_iterator *gsi, poly_uint64 vf,
                                bool analyze_only, bool dump_p,
-                               unsigned *n_perms, unsigned int *n_loads,
+                               unsigned *n_perms,
+                               tree punning_vectype,
+                               unsigned int *n_loads,
                                bool dce_chain)
 {
   stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
   int vec_index = 0;
-  tree vectype = SLP_TREE_VECTYPE (node);
+  tree vectype = punning_vectype;
+  if (!vectype)
+    vectype = SLP_TREE_VECTYPE (node);
   unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
   unsigned int mask_element;
   unsigned dr_group_size;
@@ -10868,22 +10873,42 @@ vect_transform_slp_perm_load_1 (vec_info *vinfo, 
slp_tree node,
    permute statements for the SLP node NODE.  Store the number of vector
    permute instructions in *N_PERMS and the number of vector load
    instructions in *N_LOADS.  If DCE_CHAIN is true, remove all definitions
-   that were not needed.  */
+   that were not needed.
+   When PUNNING_VECTYPE is passed, use that one instead of NODE's vectype
+   for calculating the permutations.  This can be used when performing the
+   load with a different ("punning") vectype and we want to know whether
+   the load permutation would be a nop with the punning vectype.  */
 
 bool
 vect_transform_slp_perm_load (vec_info *vinfo,
                              slp_tree node, const vec<tree> &dr_chain,
                              gimple_stmt_iterator *gsi, poly_uint64 vf,
                              bool analyze_only, unsigned *n_perms,
-                             unsigned int *n_loads, bool dce_chain)
+                             tree punning_vectype, unsigned int *n_loads,
+                             bool dce_chain)
 {
   return vect_transform_slp_perm_load_1 (vinfo, node,
                                         SLP_TREE_LOAD_PERMUTATION (node),
                                         dr_chain, gsi, vf, analyze_only,
-                                        dump_enabled_p (), n_perms, n_loads,
+                                        dump_enabled_p (), n_perms,
+                                        punning_vectype, n_loads,
                                         dce_chain);
 }
 
+/* Similar to vect_transform_slp_perm_load but only perform analysis
+   without changing anything.  */
+
+bool
+vect_analyze_slp_perm_load (vec_info *vinfo, slp_tree node, poly_uint64 vf,
+                           unsigned *n_perms, tree punning_vectype)
+{
+  return vect_transform_slp_perm_load_1 (vinfo, node,
+                                        SLP_TREE_LOAD_PERMUTATION (node),
+                                        vNULL, nullptr, vf, true,
+                                        dump_enabled_p (), n_perms,
+                                        punning_vectype);
+}
+
 /* Produce the next vector result for SLP permutation NODE by adding a vector
    statement at GSI.  If MASK_VEC is nonnull, add:
 
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 6274956e2a5..f3cc54b6c4c 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1972,7 +1972,7 @@ static bool
 get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
                     tree vectype, slp_tree slp_node,
                     bool masked_p, vec_load_store_type vls_type,
-                    bool perm_ok, vect_load_store_data *ls)
+                    bool *perm_ok, vect_load_store_data *ls)
 {
   vect_memory_access_type *memory_access_type = &ls->memory_access_type;
   poly_int64 *poffset = &ls->poffset;
@@ -1989,6 +1989,8 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info 
stmt_info,
   unsigned HOST_WIDE_INT gap;
   bool single_element_p;
   poly_int64 neg_ldst_offset = 0;
+  poly_int64 vf = loop_vinfo ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
+  unsigned int *n_perms = &ls->n_perms;
 
   *misalignment = DR_MISALIGNMENT_UNKNOWN;
   *poffset = 0;
@@ -2030,6 +2032,9 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info 
stmt_info,
   if (! SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
     first_dr_info = STMT_VINFO_DR_INFO (SLP_TREE_SCALAR_STMTS (slp_node)[0]);
 
+  if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
+    *perm_ok = vect_analyze_slp_perm_load (vinfo, slp_node, vf, n_perms);
+
   if (STMT_VINFO_STRIDED_P (first_stmt_info))
     /* Try to use consecutive accesses of as many elements as possible,
        separated by the stride, until we have a complete vector.
@@ -2162,7 +2167,7 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info 
stmt_info,
          && (*memory_access_type == VMAT_CONTIGUOUS
              || *memory_access_type == VMAT_CONTIGUOUS_REVERSE)
          && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
-         && !perm_ok)
+         && !*perm_ok)
        {
          *memory_access_type = VMAT_ELEMENTWISE;
          if (dump_enabled_p ())
@@ -7878,11 +7883,13 @@ vectorizable_store (vec_info *vinfo,
   if (!STMT_VINFO_DATA_REF (stmt_info))
     return false;
 
+  bool perm_ok_tmp;
+
   vect_load_store_data _ls_data{};
   vect_load_store_data &ls = slp_node->get_data (_ls_data);
   if (cost_vec
       && !get_load_store_type (vinfo, stmt_info, vectype, slp_node, mask_node,
-                              vls_type, false, &_ls_data))
+                              vls_type, &perm_ok_tmp, &_ls_data))
     return false;
   /* Temporary aliases to analysis data, should not be modified through
      these.  */
@@ -9449,16 +9456,12 @@ vectorizable_load (vec_info *vinfo,
     group_size = 1;
 
   bool perm_ok = true;
-  unsigned n_perms = -1U;
-  if (cost_vec && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
-    perm_ok = vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL, vf,
-                                           true, &n_perms);
 
   vect_load_store_data _ls_data{};
   vect_load_store_data &ls = slp_node->get_data (_ls_data);
   if (cost_vec
       && !get_load_store_type (vinfo, stmt_info, vectype, slp_node, mask_node,
-                              VLS_LOAD, perm_ok, &ls))
+                              VLS_LOAD, &perm_ok, &ls))
     return false;
   /* Temporary aliases to analysis data, should not be modified through
      these.  */
@@ -9523,10 +9526,7 @@ vectorizable_load (vec_info *vinfo,
                                 "unsupported load permutation\n");
              return false;
            }
-         ls.n_perms = n_perms;
        }
-      else
-       n_perms = ls.n_perms;
     }
 
   if (slp_node->ldst_lanes
@@ -9999,8 +9999,8 @@ vectorizable_load (vec_info *vinfo,
        {
          if (costing_p)
            {
-             gcc_assert (n_perms != -1U);
-             inside_cost += record_stmt_cost (cost_vec, n_perms, vec_perm,
+             gcc_assert (ls.n_perms != -1U);
+             inside_cost += record_stmt_cost (cost_vec, ls.n_perms, vec_perm,
                                               slp_node, 0, vect_body);
            }
          else
@@ -10008,7 +10008,7 @@ vectorizable_load (vec_info *vinfo,
              unsigned n_perms2;
              vect_transform_slp_perm_load (vinfo, slp_node, dr_chain, gsi, vf,
                                            false, &n_perms2);
-             gcc_assert (n_perms == n_perms2);
+             gcc_assert (ls.n_perms == n_perms2);
            }
        }
 
@@ -11393,9 +11393,9 @@ vectorizable_load (vec_info *vinfo,
         in PR101120 and friends.  */
       if (costing_p)
        {
-         gcc_assert (n_perms != -1U);
-         if (n_perms != 0)
-           inside_cost = record_stmt_cost (cost_vec, n_perms, vec_perm,
+         gcc_assert (ls.n_perms != -1U);
+         if (ls.n_perms != 0)
+           inside_cost = record_stmt_cost (cost_vec, ls.n_perms, vec_perm,
                                            slp_node, 0, vect_body);
        }
       else
@@ -11403,8 +11403,8 @@ vectorizable_load (vec_info *vinfo,
          unsigned n_perms2;
          bool ok = vect_transform_slp_perm_load (vinfo, slp_node, dr_chain,
                                                  gsi, vf, false, &n_perms2,
-                                                 nullptr, true);
-         gcc_assert (ok && n_perms == n_perms2);
+                                                 NULL_TREE, nullptr, true);
+         gcc_assert (ok && ls.n_perms == n_perms2);
        }
       dr_chain.release ();
     }
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index b7c2188ab3d..8f0e99b1457 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2755,7 +2755,12 @@ extern void vect_free_slp_instance (slp_instance);
 extern bool vect_transform_slp_perm_load (vec_info *, slp_tree, const 
vec<tree> &,
                                          gimple_stmt_iterator *, poly_uint64,
                                          bool, unsigned *,
+                                         tree = NULL_TREE,
                                          unsigned * = nullptr, bool = false);
+extern bool vect_analyze_slp_perm_load (vec_info *, slp_tree,
+                                       poly_uint64,
+                                       unsigned *,
+                                       tree = NULL_TREE);
 extern bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
                                          slp_tree, stmt_vector_for_cost *);
 extern bool vect_slp_analyze_operations (vec_info *);
-- 
2.51.0

Reply via email to