https://gcc.gnu.org/g:0f8bcacf6c4603cb7614aa23ef4a9568e46bddfa

commit r16-5115-g0f8bcacf6c4603cb7614aa23ef4a9568e46bddfa
Author: Robin Dapp <[email protected]>
Date:   Thu Oct 9 17:25:59 2025 +0200

    vect: Reduce group size of consecutive strided accesses.
    
    Consecutive load permutations like {0, 1, 2, 3} or {4, 5, 6, 7} in a
    group of 8 only read a part of the group, leaving a gap.
    
    For strided accesses we can elide the permutation and, instead of
    accessing the whole group, use the number of SLP lanes.  This
    effectively increases the vector size as we don't load gaps.  On top we
    do not need to emit the permutes at all.
    
    gcc/ChangeLog:
    
            * tree-vect-slp.cc (vect_load_perm_consecutive_p): New function.
            (vect_lower_load_permutations): Use.
            (vect_optimize_slp_pass::remove_redundant_permutations): Use.
            * tree-vect-stmts.cc (has_consecutive_load_permutation): New
            function that uses vect_load_perm_consecutive_p.
            (get_load_store_type): Use.
            (vectorizable_load): Reduce group size.
            * tree-vectorizer.h (struct vect_load_store_data): Add
            subchain_p.
            (vect_load_perm_consecutive_p): Declare.
    
    gcc/testsuite/ChangeLog:
    
        * gcc.target/riscv/rvv/autovec/pr118019-2.c: Adjust test expectation.

Diff:
---
 .../gcc.target/riscv/rvv/autovec/pr118019-2.c      |  2 +-
 gcc/tree-vect-slp.cc                               | 52 +++++++++---------
 gcc/tree-vect-stmts.cc                             | 63 +++++++++++++++++++---
 gcc/tree-vectorizer.h                              |  4 ++
 4 files changed, 88 insertions(+), 33 deletions(-)

diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c
index c8c1a7291fbd..d3436b78377c 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/pr118019-2.c
@@ -47,4 +47,4 @@ x264_pixel_satd_8x4 (uint8_t *pix1, int i_pix1, uint8_t 
*pix2, int i_pix2)
   return (((uint16_t) sum) + ((uint32_t) sum >> 16)) >> 1;
 }
 
-/* { dg-final { scan-assembler-times "vlse64" 8 } } */
+/* { dg-final { scan-assembler-times "vlse32" 4 } } */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index aa6c3e2e041d..c0436ac20dc1 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -5184,6 +5184,25 @@ vllp_cmp (const void *a_, const void *b_)
     }
 }
 
+/* Return whether the load permutation of NODE is consecutive starting
+   from index START_IDX.  */
+
+bool
+vect_load_perm_consecutive_p (slp_tree node, unsigned start_idx)
+{
+  load_permutation_t perm = SLP_TREE_LOAD_PERMUTATION (node);
+
+  if (!perm.exists () || perm.length () < start_idx)
+    return false;
+
+  unsigned int start = perm[start_idx];
+  for (unsigned int i = start_idx + 1; i < perm.length (); i++)
+    if (perm[i] != start + (unsigned int)i)
+      return false;
+
+  return true;
+}
+
 /* Process the set of LOADS that are all from the same dataref group.  */
 
 static void
@@ -5230,12 +5249,11 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
            ld_lanes_lanes = 0;
            break;
          }
-       for (unsigned i = 1; i < SLP_TREE_LANES (load); ++i)
-         if (SLP_TREE_LOAD_PERMUTATION (load)[i] != first + i)
-           {
-             ld_lanes_lanes = 0;
-             break;
-           }
+       if (!vect_load_perm_consecutive_p (load))
+         {
+           ld_lanes_lanes = 0;
+           break;
+         }
       }
 
   /* Only a power-of-two number of lanes matches interleaving with N levels.
@@ -5272,18 +5290,12 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
        continue;
 
       /* Build the permute to get the original load permutation order.  */
-      bool contiguous = true;
+      bool contiguous = vect_load_perm_consecutive_p (load);
       lane_permutation_t final_perm;
       final_perm.create (SLP_TREE_LANES (load));
       for (unsigned i = 0; i < SLP_TREE_LANES (load); ++i)
-       {
-         final_perm.quick_push
-           (std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));
-         if (i != 0
-             && (SLP_TREE_LOAD_PERMUTATION (load)[i]
-                 != SLP_TREE_LOAD_PERMUTATION (load)[i-1] + 1))
-           contiguous = false;
-       }
+       final_perm.quick_push (
+         std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));
 
       /* When the load permutation accesses a contiguous unpermuted,
         power-of-two aligned and sized chunk leave the load alone.
@@ -7855,15 +7867,7 @@ vect_optimize_slp_pass::remove_redundant_permutations ()
       else
        {
          loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
-         stmt_vec_info load_info;
-         bool this_load_permuted = false;
-         unsigned j;
-         FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
-           if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
-             {
-               this_load_permuted = true;
-               break;
-             }
+         bool this_load_permuted = !vect_load_perm_consecutive_p (node);
          /* When this isn't a grouped access we know it's single element
             and contiguous.  */
          if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 1620fc6cbccb..25ec87022f2a 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2056,6 +2056,27 @@ vector_vector_composition_type (tree vtype, poly_uint64 
nelts, tree *ptype,
   return NULL_TREE;
 }
 
+/* Check if the load permutation of NODE only refers to a consecutive
+   subset of the group indices where GROUP_SIZE is the size of the
+   dataref's group.  We also require that the length of the permutation
+   divides the group size and is a power of two.
+   Such load permutations can be elided in strided access schemes as
+   we can "jump over" the gap they leave.  */
+
+bool
+has_consecutive_load_permutation (slp_tree node, unsigned group_size)
+{
+  load_permutation_t perm = SLP_TREE_LOAD_PERMUTATION (node);
+  if (!perm.exists ()
+      || perm.length () <= 1
+      || !pow2p_hwi (perm.length ())
+      || group_size % perm.length ())
+    return false;
+
+  return vect_load_perm_consecutive_p (node);
+}
+
+
 /* Analyze load or store SLP_NODE of type VLS_TYPE.  Return true
    if there is a memory access type that the vectorized form can use,
    storing it in *MEMORY_ACCESS_TYPE if so.  If we decide to use gathers
@@ -2103,6 +2124,7 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info 
stmt_info,
   *ls_type = NULL_TREE;
   *slp_perm = false;
   *n_perms = -1U;
+  ls->subchain_p = false;
 
   bool perm_ok = true;
   poly_int64 vf = loop_vinfo ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
@@ -2149,10 +2171,22 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info 
stmt_info,
     first_dr_info = STMT_VINFO_DR_INFO (SLP_TREE_SCALAR_STMTS (slp_node)[0]);
 
   if (STMT_VINFO_STRIDED_P (first_stmt_info))
-    /* Try to use consecutive accesses of as many elements as possible,
-       separated by the stride, until we have a complete vector.
-       Fall back to scalar accesses if that isn't possible.  */
-    *memory_access_type = VMAT_STRIDED_SLP;
+    {
+      /* Try to use consecutive accesses of as many elements as possible,
+        separated by the stride, until we have a complete vector.
+        Fall back to scalar accesses if that isn't possible.  */
+      *memory_access_type = VMAT_STRIDED_SLP;
+
+      /* If the load permutation is consecutive we can reduce the group to
+        the elements the permutation accesses.  Then we release the
+        permutation.  */
+      if (has_consecutive_load_permutation (slp_node, group_size))
+       {
+         ls->subchain_p = true;
+         group_size = SLP_TREE_LANES (slp_node);
+         SLP_TREE_LOAD_PERMUTATION (slp_node).release ();
+       }
+    }
   else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
     {
       slp_tree offset_node = SLP_TREE_CHILDREN (slp_node)[0];
@@ -2436,8 +2470,7 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info 
stmt_info,
   vect_memory_access_type grouped_gather_fallback = VMAT_UNINITIALIZED;
   if (loop_vinfo
       && (*memory_access_type == VMAT_ELEMENTWISE
-         || *memory_access_type == VMAT_STRIDED_SLP)
-      && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+         || *memory_access_type == VMAT_STRIDED_SLP))
     {
       gather_scatter_info gs_info;
       if (SLP_TREE_LANES (slp_node) == 1
@@ -9945,7 +9978,14 @@ vectorizable_load (vec_info *vinfo,
 
       if (grouped_load)
        {
-         first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+         /* If we elided a consecutive load permutation, don't
+            use the original first statement (which could be elided)
+            but the one the load permutation starts with.
+            This ensures the stride_base below is correct.  */
+         if (!ls.subchain_p)
+           first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
+         else
+           first_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
          first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
          ref_type = get_group_alias_ptr_type (first_stmt_info);
        }
@@ -9959,7 +9999,14 @@ vectorizable_load (vec_info *vinfo,
       if (grouped_load)
        {
          if (memory_access_type == VMAT_STRIDED_SLP)
-           group_size = DR_GROUP_SIZE (first_stmt_info);
+           {
+             /* If we elided a consecutive load permutation, adjust
+                the group size here.  */
+             if (!ls.subchain_p)
+               group_size = DR_GROUP_SIZE (first_stmt_info);
+             else
+               group_size = SLP_TREE_LANES (slp_node);
+           }
          else /* VMAT_ELEMENTWISE */
            group_size = SLP_TREE_LANES (slp_node);
        }
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index b7f3297a16b9..1635829c7662 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -288,6 +288,7 @@ struct vect_load_store_data : vect_data {
       tree decl;       // VMAT_GATHER_SCATTER_DECL
   } gs;
   tree strided_offset_vectype; // VMAT_GATHER_SCATTER_IFN, originally strided
+  /* Load/store type with larger element mode used for punning the vectype.  */
   tree ls_type; // VMAT_GATHER_SCATTER_IFN
   /* This is set to a supported offset vector type if we don't support the
      originally requested offset type, otherwise NULL.
@@ -302,6 +303,8 @@ struct vect_load_store_data : vect_data {
   /* True if the load requires a load permutation.  */
   bool slp_perm;    // SLP_TREE_LOAD_PERMUTATION
   unsigned n_perms; // SLP_TREE_LOAD_PERMUTATION
+  /* Whether the load permutation is consecutive and simple.  */
+  bool subchain_p; // VMAT_STRIDED_SLP and VMAT_GATHER_SCATTER
 };
 
 /* A computation tree of an SLP instance.  Each node corresponds to a group of
@@ -2763,6 +2766,7 @@ extern int vect_slp_child_index_for_operand (const gimple 
*, int op, bool);
 extern tree prepare_vec_mask (loop_vec_info, tree, tree, tree,
                              gimple_stmt_iterator *);
 extern tree vect_get_mask_load_else (int, tree);
+extern bool vect_load_perm_consecutive_p (slp_tree, unsigned = 0);
 
 /* In tree-vect-patterns.cc.  */
 extern void

Reply via email to