Currently SLP of invariant loads is only supported for the case of
a single load that is splat, as a side effect of supporting this case
even for non-invariant loads.  The following extends this to any
set of invariant loads.  The way load permutations are represented
for these makes this a bit awkward, hence the adjustments in that area.
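
For illustration, a reduced sketch (made-up function, not the included
testcase) of the kind of loop this now covers: the four distinct
invariant loads from c[] become a single SLP load node instead of
requiring the splat special-case.

  void foo (float * __restrict out, const float * __restrict c, long n)
  {
    for (long i = 0; i < n; ++i)
      {
        /* Four distinct invariant loads (zero DR_STEP); before this
           change only the all-same-lane (splat) case was handled.  */
        out[4*i + 0] = c[0];
        out[4*i + 1] = c[1];
        out[4*i + 2] = c[2];
        out[4*i + 3] = c[3];
      }
  }
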
Bootstrapped and tested on x86_64-unknown-linux-gnu, will push later.
PR tree-optimization/122573
* tree-vect-slp.cc (vect_build_slp_tree_1): Support
groups of invariant loads.
(vect_build_slp_tree_2): Likewise.
(vect_transform_slp_perm_load_1): Likewise.
* tree-vect-stmts.cc (vectorizable_load): Handle non-splat
SLP for invariant loads.
* gcc.dg/vect/slp-58.c: New testcase.
---
gcc/testsuite/gcc.dg/vect/slp-58.c | 28 ++++++++
gcc/tree-vect-slp.cc | 22 ++++--
gcc/tree-vect-stmts.cc | 104 +++++++++++++++++++++--------
3 files changed, 124 insertions(+), 30 deletions(-)
create mode 100644 gcc/testsuite/gcc.dg/vect/slp-58.c
diff --git a/gcc/testsuite/gcc.dg/vect/slp-58.c b/gcc/testsuite/gcc.dg/vect/slp-58.c
new file mode 100644
index 00000000000..e03cfa3f115
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-58.c
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_float } */
+
+struct S {
+ float m_col1[4];
+ float m_col2[4];
+ float m_col3[4];
+ float m_col4[4];
+};
+
+void apply(struct S *s, const float *in, float *out, long numPixels)
+{
+ for (long idx = 0; idx < numPixels; ++idx)
+ {
+ const float r = in[0];
+ const float g = in[1];
+ const float b = in[2];
+ const float a = in[3];
+ out[0] = r*s->m_col1[0] + g*s->m_col2[0] + b*s->m_col3[0] + a*s->m_col4[0];
+ out[1] = r*s->m_col1[1] + g*s->m_col2[1] + b*s->m_col3[1] + a*s->m_col4[1];
+ out[2] = r*s->m_col1[2] + g*s->m_col2[2] + b*s->m_col3[2] + a*s->m_col4[2];
+ out[3] = r*s->m_col1[3] + g*s->m_col2[3] + b*s->m_col3[3] + a*s->m_col4[3];
+ in += 4;
+ out += 4;
+ }
+}
+
+/* { dg-final { scan-tree-dump "vectorization factor = 1" "vect" { target { ! vect_load_lanes } } } } */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index c0436ac20dc..07e22ea7ccf 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -1539,9 +1539,13 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
&& !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
/* Not grouped loads are handled as externals for BB
vectorization. For loop vectorization we can handle
- splats the same we handle single element interleaving. */
+ splats the same we handle single element interleaving.
+ Likewise we can handle a collection of invariant refs. */
&& (is_a <bb_vec_info> (vinfo)
- || stmt_info != first_stmt_info))
+ || (stmt_info != first_stmt_info
+ && !(integer_zerop (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)))
+ && integer_zerop (DR_STEP (STMT_VINFO_DATA_REF
+ (first_stmt_info)))))))
{
/* Not grouped load. */
if (dump_enabled_p ())
@@ -2094,7 +2098,10 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
load_place = vect_get_place_in_interleaving_chain
(load_info, first_stmt_info);
else
- load_place = 0;
+ /* Recognize the splat case as { 0, 0, ... } but make
+ sure to use the appropriate refs for collections
+ of invariant refs. */
+ load_place = (load_info == stmt_info) ? 0 : j;
gcc_assert (load_place != -1);
any_permute |= load_place != j;
load_permutation.quick_push (load_place);
@@ -10975,7 +10982,14 @@ vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
machine_mode mode;

if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
- dr_group_size = 1;
+ {
+ /* We have both splats of the same non-grouped load and groups
+ of distinct invariant loads entering here. */
+ unsigned max_idx = 0;
+ for (auto idx : perm)
+ max_idx = idx > max_idx ? idx : max_idx;
+ dr_group_size = max_idx + 1;
+ }
else
{
stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index d08f5f19fd4..2bc68e25b90 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -9903,10 +9903,24 @@ vectorizable_load (vec_info *vinfo,
once at analysis time, remembered and used in the
transform time. */
bool hoist_p = (LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo)
- && !nested_in_vect_loop
- && hoist_defs_of_uses (stmt_info->stmt, loop, false));
+ && !nested_in_vect_loop);
+ bool uniform_p = true;
+ for (stmt_vec_info sinfo : SLP_TREE_SCALAR_STMTS (slp_node))
+ {
+ hoist_p = hoist_p && hoist_defs_of_uses (sinfo->stmt, loop, false);
+ if (sinfo != SLP_TREE_SCALAR_STMTS (slp_node)[0])
+ uniform_p = false;
+ }
if (costing_p)
{
+ if (!uniform_p && (!hoist_p || !vf.is_constant ()))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "not vectorizing non-uniform invariant "
+ "load\n");
+ return false;
+ }
enum vect_cost_model_location cost_loc
= hoist_p ? vect_prologue : vect_body;
unsigned int cost = record_stmt_cost (cost_vec, 1, scalar_load,
@@ -9924,39 +9938,77 @@ vectorizable_load (vec_info *vinfo,
}
if (hoist_p)
{
- gassign *stmt = as_a <gassign *> (stmt_info->stmt);
- if (dump_enabled_p ())
- dump_printf_loc (MSG_NOTE, vect_location,
- "hoisting out of the vectorized loop: %G",
- (gimple *) stmt);
- scalar_dest = copy_ssa_name (scalar_dest);
- tree rhs = unshare_expr (gimple_assign_rhs1 (stmt));
- edge pe = loop_preheader_edge (loop);
- gphi *vphi = get_virtual_phi (loop->header);
- tree vuse;
- if (vphi)
- vuse = PHI_ARG_DEF_FROM_EDGE (vphi, pe);
- else
- vuse = gimple_vuse (gsi_stmt (*gsi));
- gimple *new_stmt = gimple_build_assign (scalar_dest, rhs);
- gimple_set_vuse (new_stmt, vuse);
- gsi_insert_on_edge_immediate (pe, new_stmt);
- hoist_defs_of_uses (new_stmt, loop, true);
+ /* ??? For non-uniform lanes there could be still duplicates.
+ We're leaving those to post-vectorizer CSE for the moment. */
+ auto_vec<tree> scalar_defs (SLP_TREE_LANES (slp_node));
+ for (stmt_vec_info sinfo : SLP_TREE_SCALAR_STMTS (slp_node))
+ {
+ gassign *stmt = as_a <gassign *> (sinfo->stmt);
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "hoisting out of the vectorized loop: %G",
+ (gimple *) stmt);
+ scalar_dest = copy_ssa_name (gimple_assign_lhs (stmt));
+ tree rhs = unshare_expr (gimple_assign_rhs1 (stmt));
+ edge pe = loop_preheader_edge (loop);
+ gphi *vphi = get_virtual_phi (loop->header);
+ tree vuse;
+ if (vphi)
+ vuse = PHI_ARG_DEF_FROM_EDGE (vphi, pe);
+ else
+ vuse = gimple_vuse (gsi_stmt (*gsi));
+ gimple *new_stmt = gimple_build_assign (scalar_dest, rhs);
+ gimple_set_vuse (new_stmt, vuse);
+ gsi_insert_on_edge_immediate (pe, new_stmt);
+ hoist_defs_of_uses (new_stmt, loop, true);
+ if (!useless_type_conversion_p (TREE_TYPE (vectype),
+ TREE_TYPE (scalar_dest)))
+ {
+ tree tem = make_ssa_name (TREE_TYPE (vectype));
+ new_stmt = gimple_build_assign (tem,
+ NOP_EXPR, scalar_dest);
+ gsi_insert_on_edge_immediate (pe, new_stmt);
+ scalar_dest = tem;
+ }
+ scalar_defs.quick_push (scalar_dest);
+ if (uniform_p)
+ break;
+ }
+ if (!uniform_p)
+ {
+ unsigned const_nunits
+ = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
+ for (j = 0; j < (int) vec_num; ++j)
+ {
+ vec<constructor_elt, va_gc> *v = NULL;
+ vec_safe_reserve (v, const_nunits, true);
+ for (unsigned i = 0; i < const_nunits; ++i)
+ {
+ unsigned def_idx
+ = (j * const_nunits + i) % SLP_TREE_LANES (slp_node);
+ CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
+ scalar_defs[def_idx]);
+ }
+ scalar_dest = build_constructor (vectype, v);
+ new_temp = vect_init_vector (vinfo, stmt_info, scalar_dest,
+ vectype, NULL);
+ slp_node->push_vec_def (new_temp);
+ }
+ return true;
+ }
+ new_temp = vect_init_vector (vinfo, stmt_info, scalar_dest,
+ vectype, NULL);
}
- /* These copies are all equivalent. */
- if (hoist_p)
- new_temp = vect_init_vector (vinfo, stmt_info, scalar_dest,
- vectype, NULL);
else
{
+ gcc_assert (uniform_p);
gimple_stmt_iterator gsi2 = *gsi;
gsi_next (&gsi2);
new_temp = vect_init_vector (vinfo, stmt_info, scalar_dest,
vectype, &gsi2);
}
- gimple *new_stmt = SSA_NAME_DEF_STMT (new_temp);
for (j = 0; j < (int) vec_num; ++j)
- slp_node->push_vec_def (new_stmt);
+ slp_node->push_vec_def (new_temp);
return true;
}

--
2.51.0