The following does a simple legitimising attempt on the SLP graph
permutations before trying to optimize them.  For the case we have
a single non-zero layout we can force that to all partitions if
it is compatible.  This way we end up with the most canonical
(and possibly no-op) load permutations and permutes.

I have refrained from trying to use internal_node_cost to actually
check if the result is legitimate (it would need at least the
change to anticipate redundant load permute eliding).  This relies
on start_choosing_layouts chosing layout zero for everything we
cannot handle (like non-bijective permutes).  What's missing is
to try to process disconnected parts of the SLP graph separately,
I think create_partitions doesn't attempt to compute this.  It
shouldn't be too difficult to extend to cover this, but this is
a RFC and not supposed to be a final patch.

Bootstrapped and tested on x86_64-unknown-linux-gnu.

v2 fixes missed layout compatibility checks for an initial batch
of -1 layout nodes (implementation detail, the overall idea is
the same)

        PR tree-optimization/120687
        * tree-vect-slp.cc (vect_optimize_slp_pass::run): Try
        a single layout for all nodes.
---
 gcc/tree-vect-slp.cc | 78 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 76 insertions(+), 2 deletions(-)

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 31d84857d49..9d9ac6db52e 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -8038,8 +8038,82 @@ vect_optimize_slp_pass::run ()
   start_choosing_layouts ();
   if (m_perms.length () > 1)
     {
-      forward_pass ();
-      backward_pass ();
+      /* Perform a very simple legitimizing attempt by attempting to chose
+        a single layout for all partitions that will make all permutations
+        a noop.  That should also be the optimal layout choice in case
+        layout zero is legitimate.
+        ???  Disconnected components of the SLP graph could have distinct
+        single layouts.  */
+      int single_layout_i = -1;
+      auto_vec<unsigned int> check_defered;
+      for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
+          ++partition_i)
+       {
+         auto &partition = m_partitions[partition_i];
+         if (single_layout_i == -1)
+           single_layout_i = partition.layout;
+         else if (partition.layout == single_layout_i
+                  || partition.layout == -1)
+           ;
+         else
+           {
+             single_layout_i = 0;
+             break;
+           }
+
+         if (single_layout_i != -1)
+           for (unsigned int order_i = partition.node_begin;
+                order_i < partition.node_end; ++order_i)
+             {
+               unsigned int node_i = m_partitioned_nodes[order_i];
+               auto &vertex = m_vertices[node_i];
+
+               /* reject the layout if it is individually incompatible
+                  with any node in the partition.  */
+               if (!is_compatible_layout (vertex.node, single_layout_i))
+                 {
+                   single_layout_i = 0;
+                   break;
+                 }
+             }
+         else
+           check_defered.safe_push (partition_i);
+         if (single_layout_i == 0)
+           break;
+       }
+      if (single_layout_i > 0)
+       for (unsigned int partition_i : check_defered)
+         {
+           auto &partition = m_partitions[partition_i];
+           for (unsigned int order_i = partition.node_begin;
+                order_i < partition.node_end; ++order_i)
+             {
+               unsigned int node_i = m_partitioned_nodes[order_i];
+               auto &vertex = m_vertices[node_i];
+
+               /* reject the layout if it is individually incompatible
+                  with any node in the partition.  */
+               if (!is_compatible_layout (vertex.node, single_layout_i))
+                 {
+                   single_layout_i = 0;
+                   break;
+                 }
+             }
+           if (single_layout_i == 0)
+             break;
+         }
+      if (single_layout_i > 0)
+       for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
+            ++partition_i)
+         {
+           auto &partition = m_partitions[partition_i];
+           partition.layout = single_layout_i;
+         }
+      else
+       {
+         forward_pass ();
+         backward_pass ();
+       }
       if (dump_enabled_p ())
        dump ();
       materialize ();
-- 
2.51.0

Reply via email to