I am testing the following pair of patches (first for trunk, 2nd for GCC 7 
branch) to fix PR81633.  On trunk recent refactoring made the PR71752
change obsolete, on the branch the patch installs the simpler originally
suggested patch which works within the constraints vect_get_slp_defs
is used on the branch.

Bootstrapped and tested on x86_64-unknown-linux-gnu (trunk version still
in testing).

Richard.

2017-08-01  Richard Biener  <rguent...@suse.de>

        PR tree-optimization/81633
        Revert
        2015-08-17  Alan Hayward  <alan.hayw...@arm.com>

        PR tree-optimization/71752
        * tree-vect-slp.c (vect_get_slp_defs): Handle null operands.

        * gcc.dg/vect/pr81633.c: New testcase.

Index: gcc/tree-vect-slp.c
===================================================================
--- gcc/tree-vect-slp.c (revision 250725)
+++ gcc/tree-vect-slp.c (working copy)
@@ -3325,32 +3325,24 @@ vect_get_slp_defs (vec<tree> ops, slp_tr
 {
   gimple *first_stmt;
   int number_of_vects = 0, i;
+  unsigned int child_index = 0;
   HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
   slp_tree child = NULL;
   vec<tree> vec_defs;
   tree oprnd;
-  bool first_iteration = true;
+  bool vectorized_defs;
 
   first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
   FOR_EACH_VEC_ELT (ops, i, oprnd)
     {
-      bool vectorized_defs = false;
-
-      if (oprnd == NULL)
-       {
-         vec_defs = vNULL;
-         vec_defs.create (0);
-         vec_oprnds->quick_push (vec_defs);
-         continue;
-       }
-
       /* For each operand we check if it has vectorized definitions in a child
         node or we need to create them (for invariants and constants).  We
         check if the LHS of the first stmt of the next child matches OPRND.
         If it does, we found the correct child.  Otherwise, we call
-        vect_get_constant_vectors ().  */
-      for (unsigned int child_index = 0;
-          child_index < SLP_TREE_CHILDREN (slp_node).length (); child_index++)
+        vect_get_constant_vectors (), and not advance CHILD_INDEX in order
+        to check this child node for the next operand.  */
+      vectorized_defs = false;
+      if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
         {
           child = SLP_TREE_CHILDREN (slp_node)[child_index];
 
@@ -3375,25 +3367,30 @@ vect_get_slp_defs (vec<tree> ops, slp_tr
                     statements.  */
                  number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
                  vectorized_defs = true;
-                 break;
+                 child_index++;
                }
            }
+         else
+           child_index++;
         }
 
-      if (!vectorized_defs && first_iteration)
-       {
-         number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
-         /* Number of vector stmts was calculated according to LHS in
-            vect_schedule_slp_instance (), fix it by replacing LHS with
-            RHS, if necessary.  See vect_get_smallest_scalar_type () for
-            details.  */
-         vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
-                                        &rhs_size_unit);
-         if (rhs_size_unit != lhs_size_unit)
-           {
-             number_of_vects *= rhs_size_unit;
-             number_of_vects /= lhs_size_unit;
-           }
+      if (!vectorized_defs)
+        {
+          if (i == 0)
+            {
+              number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
+              /* Number of vector stmts was calculated according to LHS in
+                 vect_schedule_slp_instance (), fix it by replacing LHS with
+                 RHS, if necessary.  See vect_get_smallest_scalar_type () for
+                 details.  */
+              vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
+                                             &rhs_size_unit);
+              if (rhs_size_unit != lhs_size_unit)
+                {
+                  number_of_vects *= rhs_size_unit;
+                  number_of_vects /= lhs_size_unit;
+                }
+            }
         }
 
       /* Allocate memory for vectorized defs.  */
@@ -3411,8 +3408,6 @@ vect_get_slp_defs (vec<tree> ops, slp_tr
                                   number_of_vects);
 
       vec_oprnds->quick_push (vec_defs);
-
-      first_iteration = false;
     }
 }
 
Index: gcc/testsuite/gcc.dg/vect/pr81633.c
===================================================================
--- gcc/testsuite/gcc.dg/vect/pr81633.c (nonexistent)
+++ gcc/testsuite/gcc.dg/vect/pr81633.c (working copy)
@@ -0,0 +1,33 @@
+/* { dg-do run } */
+
+static double identity[4][4] = {{1, 0, 0, 0},
+                                {0, 1, 0, 0},
+                                {0, 0, 1, 0},
+                                {0, 0, 0, 1}};
+static double expected[4][4] = {{1, 0, 0, 0},
+                                {0, 0, 0, 0},
+                                {0, 0, 0, 0},
+                                {0, 0, 0, 0}};
+
+static void __attribute__((noinline,noclone))
+kernel(double A[4][4])
+{
+  double tmp[4][4];
+  for (int j = 0; j < 4; j++)
+    for (int k = 0; k < 4; k++)
+      tmp[j][k] = identity[j][0] * identity[j][k];
+  for (int j = 0; j < 4; j++ )
+    for (int k = 0; k < 4; k++)
+      A[j][k] = tmp[j][k];
+}
+
+int main(void)
+{
+  double A[4][4] = {{0.0}};
+  kernel(A);
+  for ( int i = 0; i < 4; i++ )
+    for ( int j = 0; j < 4; j++ )
+      if (A[i][j] != expected[i][j])
+       __builtin_abort ();
+  return 0;
+}


2017-08-01  Richard Biener  <rguent...@suse.de>

        PR tree-optimization/71752
        PR tree-optimization/81633
        * tree-vect-slp.c (vect_get_slp_defs): Handle null operands
        in the original suggested way.

        * gcc.dg/vect/pr81633.c: New testcase.

Index: gcc/tree-vect-slp.c
===================================================================
--- gcc/tree-vect-slp.c (revision 250732)
+++ gcc/tree-vect-slp.c (working copy)
@@ -3289,19 +3289,22 @@ vect_get_slp_defs (vec<tree> ops, slp_tr
 {
   gimple *first_stmt;
   int number_of_vects = 0, i;
+  unsigned int child_index = 0;
   HOST_WIDE_INT lhs_size_unit, rhs_size_unit;
   slp_tree child = NULL;
   vec<tree> vec_defs;
   tree oprnd;
+  bool vectorized_defs;
   bool first_iteration = true;
 
   first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0];
   FOR_EACH_VEC_ELT (ops, i, oprnd)
     {
-      bool vectorized_defs = false;
-
       if (oprnd == NULL)
        {
+         /* Only vectorizable_reduction will call us with a NULL op which
+            will always match the reduction operand for which we have no
+            SLP child.  */
          vec_defs = vNULL;
          vec_defs.create (0);
          vec_oprnds->quick_push (vec_defs);
@@ -3312,9 +3315,10 @@ vect_get_slp_defs (vec<tree> ops, slp_tr
         node or we need to create them (for invariants and constants).  We
         check if the LHS of the first stmt of the next child matches OPRND.
         If it does, we found the correct child.  Otherwise, we call
-        vect_get_constant_vectors ().  */
-      for (unsigned int child_index = 0;
-          child_index < SLP_TREE_CHILDREN (slp_node).length (); child_index++)
+        vect_get_constant_vectors (), and not advance CHILD_INDEX in order
+        to check this child node for the next operand.  */
+      vectorized_defs = false;
+      if (SLP_TREE_CHILDREN (slp_node).length () > child_index)
         {
           child = SLP_TREE_CHILDREN (slp_node)[child_index];
 
@@ -3334,25 +3338,30 @@ vect_get_slp_defs (vec<tree> ops, slp_tr
                     statements.  */
                  number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
                  vectorized_defs = true;
-                 break;
+                 child_index++;
                }
            }
+         else
+           child_index++;
         }
 
-      if (!vectorized_defs && first_iteration)
-       {
-         number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
-         /* Number of vector stmts was calculated according to LHS in
-            vect_schedule_slp_instance (), fix it by replacing LHS with
-            RHS, if necessary.  See vect_get_smallest_scalar_type () for
-            details.  */
-         vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
-                                        &rhs_size_unit);
-         if (rhs_size_unit != lhs_size_unit)
-           {
-             number_of_vects *= rhs_size_unit;
-             number_of_vects /= lhs_size_unit;
-           }
+      if (!vectorized_defs)
+        {
+          if (first_iteration)
+            {
+              number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
+              /* Number of vector stmts was calculated according to LHS in
+                 vect_schedule_slp_instance (), fix it by replacing LHS with
+                 RHS, if necessary.  See vect_get_smallest_scalar_type () for
+                 details.  */
+              vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit,
+                                             &rhs_size_unit);
+              if (rhs_size_unit != lhs_size_unit)
+                {
+                  number_of_vects *= rhs_size_unit;
+                  number_of_vects /= lhs_size_unit;
+                }
+            }
         }
 
       /* Allocate memory for vectorized defs.  */
Index: gcc/testsuite/gcc.dg/vect/pr81633.c
===================================================================
--- gcc/testsuite/gcc.dg/vect/pr81633.c (nonexistent)
+++ gcc/testsuite/gcc.dg/vect/pr81633.c (working copy)
@@ -0,0 +1,33 @@
+/* { dg-do run } */
+
+static double identity[4][4] = {{1, 0, 0, 0},
+                                {0, 1, 0, 0},
+                                {0, 0, 1, 0},
+                                {0, 0, 0, 1}};
+static double expected[4][4] = {{1, 0, 0, 0},
+                                {0, 0, 0, 0},
+                                {0, 0, 0, 0},
+                                {0, 0, 0, 0}};
+
+static void __attribute__((noinline,noclone))
+kernel(double A[4][4])
+{
+  double tmp[4][4];
+  for (int j = 0; j < 4; j++)
+    for (int k = 0; k < 4; k++)
+      tmp[j][k] = identity[j][0] * identity[j][k];
+  for (int j = 0; j < 4; j++ )
+    for (int k = 0; k < 4; k++)
+      A[j][k] = tmp[j][k];
+}
+
+int main(void)
+{
+  double A[4][4] = {{0.0}};
+  kernel(A);
+  for ( int i = 0; i < 4; i++ )
+    for ( int j = 0; j < 4; j++ )
+      if (A[i][j] != expected[i][j])
+       __builtin_abort ();
+  return 0;
+}

Reply via email to