The following patch fixes loop vectorization of SLP reduction chains
that involve patterns.  The issue here is that pattern recog runs
after reduction detection and this screws things up.  Re-ordering
this created interesting side-effects so I didn't explore this
further (for now) but instead fix the detected reduction chains
after pattern recog.  This of course just reveals multiple places
where things go wrong with this setting, fixed with the following
patch which finally vectorizes one of the hottest loop nests
in a soon-to-be-popular x264 encoder/decoder.

Bootstrapped and tested on x86_64-unknown-linux-gnu, applied to trunk.

Richard.

2015-05-28  Richard Biener  <rguent...@suse.de>

        * tree-vect-loop.c (vect_fixup_reduc_chain): New function.
        (vect_fixup_scalar_cycles_with_patterns): Likewise.
        (vect_analyze_loop_2): Call vect_fixup_scalar_cycles_with_patterns
        after pattern recog.
        (vect_create_epilog_for_reduction): Properly handle reductions
        with patterns.
        (vectorizable_reduction): Likewise.
        * tree-vect-slp.c (vect_analyze_slp_instance): Properly mark
        reduction chains.
        (vect_get_constant_vectors): Create the correct number of
        initial values for reductions.
        (vect_schedule_slp_instance): Handle reduction chains that are
        type changing properly.
        * tree-vect-stmts.c (vect_analyze_stmt): Adjust.

        * gcc.dg/vect/slp-reduc-sad.c: New testcase.

Index: gcc/tree-vect-loop.c
===================================================================
--- gcc/tree-vect-loop.c        (revision 223814)
+++ gcc/tree-vect-loop.c        (working copy)
@@ -828,6 +828,45 @@ vect_analyze_scalar_cycles (loop_vec_inf
     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 }
 
+/* Transfer group and reduction information from STMT to its pattern stmt.  */
+
+static void
+vect_fixup_reduc_chain (gimple stmt)
+{
+  gimple firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
+  gimple stmtp;
+  gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
+             && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
+  GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
+  do
+    {
+      stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
+      GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
+      stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
+      if (stmt)
+       GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
+         = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
+    }
+  while (stmt);
+  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
+}
+
+/* Fixup scalar cycles that now have their stmts detected as patterns.  */
+
+static void
+vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
+{
+  gimple first;
+  unsigned i;
+
+  FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
+    if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
+      {
+       vect_fixup_reduc_chain (first);
+       LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
+         = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
+      }
+}
 
 /* Function vect_get_loop_niters.
 
@@ -1708,6 +1747,8 @@ vect_analyze_loop_2 (loop_vec_info loop_
 
   vect_pattern_recog (loop_vinfo, NULL);
 
+  vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
+
   /* Analyze the access patterns of the data-refs in the loop (consecutive,
      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
 
@@ -4573,8 +4614,12 @@ vect_finalize_reduction:
      exit phi node.  */
   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
     {
-      scalar_dest = gimple_assign_lhs (
-                       SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
+      gimple dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
+      /* Handle reduction patterns.  */
+      if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
+       dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
+
+      scalar_dest = gimple_assign_lhs (dest_stmt);
       group_size = 1;
     }
 
@@ -4875,12 +4920,17 @@ vectorizable_reduction (gimple stmt, gim
   auto_vec<gimple> phis;
   int vec_num;
   tree def0, def1, tem, op0, op1 = NULL_TREE;
+  bool first_p = true;
 
   /* In case of reduction chain we switch to the first stmt in the chain, but
      we don't update STMT_INFO, since only the last stmt is marked as reduction
      and has reduction properties.  */
-  if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
-    stmt = GROUP_FIRST_ELEMENT (stmt_info);
+  if (GROUP_FIRST_ELEMENT (stmt_info)
+      && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
+    {
+      stmt = GROUP_FIRST_ELEMENT (stmt_info);
+      first_p = false;
+    }
 
   if (nested_in_vect_loop_p (loop, stmt))
     {
@@ -4903,8 +4953,8 @@ vectorizable_reduction (gimple stmt, gim
     return false;
 
   /* Make sure it was already recognized as a reduction computation.  */
-  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
-      && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
+  if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
+      && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
     return false;
 
   /* 2. Has this been recognized as a reduction pattern?
@@ -4914,7 +4964,7 @@ vectorizable_reduction (gimple stmt, gim
      the STMT_VINFO_RELATED_STMT field records the last stmt in
      the original sequence that constitutes the pattern.  */
 
-  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
+  orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
   if (orig_stmt)
     {
       orig_stmt_info = vinfo_for_stmt (orig_stmt);
@@ -5040,20 +5090,16 @@ vectorizable_reduction (gimple stmt, gim
       return false;
     }
 
+  gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
+                                        !nested_cycle, &dummy);
   if (orig_stmt)
-    gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
-                                                       reduc_def_stmt,
-                                                       !nested_cycle,
-                                                       &dummy));
+    gcc_assert (tmp == orig_stmt
+               || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
   else
-    {
-      gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
-                                             !nested_cycle, &dummy);
-      /* We changed STMT to be the first stmt in reduction chain, hence we
-         check that in this case the first element in the chain is STMT.  */
-      gcc_assert (stmt == tmp
-                  || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
-    }
+    /* We changed STMT to be the first stmt in reduction chain, hence we
+       check that in this case the first element in the chain is STMT.  */
+    gcc_assert (stmt == tmp
+               || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
 
   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
     return false;
@@ -5267,8 +5313,9 @@ vectorizable_reduction (gimple stmt, gim
 
   if (!vec_stmt) /* transformation not required.  */
     {
-      if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies,
-                                     reduc_index))
+      if (first_p
+         && !vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies,
+                                        reduc_index))
         return false;
       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
       return true;
@@ -5324,11 +5371,7 @@ vectorizable_reduction (gimple stmt, gim
   prev_stmt_info = NULL;
   prev_phi_info = NULL;
   if (slp_node)
-    {
-      vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
-      gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out) 
-                  == TYPE_VECTOR_SUBPARTS (vectype_in));
-    }
+    vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
   else
     {
       vec_num = 1;
Index: gcc/tree-vect-slp.c
===================================================================
--- gcc/tree-vect-slp.c (revision 223814)
+++ gcc/tree-vect-slp.c (working copy)
@@ -1793,6 +1793,11 @@ vect_analyze_slp_instance (loop_vec_info
             scalar_stmts.safe_push (next);
           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
         }
+      /* Mark the first element of the reduction chain as reduction to properly
+        transform the node.  In the reduction analysis phase only the last
+        element of the chain is marked as reduction.  */
+      if (!STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
+       STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_reduction_def;
     }
   else
     {
@@ -2738,7 +2743,7 @@ vect_get_constant_vectors (tree op, slp_
      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
      {s5, s6, s7, s8}.  */
 
-  number_of_copies = least_common_multiple (nunits, group_size) / group_size;
+  number_of_copies = nunits * number_of_vectors / group_size;
 
   number_of_places_left_in_vector = nunits;
   elts = XALLOCAVEC (tree, nunits);
@@ -3383,8 +3388,14 @@ vect_schedule_slp_instance (slp_tree nod
      for the scalar stmts in each node of the SLP tree.  Number of vector
      elements in one vector iteration is the number of scalar elements in
      one scalar iteration (GROUP_SIZE) multiplied by VF divided by vector
-     size.  */
-  vec_stmts_size = (vectorization_factor * group_size) / nunits;
+     size.
+     Unless this is a SLP reduction in which case the number of vector
+     stmts is equal to the number of vector stmts of the children.  */
+  if (GROUP_FIRST_ELEMENT (stmt_info)
+      && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
+    vec_stmts_size = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[0]);
+  else
+    vec_stmts_size = (vectorization_factor * group_size) / nunits;
 
   if (!SLP_TREE_VEC_STMTS (node).exists ())
     {
Index: gcc/tree-vect-stmts.c
===================================================================
--- gcc/tree-vect-stmts.c       (revision 223814)
+++ gcc/tree-vect-stmts.c       (working copy)
@@ -7310,9 +7310,11 @@ vect_analyze_stmt (gimple stmt, bool *ne
 
       case vect_reduction_def:
       case vect_nested_cycle:
-         gcc_assert (!bb_vinfo && (relevance == vect_used_in_outer
-                     || relevance == vect_used_in_outer_by_reduction
-                     || relevance == vect_unused_in_scope));
+         gcc_assert (!bb_vinfo
+                    && (relevance == vect_used_in_outer
+                        || relevance == vect_used_in_outer_by_reduction
+                        || relevance == vect_used_by_reduction
+                        || relevance == vect_unused_in_scope));
          break;
 
       case vect_induction_def:
Index: gcc/testsuite/gcc.dg/vect/slp-reduc-sad.c
===================================================================
--- gcc/testsuite/gcc.dg/vect/slp-reduc-sad.c   (revision 0)
+++ gcc/testsuite/gcc.dg/vect/slp-reduc-sad.c   (working copy)
@@ -0,0 +1,64 @@
+/* { dg-require-effective-target vect_usad_char } */
+
+#include "tree-vect.h"
+
+typedef unsigned int uint32_t;
+typedef unsigned short uint16_t;
+typedef unsigned char uint8_t;
+
+extern int abs (int);
+extern void abort (void);
+
+int __attribute__((noinline,noclone))
+foo (uint8_t *pix1, uint8_t *pix2, int i_stride_pix2)
+{
+  int i_sum = 0;
+  for( int y = 0; y < 16; y++ )
+    {
+      i_sum += abs ( pix1[0] - pix2[0] );
+      i_sum += abs ( pix1[1] - pix2[1] );
+      i_sum += abs ( pix1[2] - pix2[2] );
+      i_sum += abs ( pix1[3] - pix2[3] );
+      i_sum += abs ( pix1[4] - pix2[4] );
+      i_sum += abs ( pix1[5] - pix2[5] );
+      i_sum += abs ( pix1[6] - pix2[6] );
+      i_sum += abs ( pix1[7] - pix2[7] );
+      i_sum += abs ( pix1[8] - pix2[8] );
+      i_sum += abs ( pix1[9] - pix2[9] );
+      i_sum += abs ( pix1[10] - pix2[10] );
+      i_sum += abs ( pix1[11] - pix2[11] );
+      i_sum += abs ( pix1[12] - pix2[12] );
+      i_sum += abs ( pix1[13] - pix2[13] );
+      i_sum += abs ( pix1[14] - pix2[14] );
+      i_sum += abs ( pix1[15] - pix2[15] );
+      pix1 += 16;
+      pix2 += i_stride_pix2;
+    }
+  return i_sum; 
+}
+
+int
+main ()
+{
+  check_vect ();
+
+  uint8_t X[16*16];
+  uint8_t Y[16*16];
+
+  for (int i = 0; i < 16*16; ++i)
+    {
+      X[i] = i;
+      Y[i] = 16*16 - i;
+      __asm__ volatile ("");
+    }
+
+  if (foo (X, Y, 16) != 32512)
+    abort ();
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_sad_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */

Reply via email to