Richard Sandiford <richard.sandif...@linaro.org> writes:
> This patch makes vectorizable_reduction cope with variable-length vectors.
> We can handle the simple case of an inner loop reduction for which
> the target has native support for the epilogue operation.  For now we
> punt on other cases, but patches after the main SVE submission allow
> SLP and double reductions too.

Here's an updated version that applies on top of the recent removal
of REDUC_*_EXPR.
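
For reference (illustrative only, not part of the patch), the simple
case this handles is an inner loop reduction like:

  int
  sum_array (int *a, int n)
  {
    int sum = 0;
    for (int i = 0; i < n; ++i)
      sum += a[i];
    return sum;
  }

where the target provides a single vector-to-scalar operation (such
as IFN_REDUC_PLUS, after the REDUC_*_EXPR removal) to reduce the
vector accumulator in the loop epilogue.  Without such an epilogue
operation we now punt for variable-length vectors.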

Thanks,
Richard


2017-11-22  Richard Sandiford  <richard.sandif...@linaro.org>
            Alan Hayward  <alan.hayw...@arm.com>
            David Sherwood  <david.sherw...@arm.com>

gcc/
        * tree.h (build_index_vector): Declare.
        * tree.c (build_index_vector): New function.
        * tree-vect-loop.c (get_initial_def_for_reduction): Treat the number
        of units as polynomial, forcibly converting it to a constant if
        vectorizable_reduction has already enforced the condition.
        (get_initial_defs_for_reduction): Likewise.
        (vect_create_epilog_for_reduction): Likewise.  Use build_index_vector
        to create a {1,2,3,...} vector.
        (vectorizable_reduction): Treat the number of units as polynomial.
        Choose vectype_in based on the largest scalar element size rather
        than the smallest number of units.  Enforce the restrictions
        relied on above.
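
As a worked illustration of the vectype_in change (not part of the
patch): for fixed-length vectors, every candidate vectype_in occupies
the same number of bits, so the type with the fewest units is exactly
the type with the widest scalar elements.  E.g. on a 128-bit vector
target:

  /* Old test: smallest number of units.
       TYPE_VECTOR_SUBPARTS (V4SI) == 4 < 16 == TYPE_VECTOR_SUBPARTS (V16QI)
     New test: largest scalar element size.
       GET_MODE_SIZE (SImode) == 4 > 1 == GET_MODE_SIZE (QImode)
     Both pick V4SI, but the element size is always a compile-time
     constant, whereas the number of units becomes a poly_uint64 for
     variable-length vectors.  */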

Index: gcc/tree.h
===================================================================
--- gcc/tree.h  2017-11-22 18:02:23.618126313 +0000
+++ gcc/tree.h  2017-11-22 18:02:28.724789004 +0000
@@ -4036,6 +4036,7 @@ extern tree build_vector (tree, vec<tree
 extern tree build_vector_from_ctor (tree, vec<constructor_elt, va_gc> *);
 extern tree build_vector_from_val (tree, tree);
 extern tree build_vec_series (tree, tree, tree);
+extern tree build_index_vector (tree, poly_uint64, poly_uint64);
 extern void recompute_constructor_flags (tree);
 extern void verify_constructor_flags (tree);
 extern tree build_constructor (tree, vec<constructor_elt, va_gc> *);
Index: gcc/tree.c
===================================================================
--- gcc/tree.c  2017-11-22 18:02:23.618126313 +0000
+++ gcc/tree.c  2017-11-22 18:02:28.724789004 +0000
@@ -2027,6 +2027,37 @@ build_vec_series (tree type, tree base,
   return build2 (VEC_SERIES_EXPR, type, base, step);
 }
 
+/* Return a vector with the same number of units and number of bits
+   as VEC_TYPE, but in which the elements are a linear series of unsigned
+   integers { BASE, BASE + STEP, BASE + STEP * 2, ... }.  */
+
+tree
+build_index_vector (tree vec_type, poly_uint64 base, poly_uint64 step)
+{
+  tree index_vec_type = vec_type;
+  tree index_elt_type = TREE_TYPE (vec_type);
+  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vec_type);
+  if (!INTEGRAL_TYPE_P (index_elt_type) || !TYPE_UNSIGNED (index_elt_type))
+    {
+      index_elt_type = build_nonstandard_integer_type
+       (GET_MODE_BITSIZE (SCALAR_TYPE_MODE (index_elt_type)), true);
+      index_vec_type = build_vector_type (index_elt_type, nunits);
+    }
+
+  unsigned HOST_WIDE_INT count;
+  if (nunits.is_constant (&count))
+    {
+      auto_vec<tree, 32> v (count);
+      for (unsigned int i = 0; i < count; ++i)
+       v.quick_push (build_int_cstu (index_elt_type, base + i * step));
+      return build_vector (index_vec_type, v);
+    }
+
+  return build_vec_series (index_vec_type,
+                          build_int_cstu (index_elt_type, base),
+                          build_int_cstu (index_elt_type, step));
+}
+
 /* Something has messed with the elements of CONSTRUCTOR C after it was built;
    calculate TREE_CONSTANT and TREE_SIDE_EFFECTS.  */
 
Index: gcc/tree-vect-loop.c
===================================================================
--- gcc/tree-vect-loop.c        2017-11-22 18:02:23.618126313 +0000
+++ gcc/tree-vect-loop.c        2017-11-22 18:02:28.722773349 +0000
@@ -3997,11 +3997,10 @@ get_initial_def_for_reduction (gimple *s
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   tree scalar_type = TREE_TYPE (init_val);
   tree vectype = get_vectype_for_scalar_type (scalar_type);
-  int nunits;
+  poly_uint64 nunits;
   enum tree_code code = gimple_assign_rhs_code (stmt);
   tree def_for_init;
   tree init_def;
-  int i;
   bool nested_in_vect_loop = false;
   REAL_VALUE_TYPE real_init_val = dconst0;
   int int_init_val = 0;
@@ -4082,9 +4081,13 @@ get_initial_def_for_reduction (gimple *s
        else
          {
            /* Option2: the first element is INIT_VAL.  */
-           auto_vec<tree, 32> elts (nunits);
+
+           /* Enforced by vectorizable_reduction (which disallows double
+              reductions with variable-length vectors).  */
+           unsigned int count = nunits.to_constant ();
+           auto_vec<tree, 32> elts (count);
            elts.quick_push (init_val);
-           for (i = 1; i < nunits; ++i)
+           for (unsigned int i = 1; i < count; ++i)
              elts.quick_push (def_for_init);
            init_def = gimple_build_vector (&stmts, vectype, elts);
          }
@@ -4144,6 +4147,8 @@ get_initial_defs_for_reduction (slp_tree
 
   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
   scalar_type = TREE_TYPE (vector_type);
+  /* vectorizable_reduction has already rejected SLP reductions on
+     variable-length vectors.  */
   nunits = TYPE_VECTOR_SUBPARTS (vector_type);
 
   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
@@ -4510,8 +4515,7 @@ vect_create_epilog_for_reduction (vec<tr
   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
     {
       tree indx_before_incr, indx_after_incr;
-      int nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
-      int k;
+      poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
 
       gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
       gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
@@ -4527,10 +4531,7 @@ vect_create_epilog_for_reduction (vec<tr
         vector size (STEP).  */
 
       /* Create a {1,2,3,...} vector.  */
-      auto_vec<tree, 32> vtemp (nunits_out);
-      for (k = 0; k < nunits_out; ++k)
-       vtemp.quick_push (build_int_cst (cr_index_scalar_type, k + 1));
-      tree series_vect = build_vector (cr_index_vector_type, vtemp);
+      tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
 
       /* Create a vector of the step value.  */
       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
@@ -4908,8 +4909,11 @@ vect_create_epilog_for_reduction (vec<tr
       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
-      unsigned HOST_WIDE_INT v_size
-       = el_size * TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
+      poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
+      /* Enforced by vectorizable_reduction, which ensures we have target
+        support before allowing a conditional reduction on variable-length
+        vectors.  */
+      unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
       tree idx_val = NULL_TREE, val = NULL_TREE;
       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
        {
@@ -5026,6 +5030,9 @@ vect_create_epilog_for_reduction (vec<tr
     {
       bool reduce_with_shift = have_whole_vector_shift (mode);
       int element_bitsize = tree_to_uhwi (bitsize);
+      /* Enforced by vectorizable_reduction, which disallows SLP reductions
+        for variable-length vectors and also requires direct target support
+        for loop reductions.  */
       int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
       tree vec_temp;
 
@@ -5706,10 +5713,10 @@ vectorizable_reduction (gimple *stmt, gi
          if (k == 1
              && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
            continue;
-         tem = get_vectype_for_scalar_type (TREE_TYPE (op));
-         if (! vectype_in
-             || TYPE_VECTOR_SUBPARTS (tem) < TYPE_VECTOR_SUBPARTS (vectype_in))
-           vectype_in = tem;
+         if (!vectype_in
+             || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
+                 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
+           vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
          break;
        }
       gcc_assert (vectype_in);
@@ -5875,7 +5882,8 @@ vectorizable_reduction (gimple *stmt, gi
          /* To properly compute ncopies we are interested in the widest
             input type in case we're looking at a widening accumulation.  */
          if (!vectype_in
-             || TYPE_VECTOR_SUBPARTS (vectype_in) > TYPE_VECTOR_SUBPARTS (tem))
+             || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
+                 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
            vectype_in = tem;
        }
 
@@ -6022,6 +6030,7 @@ vectorizable_reduction (gimple *stmt, gi
   gcc_assert (ncopies >= 1);
 
   vec_mode = TYPE_MODE (vectype_in);
+  poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
 
   if (code == COND_EXPR)
     {
@@ -6203,14 +6212,23 @@ vectorizable_reduction (gimple *stmt, gi
       int scalar_precision
        = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
       cr_index_scalar_type = make_unsigned_type (scalar_precision);
-      cr_index_vector_type = build_vector_type
-       (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype_out));
+      cr_index_vector_type = build_vector_type (cr_index_scalar_type,
+                                               nunits_out);
 
       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
                                          OPTIMIZE_FOR_SPEED))
        reduc_fn = IFN_REDUC_MAX;
     }
 
+  if (reduc_fn == IFN_LAST && !nunits_out.is_constant ())
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "missing target support for reduction on"
+                        " variable-length vectors.\n");
+      return false;
+    }
+
   if ((double_reduc
        || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != TREE_CODE_REDUCTION)
       && ncopies > 1)
@@ -6222,6 +6240,27 @@ vectorizable_reduction (gimple *stmt, gi
       return false;
     }
 
+  if (double_reduc && !nunits_out.is_constant ())
+    {
+      /* The current double-reduction code creates the initial value
+        element-by-element.  */
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "double reduction not supported for variable-length"
+                        " vectors.\n");
+      return false;
+    }
+
+  if (slp_node && !nunits_out.is_constant ())
+    {
+      /* The current SLP code creates the initial value element-by-element.  */
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "SLP reduction not supported for variable-length"
+                        " vectors.\n");
+      return false;
+    }
+
   /* In case of widening multiplication by a constant, we update the type
      of the constant to be the type of the other operand.  We check that the
      constant fits the type in the pattern recognition pass.  */
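
For completeness, a sketch of what the new helper produces
(illustrative values, not part of the patch):

  /* With a fixed-length vector type of 4 units:
       build_index_vector (type, 1, 1) --> VECTOR_CST { 1, 2, 3, 4 }
     With a variable-length type (nunits not known at compile time):
       build_index_vector (type, 1, 1) --> VEC_SERIES_EXPR <1, 1>,
     i.e. the series { 1, 2, 3, ... } for however many units the
     vector has at run time.  Non-integer or signed element types are
     first replaced by an unsigned integer type of the same width.  */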
