On Wed, May 6, 2015 at 5:37 PM, Michael Matz <m...@suse.de> wrote:
> Hi,
>
> I've been sitting on this for quite some time already and always missed
> stage 1.  This implements support for vectorizing strided stores with an
> unknown but loop-invariant stride, like:
>
> void
> sumit (float * __restrict dest,
>        float * __restrict src, float * __restrict src2,
>        int stride, int n)
> {
>   int i;
>   for (i = 0; i < n; i++)
>     dest[i*stride] = src[i] + src2[i];
> }
>
> I use the same scheme as for strided loads, i.e. expanding such a store
> into N separate scalar stores (so alignment can also be ignored for
> these).
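> In pseudo-C, with VF the number of vector elements (a sketch only;
> variable names are illustrative, not the generated SSA names), the loop
> above becomes:
>
>   for (j = 0; j < n; j += VF)
>     {
>       vectemp = ...;                    /* vectorized src[j] + src2[j] */
>       dest[j*stride]        = vectemp[0];
>       dest[(j+1)*stride]    = vectemp[1];
>       ...
>       dest[(j+VF-1)*stride] = vectemp[VF-1];
>     }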
>
> This doesn't yet fix PR65962, because that one uses a _constant_ step.
> That makes us try a grouped access, which isn't supported for stores when
> there's a gap (which there is).  Unfortunately vect_analyze_group_access
> actively changes some vect info even before detecting that it's not a
> grouped access, so some rollback would have to be implemented; I decided
> to defer this to a follow-up.
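>
> (For illustration only, and not necessarily the exact PR65962 testcase:
> a constant-step store such as
>
>   for (i = 0; i < n; i++)
>     dest[2*i] = src[i] + src2[i];
>
> has a compile-time step of twice the element size, so
> vect_analyze_data_ref_access falls through to vect_analyze_group_access,
> which sees a grouped access with a one-element gap; grouped stores with
> gaps aren't supported.)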
>
> Regstrapped on x86-64-linux, no regressions (I had to adjust two
> Fortran testcases where one more loop is now vectorized).  Okay for trunk?

Ok.

Thanks,
Richard.

>
> Ciao,
> Michael.
>         * tree-vectorizer.h (struct _stmt_vec_info): Rename stride_load_p
>         to strided_p.
>         (STMT_VINFO_STRIDE_LOAD_P): Rename to ...
>         (STMT_VINFO_STRIDED_P): ... this.
>         * tree-vect-data-refs.c (vect_compute_data_ref_alignment): Adjust.
>         (vect_verify_datarefs_alignment): Likewise.
>         (vect_enhance_data_refs_alignment): Likewise.
>         (vect_analyze_data_ref_access): Likewise.
>         (vect_analyze_data_refs): Accept strided stores.
>         * tree-vect-stmts.c (vect_model_store_cost): Count strided stores.
>         (vect_model_load_cost): Adjust for macro rename.
>         (vectorizable_mask_load_store): Likewise.
>         (vectorizable_load): Likewise.
>         (vectorizable_store): Open code strided stores.
>
> testsuite/
>         * gcc.dg/vect/vect-strided-store.c: New test.
>         * gfortran.dg/vect/fast-math-pr37021.f90: Adjust.
>         * gfortran.dg/vect/fast-math-rnflow-trs2a2.f90: Adjust.
>
> diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
> index 96afc7a..6d8f17e 100644
> --- a/gcc/tree-vect-data-refs.c
> +++ b/gcc/tree-vect-data-refs.c
> @@ -665,7 +665,7 @@ vect_compute_data_ref_alignment (struct data_reference *dr)
>
>    /* Strided loads perform only component accesses, misalignment information
>       is irrelevant for them.  */
> -  if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
> +  if (STMT_VINFO_STRIDED_P (stmt_info))
>      return true;
>
>    misalign = DR_INIT (dr);
> @@ -942,7 +942,7 @@ vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
>
>        /* Strided loads perform only component accesses, alignment is
>          irrelevant for them.  */
> -      if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
> +      if (STMT_VINFO_STRIDED_P (stmt_info))
>         continue;
>
>        supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
> @@ -1409,7 +1409,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
>
>        /* Strided loads perform only component accesses, alignment is
>          irrelevant for them.  */
> -      if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
> +      if (STMT_VINFO_STRIDED_P (stmt_info))
>         continue;
>
>        supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
> @@ -1701,7 +1701,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
>
>           /* Strided loads perform only component accesses, alignment is
>              irrelevant for them.  */
> -         if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
> +         if (STMT_VINFO_STRIDED_P (stmt_info))
>             continue;
>
>           save_misalignment = DR_MISALIGNMENT (dr);
> @@ -1821,7 +1821,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
>
>           /* Strided loads perform only component accesses, alignment is
>              irrelevant for them.  */
> -         if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
> +         if (STMT_VINFO_STRIDED_P (stmt_info))
>             continue;
>
>           supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
> @@ -2368,7 +2368,7 @@ vect_analyze_data_ref_access (struct data_reference *dr)
>
>    /* Assume this is a DR handled by non-constant strided load case.  */
>    if (TREE_CODE (step) != INTEGER_CST)
> -    return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
> +    return STMT_VINFO_STRIDED_P (stmt_info);
>
>    /* Not consecutive access - check if it's a part of interleaving group.  */
>    return vect_analyze_group_access (dr);
> @@ -3764,8 +3764,7 @@ again:
>        else if (loop_vinfo
>                && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
>         {
> -         if (nested_in_vect_loop_p (loop, stmt)
> -             || !DR_IS_READ (dr))
> +         if (nested_in_vect_loop_p (loop, stmt))
>             {
>               if (dump_enabled_p ())
>                 {
> @@ -3777,7 +3776,7 @@ again:
>                 }
>               return false;
>             }
> -         STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
> +         STMT_VINFO_STRIDED_P (stmt_info) = true;
>         }
>      }
>
> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> index 2ce6d4d..d268eb0 100644
> --- a/gcc/tree-vect-stmts.c
> +++ b/gcc/tree-vect-stmts.c
> @@ -1014,7 +1014,19 @@ vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
>      }
>
>    /* Costs of the stores.  */
> -  vect_get_store_cost (first_dr, ncopies, &inside_cost, body_cost_vec);
> +  if (STMT_VINFO_STRIDED_P (stmt_info))
> +    {
> +      /* N scalar stores plus extracting the elements.  */
> +      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
> +      inside_cost += record_stmt_cost (body_cost_vec,
> +                                      ncopies * TYPE_VECTOR_SUBPARTS (vectype),
> +                                      scalar_store, stmt_info, 0, vect_body);
> +      inside_cost += record_stmt_cost (body_cost_vec,
> +                                      ncopies * TYPE_VECTOR_SUBPARTS (vectype),
> +                                      vec_to_scalar, stmt_info, 0, vect_body);
> +    }
> +  else
> +    vect_get_store_cost (first_dr, ncopies, &inside_cost, body_cost_vec);
>
>    if (dump_enabled_p ())
>      dump_printf_loc (MSG_NOTE, vect_location,
> @@ -1127,7 +1139,7 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies,
>      }
>
>    /* The loads themselves.  */
> -  if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
> +  if (STMT_VINFO_STRIDED_P (stmt_info))
>      {
>        /* N scalar loads plus gathering them into a vector.  */
>        tree vectype = STMT_VINFO_VECTYPE (stmt_info);
> @@ -1820,7 +1832,7 @@ vectorizable_mask_load_store (gimple stmt, gimple_stmt_iterator *gsi,
>    if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
>      return false;
>
> -  if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
> +  if (STMT_VINFO_STRIDED_P (stmt_info))
>      return false;
>
>    if (STMT_VINFO_GATHER_P (stmt_info))
> @@ -5013,7 +5025,7 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
>    tree dataref_ptr = NULL_TREE;
>    tree dataref_offset = NULL_TREE;
>    gimple ptr_incr = NULL;
> -  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
> +  unsigned int nunits = TYPE_VECTOR_SUBPARTS (vectype);
>    int ncopies;
>    int j;
>    gimple next_stmt, first_stmt = NULL;
> @@ -5100,38 +5112,42 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
>    if (!STMT_VINFO_DATA_REF (stmt_info))
>      return false;
>
> -  negative =
> -    tree_int_cst_compare (loop && nested_in_vect_loop_p (loop, stmt)
> -                         ? STMT_VINFO_DR_STEP (stmt_info) : DR_STEP (dr),
> -                         size_zero_node) < 0;
> -  if (negative && ncopies > 1)
> -    {
> -      if (dump_enabled_p ())
> -        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -                        "multiple types with negative step.\n");
> -      return false;
> -    }
> -
> -  if (negative)
> +  if (STMT_VINFO_STRIDED_P (stmt_info))
> +    ;
> +  else
>      {
> -      gcc_assert (!grouped_store);
> -      alignment_support_scheme = vect_supportable_dr_alignment (dr, false);
> -      if (alignment_support_scheme != dr_aligned
> -         && alignment_support_scheme != dr_unaligned_supported)
> +      negative =
> +         tree_int_cst_compare (loop && nested_in_vect_loop_p (loop, stmt)
> +                               ? STMT_VINFO_DR_STEP (stmt_info) : DR_STEP (dr),
> +                               size_zero_node) < 0;
> +      if (negative && ncopies > 1)
>         {
>           if (dump_enabled_p ())
>             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -                            "negative step but alignment required.\n");
> +                            "multiple types with negative step.\n");
>           return false;
>         }
> -      if (dt != vect_constant_def
> -         && dt != vect_external_def
> -         && !perm_mask_for_reverse (vectype))
> +      if (negative)
>         {
> -         if (dump_enabled_p ())
> -           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -                            "negative step and reversing not supported.\n");
> -         return false;
> +         gcc_assert (!grouped_store);
> +         alignment_support_scheme = vect_supportable_dr_alignment (dr, false);
> +         if (alignment_support_scheme != dr_aligned
> +             && alignment_support_scheme != dr_unaligned_supported)
> +           {
> +             if (dump_enabled_p ())
> +               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                                "negative step but alignment required.\n");
> +             return false;
> +           }
> +         if (dt != vect_constant_def
> +             && dt != vect_external_def
> +             && !perm_mask_for_reverse (vectype))
> +           {
> +             if (dump_enabled_p ())
> +               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                                "negative step and reversing not supported.\n");
> +             return false;
> +           }
>         }
>      }
>
> @@ -5230,6 +5246,113 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
>      dump_printf_loc (MSG_NOTE, vect_location,
>                       "transform store. ncopies = %d\n", ncopies);
>
> +  if (STMT_VINFO_STRIDED_P (stmt_info))
> +    {
> +      gimple_stmt_iterator incr_gsi;
> +      bool insert_after;
> +      gimple incr;
> +      tree offvar;
> +      tree ivstep;
> +      tree running_off;
> +      gimple_seq stmts = NULL;
> +      tree stride_base, stride_step, alias_off;
> +      tree vec_oprnd;
> +
> +      gcc_assert (!nested_in_vect_loop_p (loop, stmt));
> +
> +      stride_base
> +       = fold_build_pointer_plus
> +           (unshare_expr (DR_BASE_ADDRESS (dr)),
> +            size_binop (PLUS_EXPR,
> +                        convert_to_ptrofftype (unshare_expr (DR_OFFSET (dr))),
> +                        convert_to_ptrofftype (DR_INIT (dr))));
> +      stride_step = fold_convert (sizetype, unshare_expr (DR_STEP (dr)));
> +
> +      /* For a store with loop-invariant (but other than power-of-2)
> +         stride (i.e. not a grouped access) like so:
> +
> +          for (i = 0; i < n; i += stride)
> +            array[i] = ...;
> +
> +        we generate a new induction variable and new stores from
> +        the components of the (vectorized) rhs:
> +
> +          for (j = 0; ; j += VF*stride)
> +            vectemp = ...;
> +            tmp1 = vectemp[0];
> +            array[j] = tmp1;
> +            tmp2 = vectemp[1];
> +            array[j + stride] = tmp2;
> +            ...
> +         */
> +
> +      ivstep = stride_step;
> +      ivstep = fold_build2 (MULT_EXPR, TREE_TYPE (ivstep), ivstep,
> +                           build_int_cst (TREE_TYPE (ivstep),
> +                                          ncopies * nunits));
> +
> +      standard_iv_increment_position (loop, &incr_gsi, &insert_after);
> +
> +      create_iv (stride_base, ivstep, NULL,
> +                loop, &incr_gsi, insert_after,
> +                &offvar, NULL);
> +      incr = gsi_stmt (incr_gsi);
> +      set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
> +
> +      stride_step = force_gimple_operand (stride_step, &stmts, true, NULL_TREE);
> +      if (stmts)
> +       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
> +
> +      prev_stmt_info = NULL;
> +      running_off = offvar;
> +      alias_off = build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0);
> +      for (j = 0; j < ncopies; j++)
> +       {
> +         /* We've set op and dt above, from gimple_assign_rhs1(stmt),
> +            and first_stmt == stmt.  */
> +         if (j == 0)
> +           vec_oprnd = vect_get_vec_def_for_operand (op, first_stmt, NULL);
> +         else
> +           vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, vec_oprnd);
> +
> +         for (i = 0; i < nunits; i++)
> +           {
> +             tree newref, newoff;
> +             gimple incr, assign;
> +             tree size = TYPE_SIZE (elem_type);
> +             /* Extract the i'th component.  */
> +             tree pos = fold_build2 (MULT_EXPR, bitsizetype, bitsize_int (i),
> +                                     size);
> +             tree elem = fold_build3 (BIT_FIELD_REF, elem_type, vec_oprnd,
> +                                      size, pos);
> +
> +             elem = force_gimple_operand_gsi (gsi, elem, true,
> +                                              NULL_TREE, true,
> +                                              GSI_SAME_STMT);
> +
> +             newref = build2 (MEM_REF, TREE_TYPE (vectype),
> +                              running_off, alias_off);
> +
> +             /* And store it to *running_off.  */
> +             assign = gimple_build_assign (newref, elem);
> +             vect_finish_stmt_generation (stmt, assign, gsi);
> +
> +             newoff = copy_ssa_name (running_off, NULL);
> +             incr = gimple_build_assign (newoff, POINTER_PLUS_EXPR,
> +                                         running_off, stride_step);
> +             vect_finish_stmt_generation (stmt, incr, gsi);
> +
> +             running_off = newoff;
> +             if (j == 0 && i == 0)
> +               STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = assign;
> +             else
> +               STMT_VINFO_RELATED_STMT (prev_stmt_info) = assign;
> +             prev_stmt_info = vinfo_for_stmt (assign);
> +           }
> +       }
> +      return true;
> +    }
> +
>    dr_chain.create (group_size);
>    oprnds.create (group_size);
>
> @@ -5846,7 +5969,7 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
>           return false;
>         }
>      }
> -  else if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
> +  else if (STMT_VINFO_STRIDED_P (stmt_info))
>      ;
>    else
>      {
> @@ -6079,7 +6202,7 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
>         }
>        return true;
>      }
> -  else if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
> +  else if (STMT_VINFO_STRIDED_P (stmt_info))
>      {
>        gimple_stmt_iterator incr_gsi;
>        bool insert_after;
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 0796cc1..d231626 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -643,7 +643,9 @@ typedef struct _stmt_vec_info {
>
>    /* For loads only, true if this is a gather load.  */
>    bool gather_p;
> -  bool stride_load_p;
> +
> +  /* True if this is an access with loop-invariant stride.  */
> +  bool strided_p;
>
>    /* For both loads and stores.  */
>    bool simd_lane_access_p;
> @@ -661,7 +663,7 @@ typedef struct _stmt_vec_info {
>  #define STMT_VINFO_VECTORIZABLE(S)         (S)->vectorizable
>  #define STMT_VINFO_DATA_REF(S)             (S)->data_ref_info
>  #define STMT_VINFO_GATHER_P(S)            (S)->gather_p
> -#define STMT_VINFO_STRIDE_LOAD_P(S)       (S)->stride_load_p
> +#define STMT_VINFO_STRIDED_P(S)                   (S)->strided_p
>  #define STMT_VINFO_SIMD_LANE_ACCESS_P(S)   (S)->simd_lane_access_p
>
>  #define STMT_VINFO_DR_BASE_ADDRESS(S)      (S)->dr_base_address
> diff --git a/gcc/testsuite/gfortran.dg/vect/fast-math-pr37021.f90 b/gcc/testsuite/gfortran.dg/vect/fast-math-pr37021.f90
> index b17ac9c..d5f5d40 100644
> --- a/gcc/testsuite/gfortran.dg/vect/fast-math-pr37021.f90
> +++ b/gcc/testsuite/gfortran.dg/vect/fast-math-pr37021.f90
> @@ -14,5 +14,5 @@ subroutine to_product_of(self,a,b,a1,a2)
>    end do
>  end subroutine
>
> -! { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } }
> +! { dg-final { scan-tree-dump "vectorized 2 loops" "vect" } }
>  ! { dg-final { cleanup-tree-dump "vect" } }
> diff --git a/gcc/testsuite/gfortran.dg/vect/fast-math-rnflow-trs2a2.f90 b/gcc/testsuite/gfortran.dg/vect/fast-math-rnflow-trs2a2.f90
> index 1d13cea..625be83 100644
> --- a/gcc/testsuite/gfortran.dg/vect/fast-math-rnflow-trs2a2.f90
> +++ b/gcc/testsuite/gfortran.dg/vect/fast-math-rnflow-trs2a2.f90
> @@ -29,5 +29,5 @@
>        return
>        end function trs2a2
>
> -! { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect"  } }
> +! { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect"  } }
>  ! { dg-final { cleanup-tree-dump "vect" } }
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-strided-store.c b/gcc/testsuite/gcc.dg/vect/vect-strided-store.c
> new file mode 100644
> index 0000000..32bcff9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-strided-store.c
> @@ -0,0 +1,30 @@
> +/* { dg-require-effective-target vect_float } */
> +
> +#include <stdarg.h>
> +#include "tree-vect.h"
> +
> +void __attribute__((noinline))
> +sumit (float * __restrict dest,
> +       float * __restrict src, float * __restrict src2,
> +       int stride, int n)
> +{
> +  int i;
> +  for (i = 0; i < n; i++)
> +    dest[i*stride] = src[i] + src2[i];
> +}
> +
> +int main()
> +{
> +  int i;
> +  float src[] = {1, 2, 3, 4, 5, 6, 7, 8};
> +  float dest[8];
> +  check_vect ();
> +  sumit (dest, src, src, 1, 8);
> +  for (i = 0; i < 8; i++)
> +    if (2*src[i] != dest[i])
> +      abort ();
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
> +/* { dg-final { cleanup-tree-dump "vect" } } */
