On Wed, May 6, 2015 at 5:37 PM, Michael Matz <m...@suse.de> wrote:
> Hi,
>
> I've been sitting on this for quite some time already and always missed
> stage 1.  This implements support for vectorizing strided stores with an
> unknown but loop-invariant stride, like:
>
>   sumit (float * __restrict dest,
>          float * __restrict src, float * __restrict src2,
>          int stride, int n)
>   {
>     int i;
>     for (i = 0; i < n; i++)
>       dest[i*stride] = src[i] + src2[i];
>   }
>
> I use the same scheme as for strided loads, i.e. expanding such a store
> into N separate scalar stores (so alignment can also be ignored for
> them).
>
> This doesn't yet fix PR65962, because that one uses a _constant_ step.
> That makes us try a grouped access, which isn't supported for stores when
> there's a gap (which there is).  Unfortunately vect_analyze_group_access
> actively changes some vect info even before detecting that it's not a
> grouped access, so some rollback must be implemented; I decided to defer
> that to a follow-up.
>
> Regstrapped on x86-64-linux, no regressions (I had to adjust two Fortran
> testcases where one more loop is vectorized now).  Okay for trunk?
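
To make the scheme concrete (my reading of the patch; V4SF vectors with
VF = 4 are picked only for this sketch), the vectorized loop for sumit
is conceptually equivalent to:

  /* Sketch only: one vector add, then N = 4 scalar stores through a
     strided pointer, which is why misalignment of dest never matters.  */
  for (j = 0; j + 4 <= n; j += 4)
    {
      float tmp[4];  /* stands in for the vector register */
      tmp[0] = src[j+0] + src2[j+0];
      tmp[1] = src[j+1] + src2[j+1];
      tmp[2] = src[j+2] + src2[j+2];
      tmp[3] = src[j+3] + src2[j+3];
      dest[(j+0)*stride] = tmp[0];
      dest[(j+1)*stride] = tmp[1];
      dest[(j+2)*stride] = tmp[2];
      dest[(j+3)*stride] = tmp[3];
    }

The real implementation extracts the components from the vectorized rhs
with BIT_FIELD_REFs and steps a pointer IV by the stride between the
scalar stores, as the comment in vectorizable_store below spells out.
(A hypothetical reduction of the still-unhandled constant-step shape
from PR65962 follows at the end of this mail.)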
Ok.

Thanks,
Richard.

> Ciao,
> Michael.
>
> 	* tree-vectorizer.h (struct _stmt_vec_info): Rename stride_load_p
> 	to strided_p.
> 	(STMT_VINFO_STRIDE_LOAD_P): Rename to ...
> 	(STMT_VINFO_STRIDED_P): ... this.
> 	* tree-vect-data-refs.c (vect_compute_data_ref_alignment): Adjust.
> 	(vect_verify_datarefs_alignment): Likewise.
> 	(vect_enhance_data_refs_alignment): Likewise.
> 	(vect_analyze_data_ref_access): Likewise.
> 	(vect_analyze_data_refs): Accept strided stores.
> 	* tree-vect-stmts.c (vect_model_store_cost): Count strided stores.
> 	(vect_model_load_cost): Adjust for macro rename.
> 	(vectorizable_mask_load_store): Likewise.
> 	(vectorizable_load): Likewise.
> 	(vectorizable_store): Open code strided stores.
>
> testsuite/
> 	* gcc.dg/vect/vect-strided-store.c: New test.
> 	* gfortran.dg/vect/fast-math-pr37021.f90: Adjust.
> 	* gfortran.dg/vect/fast-math-rnflow-trs2a2.f90: Adjust.
>
> diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
> index 96afc7a..6d8f17e 100644
> --- a/gcc/tree-vect-data-refs.c
> +++ b/gcc/tree-vect-data-refs.c
> @@ -665,7 +665,7 @@ vect_compute_data_ref_alignment (struct data_reference *dr)
>
>    /* Strided loads perform only component accesses, misalignment information
>       is irrelevant for them.  */
> -  if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
> +  if (STMT_VINFO_STRIDED_P (stmt_info))
>      return true;
>
>    misalign = DR_INIT (dr);
> @@ -942,7 +942,7 @@ vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
>
>        /* Strided loads perform only component accesses, alignment is
> 	 irrelevant for them.  */
> -      if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
> +      if (STMT_VINFO_STRIDED_P (stmt_info))
> 	continue;
>
>        supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
> @@ -1409,7 +1409,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
>
>        /* Strided loads perform only component accesses, alignment is
> 	 irrelevant for them.  */
> -      if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
> +      if (STMT_VINFO_STRIDED_P (stmt_info))
> 	continue;
>
>        supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
> @@ -1701,7 +1701,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
>
> 	  /* Strided loads perform only component accesses, alignment is
> 	     irrelevant for them.  */
> -	  if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
> +	  if (STMT_VINFO_STRIDED_P (stmt_info))
> 	    continue;
>
> 	  save_misalignment = DR_MISALIGNMENT (dr);
> @@ -1821,7 +1821,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
>
> 	  /* Strided loads perform only component accesses, alignment is
> 	     irrelevant for them.  */
> -	  if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
> +	  if (STMT_VINFO_STRIDED_P (stmt_info))
> 	    continue;
>
> 	  supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
> @@ -2368,7 +2368,7 @@ vect_analyze_data_ref_access (struct data_reference *dr)
>
>    /* Assume this is a DR handled by non-constant strided load case.  */
>    if (TREE_CODE (step) != INTEGER_CST)
> -    return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
> +    return STMT_VINFO_STRIDED_P (stmt_info);
>
>    /* Not consecutive access - check if it's a part of interleaving group.  */
>    return vect_analyze_group_access (dr);
> @@ -3764,8 +3764,7 @@ again:
>        else if (loop_vinfo
> 	       && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
> 	{
> -	  if (nested_in_vect_loop_p (loop, stmt)
> -	      || !DR_IS_READ (dr))
> +	  if (nested_in_vect_loop_p (loop, stmt))
> 	    {
> 	      if (dump_enabled_p ())
> 		{
> @@ -3777,7 +3776,7 @@ again:
> 		}
> 	      return false;
> 	    }
> -	  STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
> +	  STMT_VINFO_STRIDED_P (stmt_info) = true;
> 	}
>      }
>
> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> index 2ce6d4d..d268eb0 100644
> --- a/gcc/tree-vect-stmts.c
> +++ b/gcc/tree-vect-stmts.c
> @@ -1014,7 +1014,19 @@ vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
>      }
>
>    /* Costs of the stores.  */
> -  vect_get_store_cost (first_dr, ncopies, &inside_cost, body_cost_vec);
> +  if (STMT_VINFO_STRIDED_P (stmt_info))
> +    {
> +      /* N scalar stores plus extracting the elements.  */
> +      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
> +      inside_cost += record_stmt_cost (body_cost_vec,
> +				       ncopies * TYPE_VECTOR_SUBPARTS (vectype),
> +				       scalar_store, stmt_info, 0, vect_body);
> +      inside_cost += record_stmt_cost (body_cost_vec,
> +				       ncopies * TYPE_VECTOR_SUBPARTS (vectype),
> +				       vec_to_scalar, stmt_info, 0, vect_body);
> +    }
> +  else
> +    vect_get_store_cost (first_dr, ncopies, &inside_cost, body_cost_vec);
>
>    if (dump_enabled_p ())
>      dump_printf_loc (MSG_NOTE, vect_location,
> @@ -1127,7 +1139,7 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies,
>      }
>
>    /* The loads themselves.  */
> -  if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
> +  if (STMT_VINFO_STRIDED_P (stmt_info))
>      {
>        /* N scalar loads plus gathering them into a vector.  */
>        tree vectype = STMT_VINFO_VECTYPE (stmt_info);
> @@ -1820,7 +1832,7 @@ vectorizable_mask_load_store (gimple stmt, gimple_stmt_iterator *gsi,
>    if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
>      return false;
>
> -  if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
> +  if (STMT_VINFO_STRIDED_P (stmt_info))
>      return false;
>
>    if (STMT_VINFO_GATHER_P (stmt_info))
> @@ -5013,7 +5025,7 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
>    tree dataref_ptr = NULL_TREE;
>    tree dataref_offset = NULL_TREE;
>    gimple ptr_incr = NULL;
> -  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
> +  unsigned int nunits = TYPE_VECTOR_SUBPARTS (vectype);
>    int ncopies;
>    int j;
>    gimple next_stmt, first_stmt = NULL;
> @@ -5100,38 +5112,42 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
>    if (!STMT_VINFO_DATA_REF (stmt_info))
>      return false;
>
> -  negative =
> -    tree_int_cst_compare (loop && nested_in_vect_loop_p (loop, stmt)
> -			  ? STMT_VINFO_DR_STEP (stmt_info) : DR_STEP (dr),
> -			  size_zero_node) < 0;
> -  if (negative && ncopies > 1)
> -    {
> -      if (dump_enabled_p ())
> -	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -			 "multiple types with negative step.\n");
> -      return false;
> -    }
> -
> -  if (negative)
> +  if (STMT_VINFO_STRIDED_P (stmt_info))
> +    ;
> +  else
>      {
> -      gcc_assert (!grouped_store);
> -      alignment_support_scheme = vect_supportable_dr_alignment (dr, false);
> -      if (alignment_support_scheme != dr_aligned
> -	  && alignment_support_scheme != dr_unaligned_supported)
> +      negative =
> +	tree_int_cst_compare (loop && nested_in_vect_loop_p (loop, stmt)
> +			      ? STMT_VINFO_DR_STEP (stmt_info) : DR_STEP (dr),
> +			      size_zero_node) < 0;
> +      if (negative && ncopies > 1)
> 	{
> 	  if (dump_enabled_p ())
> 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -			     "negative step but alignment required.\n");
> +			     "multiple types with negative step.\n");
> 	  return false;
> 	}
> -      if (dt != vect_constant_def
> -	  && dt != vect_external_def
> -	  && !perm_mask_for_reverse (vectype))
> +      if (negative)
> 	{
> -	  if (dump_enabled_p ())
> -	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -			     "negative step and reversing not supported.\n");
> -	  return false;
> +	  gcc_assert (!grouped_store);
> +	  alignment_support_scheme = vect_supportable_dr_alignment (dr, false);
> +	  if (alignment_support_scheme != dr_aligned
> +	      && alignment_support_scheme != dr_unaligned_supported)
> +	    {
> +	      if (dump_enabled_p ())
> +		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +				 "negative step but alignment required.\n");
> +	      return false;
> +	    }
> +	  if (dt != vect_constant_def
> +	      && dt != vect_external_def
> +	      && !perm_mask_for_reverse (vectype))
> +	    {
> +	      if (dump_enabled_p ())
> +		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +				 "negative step and reversing not supported.\n");
> +	      return false;
> +	    }
> 	}
>      }
>
> @@ -5230,6 +5246,113 @@ vectorizable_store (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
>      dump_printf_loc (MSG_NOTE, vect_location,
> 		     "transform store. ncopies = %d\n", ncopies);
>
> +  if (STMT_VINFO_STRIDED_P (stmt_info))
> +    {
> +      gimple_stmt_iterator incr_gsi;
> +      bool insert_after;
> +      gimple incr;
> +      tree offvar;
> +      tree ivstep;
> +      tree running_off;
> +      gimple_seq stmts = NULL;
> +      tree stride_base, stride_step, alias_off;
> +      tree vec_oprnd;
> +
> +      gcc_assert (!nested_in_vect_loop_p (loop, stmt));
> +
> +      stride_base
> +	= fold_build_pointer_plus
> +	    (unshare_expr (DR_BASE_ADDRESS (dr)),
> +	     size_binop (PLUS_EXPR,
> +			 convert_to_ptrofftype (unshare_expr (DR_OFFSET (dr))),
> +			 convert_to_ptrofftype (DR_INIT (dr))));
> +      stride_step = fold_convert (sizetype, unshare_expr (DR_STEP (dr)));
> +
> +      /* For a store with loop-invariant (but other than power-of-2)
> +	 stride (i.e. not a grouped access) like so:
> +
> +	   for (i = 0; i < n; i += stride)
> +	     array[i] = ...;
> +
> +	 we generate a new induction variable and new stores from
> +	 the components of the (vectorized) rhs:
> +
> +	   for (j = 0; ; j += VF*stride)
> +	     vectemp = ...;
> +	     tmp1 = vectemp[0];
> +	     array[j] = tmp1;
> +	     tmp2 = vectemp[1];
> +	     array[j + stride] = tmp2;
> +	     ...
> +	 */
> +
> +      ivstep = stride_step;
> +      ivstep = fold_build2 (MULT_EXPR, TREE_TYPE (ivstep), ivstep,
> +			    build_int_cst (TREE_TYPE (ivstep),
> +					   ncopies * nunits));
> +
> +      standard_iv_increment_position (loop, &incr_gsi, &insert_after);
> +
> +      create_iv (stride_base, ivstep, NULL,
> +		 loop, &incr_gsi, insert_after,
> +		 &offvar, NULL);
> +      incr = gsi_stmt (incr_gsi);
> +      set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
> +
> +      stride_step = force_gimple_operand (stride_step, &stmts, true, NULL_TREE);
> +      if (stmts)
> +	gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
> +
> +      prev_stmt_info = NULL;
> +      running_off = offvar;
> +      alias_off = build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0);
> +      for (j = 0; j < ncopies; j++)
> +	{
> +	  /* We've set op and dt above, from gimple_assign_rhs1 (stmt),
> +	     and first_stmt == stmt.  */
> +	  if (j == 0)
> +	    vec_oprnd = vect_get_vec_def_for_operand (op, first_stmt, NULL);
> +	  else
> +	    vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, vec_oprnd);
> +
> +	  for (i = 0; i < nunits; i++)
> +	    {
> +	      tree newref, newoff;
> +	      gimple incr, assign;
> +	      tree size = TYPE_SIZE (elem_type);
> +	      /* Extract the i'th component.  */
> +	      tree pos = fold_build2 (MULT_EXPR, bitsizetype, bitsize_int (i),
> +				      size);
> +	      tree elem = fold_build3 (BIT_FIELD_REF, elem_type, vec_oprnd,
> +				       size, pos);
> +
> +	      elem = force_gimple_operand_gsi (gsi, elem, true,
> +					       NULL_TREE, true,
> +					       GSI_SAME_STMT);
> +
> +	      newref = build2 (MEM_REF, TREE_TYPE (vectype),
> +			       running_off, alias_off);
> +
> +	      /* And store it to *running_off.  */
> +	      assign = gimple_build_assign (newref, elem);
> +	      vect_finish_stmt_generation (stmt, assign, gsi);
> +
> +	      newoff = copy_ssa_name (running_off, NULL);
> +	      incr = gimple_build_assign (newoff, POINTER_PLUS_EXPR,
> +					  running_off, stride_step);
> +	      vect_finish_stmt_generation (stmt, incr, gsi);
> +
> +	      running_off = newoff;
> +	      if (j == 0 && i == 0)
> +		STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = assign;
> +	      else
> +		STMT_VINFO_RELATED_STMT (prev_stmt_info) = assign;
> +	      prev_stmt_info = vinfo_for_stmt (assign);
> +	    }
> +	}
> +      return true;
> +    }
> +
>    dr_chain.create (group_size);
>    oprnds.create (group_size);
>
> @@ -5846,7 +5969,7 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
> 	  return false;
> 	}
>      }
> -  else if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
> +  else if (STMT_VINFO_STRIDED_P (stmt_info))
>      ;
>    else
>      {
> @@ -6079,7 +6202,7 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt,
> 	}
>        return true;
>      }
> -  else if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
> +  else if (STMT_VINFO_STRIDED_P (stmt_info))
>      {
>        gimple_stmt_iterator incr_gsi;
>        bool insert_after;
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 0796cc1..d231626 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -643,7 +643,9 @@ typedef struct _stmt_vec_info {
>
>    /* For loads only, true if this is a gather load.  */
>    bool gather_p;
> -  bool stride_load_p;
> +
> +  /* True if this is an access with loop-invariant stride.  */
> +  bool strided_p;
>
>    /* For both loads and stores.  */
>    bool simd_lane_access_p;
> @@ -661,7 +663,7 @@ typedef struct _stmt_vec_info {
>  #define STMT_VINFO_VECTORIZABLE(S)         (S)->vectorizable
>  #define STMT_VINFO_DATA_REF(S)             (S)->data_ref_info
>  #define STMT_VINFO_GATHER_P(S)             (S)->gather_p
> -#define STMT_VINFO_STRIDE_LOAD_P(S)        (S)->stride_load_p
> +#define STMT_VINFO_STRIDED_P(S)            (S)->strided_p
>  #define STMT_VINFO_SIMD_LANE_ACCESS_P(S)   (S)->simd_lane_access_p
>
>  #define STMT_VINFO_DR_BASE_ADDRESS(S)      (S)->dr_base_address
> diff --git a/gcc/testsuite/gfortran.dg/vect/fast-math-pr37021.f90 b/gcc/testsuite/gfortran.dg/vect/fast-math-pr37021.f90
> index b17ac9c..d5f5d40 100644
> --- a/gcc/testsuite/gfortran.dg/vect/fast-math-pr37021.f90
> +++ b/gcc/testsuite/gfortran.dg/vect/fast-math-pr37021.f90
> @@ -14,5 +14,5 @@ subroutine to_product_of(self,a,b,a1,a2)
>    end do
>  end subroutine
>
> -! { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } }
> +! { dg-final { scan-tree-dump "vectorized 2 loops" "vect" } }
>  ! { dg-final { cleanup-tree-dump "vect" } }
> diff --git a/gcc/testsuite/gfortran.dg/vect/fast-math-rnflow-trs2a2.f90 b/gcc/testsuite/gfortran.dg/vect/fast-math-rnflow-trs2a2.f90
> index 1d13cea..625be83 100644
> --- a/gcc/testsuite/gfortran.dg/vect/fast-math-rnflow-trs2a2.f90
> +++ b/gcc/testsuite/gfortran.dg/vect/fast-math-rnflow-trs2a2.f90
> @@ -29,5 +29,5 @@
>      return
>  end function trs2a2
>
> -! { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } }
> +! { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" } }
>  ! { dg-final { cleanup-tree-dump "vect" } }
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-strided-store.c b/gcc/testsuite/gcc.dg/vect/vect-strided-store.c
> new file mode 100644
> index 0000000..32bcff9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-strided-store.c
> @@ -0,0 +1,30 @@
> +/* { dg-require-effective-target vect_float } */
> +
> +#include <stdarg.h>
> +#include "tree-vect.h"
> +
> +void __attribute__((noinline))
> +sumit (float * __restrict dest,
> +       float * __restrict src, float * __restrict src2,
> +       int stride, int n)
> +{
> +  int i;
> +  for (i = 0; i < n; i++)
> +    dest[i*stride] = src[i] + src2[i];
> +}
> +
> +int main()
> +{
> +  int i;
> +  float src[] = {1, 2, 3, 4, 5, 6, 7, 8};
> +  float dest[8];
> +  check_vect ();
> +  sumit (dest, src, src, 1, 8);
> +  for (i = 0; i < 8; i++)
> +    if (2*src[i] != dest[i])
> +      abort ();
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
> +/* { dg-final { cleanup-tree-dump "vect" } } */
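
For the record, the constant-step shape that PR65962 is about, which
this patch deliberately leaves alone, would look something like the
following hypothetical reduction (not necessarily the PR's actual
testcase): the constant step makes the analysis try a grouped access,
and grouped stores with a gap are rejected -- but only after
vect_analyze_group_access has already changed some vect info, hence the
rollback deferred to a follow-up.

  /* Hypothetical constant-stride variant; still not vectorized after
     this patch because the constant step 2 is treated as a grouped
     store with a gap.  */
  void
  sumit_const_stride (float * __restrict dest,
                      float * __restrict src, float * __restrict src2,
                      int n)
  {
    int i;
    for (i = 0; i < n; i++)
      dest[i*2] = src[i] + src2[i];
  }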