On Tue, Apr 10, 2012 at 3:46 PM, Michael Matz <m...@suse.de> wrote: > Hi, > > and this implements generally strided loads where the stride is a > loop-invariant (constant or ssa-name). We only do so if the load can't be > handled by interleaving groups. The implementation is fairly straight > forward: > > for (i = 0; i < n; i += stride) > ... = array[i]; > > is transformed into: > > for (j = 0; ; j += VF*stride) > tmp1 = array[j]; > tmp2 = array[j + stride]; > ... > vectemp = {tmp1, tmp2, ...} > > (of course variously adjusted for component number) > > The nice thing is that by such separate loads we don't need to care for > alignment (if the old access was aligned, so will be the new as the access > size doesn't change). > > This is one more step in vectorizing the same loops as ICC. polyhedron > has such a case in rnflow, where it helps quite much: > > Without patch: > > Benchmark Compile Executable Ave Run Number Estim > Name (secs) (bytes) (secs) Repeats Err % > --------- ------- ---------- ------- ------- ------ > ac 3.71 3960337 10.06 2 0.0368 > aermod 75.30 5594185 19.30 5 0.7102 > air 10.30 4832222 6.59 2 0.0653 > capacita 7.23 4185227 57.25 2 0.0968 > channel 2.05 4658532 2.70 5 4.4701 > doduc 14.55 4237850 23.31 2 0.0768 > fatigue 6.64 4127554 7.48 5 0.3063 > gas_dyn 13.25 4113557 2.99 5 4.5063 > induct 7.23 4338356 8.85 5 0.9927 > linpk 1.24 3927350 10.37 5 5.1174 > mdbx 5.51 4053841 12.95 5 0.0956 > nf 10.69 4062276 11.44 5 0.2727 > protein 33.04 5011924 39.53 5 0.2328 > rnflow 25.35 4238651 31.78 2 0.0673 > test_fpu 12.24 4184939 9.00 5 0.1157 > tfft 1.15 3976710 3.95 3 0.0736 > > Geometric Mean Execution Time = 11.21 seconds > > With patch: > > Benchmark Compile Executable Ave Run Number Estim > Name (secs) (bytes) (secs) Repeats Err % > --------- ------- ---------- ------- ------- ------ > ac 3.91 3960337 10.34 5 0.3661 > aermod 76.44 5598281 19.03 5 1.3394 > air 11.52 4832222 6.75 5 0.9347 > capacita 7.78 4189323 56.33 2 0.0976 > channel 2.14 
4658532 2.59 5 0.6640 > doduc 14.61 4237850 23.41 5 0.2974 > fatigue 6.62 4127554 7.44 5 0.1082 > gas_dyn 13.14 4113557 2.82 5 0.5253 > induct 7.26 4338356 8.49 2 0.0082 > linpk 1.23 3927350 9.78 2 0.0705 > mdbx 5.47 4053841 12.90 2 0.0601 > nf 10.67 4062276 11.33 2 0.0004 > protein 32.81 5011924 39.48 2 0.0893 > rnflow 26.57 4246843 26.70 5 0.0915 > test_fpu 13.29 4193131 8.82 5 0.2136 > tfft 1.14 3976710 3.95 5 0.1753 > > Geometric Mean Execution Time = 10.95 seconds > > I.e. for rnflow from 31.78 to 26.70 seconds, nearly 20% better. > > Regstrapped together with [1/2] on x86_64-linux, all langs+Ada, no > regressions. Okay for trunk?
Ok with ... > > Ciao, > Michael. > > PR tree-optimization/18437 > > * tree-vectorizer.h (_stmt_vec_info.stride_load_p): New member. > (STMT_VINFO_STRIDE_LOAD_P): New accessor. > (vect_check_strided_load): Declare. > * tree-vect-data-refs.c (vect_check_strided_load): New function. > (vect_analyze_data_refs): Use it to accept strided loads. > * tree-vect-stmts.c (vectorizable_load): Ditto and handle them. > > testsuite/ > * gfortran.dg/vect/rnflow-trs2a2.f90: New test. > > Index: tree-vectorizer.h > =================================================================== > --- tree-vectorizer.h.orig 2012-04-10 13:19:18.000000000 +0200 > +++ tree-vectorizer.h 2012-04-10 13:21:19.000000000 +0200 > @@ -545,6 +545,7 @@ typedef struct _stmt_vec_info { > > /* For loads only, true if this is a gather load. */ > bool gather_p; > + bool stride_load_p; > } *stmt_vec_info; > > /* Access Functions. */ > @@ -559,6 +560,7 @@ typedef struct _stmt_vec_info { > #define STMT_VINFO_VECTORIZABLE(S) (S)->vectorizable > #define STMT_VINFO_DATA_REF(S) (S)->data_ref_info > #define STMT_VINFO_GATHER_P(S) (S)->gather_p > +#define STMT_VINFO_STRIDE_LOAD_P(S) (S)->stride_load_p > > #define STMT_VINFO_DR_BASE_ADDRESS(S) (S)->dr_base_address > #define STMT_VINFO_DR_INIT(S) (S)->dr_init > @@ -875,6 +877,7 @@ extern bool vect_analyze_data_ref_access > extern bool vect_prune_runtime_alias_test_list (loop_vec_info); > extern tree vect_check_gather (gimple, loop_vec_info, tree *, tree *, > int *); > +extern bool vect_check_strided_load (gimple, loop_vec_info, tree *, tree *); > extern bool vect_analyze_data_refs (loop_vec_info, bb_vec_info, int *); > extern tree vect_create_data_ref_ptr (gimple, tree, struct loop *, tree, > tree *, gimple_stmt_iterator *, > Index: tree-vect-data-refs.c > =================================================================== > --- tree-vect-data-refs.c.orig 2012-04-10 13:18:35.000000000 +0200 > +++ tree-vect-data-refs.c 2012-04-10 13:21:19.000000000 +0200 > @@ -2690,6 +2690,53 
@@ vect_check_gather (gimple stmt, loop_vec > return decl; > } > > +/* Check whether a non-affine load in STMT (being in the loop referred to > + in LOOP_VINFO) is suitable for handling as strided load. That is the case > + if its address is a simple induction variable. If so return the base > + of that induction variable in *BASEP and the (loop-invariant) step > + in *STEPP, both only when that pointer is non-zero. > + > + This handles ARRAY_REFs (with variant index) and MEM_REFs (with variant > + base pointer) only. */ > + > +bool > +vect_check_strided_load (gimple stmt, loop_vec_info loop_vinfo, tree *basep, > + tree *stepp) > +{ > + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); > + stmt_vec_info stmt_info = vinfo_for_stmt (stmt); > + struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); > + tree base, off; > + affine_iv iv; > + > + base = DR_REF (dr); > + > + if (TREE_CODE (base) == ARRAY_REF) > + { > + off = TREE_OPERAND (base, 1); > + base = TREE_OPERAND (base, 0); > + } > + else if (TREE_CODE (base) == MEM_REF) > + { > + off = TREE_OPERAND (base, 0); > + base = TREE_OPERAND (base, 1); > + } > + else > + return false; > + > + if (TREE_CODE (off) != SSA_NAME) > + return false; > + > + if (!expr_invariant_in_loop_p (loop, base) > + || !simple_iv (loop, loop_containing_stmt (stmt), off, &iv, true)) > + return false; > + > + if (basep) > + *basep = iv.base; > + if (stepp) > + *stepp = iv.step; > + return true; > +} > > /* Function vect_analyze_data_refs. 
> > @@ -3090,16 +3137,21 @@ vect_analyze_data_refs (loop_vec_info lo > VEC (ddr_p, heap) *ddrs = LOOP_VINFO_DDRS (loop_vinfo); > struct data_dependence_relation *ddr, *newddr; > bool bad = false; > + bool strided_load = false; > tree off; > VEC (loop_p, heap) *nest = LOOP_VINFO_LOOP_NEST (loop_vinfo); > > - if (!vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL) > - || get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE) > + strided_load = vect_check_strided_load (stmt, loop_vinfo, NULL, > NULL); > + gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, > NULL); > + if (gather > + && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE) > + gather = false; > + if (!gather && !strided_load) > { > if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) > { > fprintf (vect_dump, > - "not vectorized: not suitable for gather "); > + "not vectorized: not suitable for gather/strided > load "); > print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); > } > return false; > @@ -3152,13 +3204,16 @@ vect_analyze_data_refs (loop_vec_info lo > { > fprintf (vect_dump, > "not vectorized: data dependence conflict" > - " prevents gather"); > + " prevents gather/strided load"); > print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); > } > return false; > } > > - STMT_VINFO_GATHER_P (stmt_info) = true; > + if (gather) > + STMT_VINFO_GATHER_P (stmt_info) = true; > + else if (strided_load) > + STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true; > } > } > > Index: tree-vect-stmts.c > =================================================================== > --- tree-vect-stmts.c.orig 2012-04-10 13:18:35.000000000 +0200 > +++ tree-vect-stmts.c 2012-04-10 13:21:19.000000000 +0200 > @@ -4224,6 +4224,7 @@ vectorizable_load (gimple stmt, gimple_s > tree aggr_type; > tree gather_base = NULL_TREE, gather_off = NULL_TREE; > tree gather_off_vectype = NULL_TREE, gather_decl = NULL_TREE; > + tree stride_base, stride_step; > int gather_scale = 1; > enum vect_def_type gather_dt = 
vect_unknown_def_type; > > @@ -4357,6 +4358,10 @@ vectorizable_load (gimple stmt, gimple_s > return false; > } > } > + else if (STMT_VINFO_STRIDE_LOAD_P (stmt_info)) > + { > + vect_check_strided_load (stmt, loop_vinfo, &stride_base, &stride_step); > + } > > if (!vec_stmt) /* transformation not required. */ > { > @@ -4520,6 +4525,104 @@ vectorizable_load (gimple stmt, gimple_s > STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt; > else > STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; > + prev_stmt_info = vinfo_for_stmt (new_stmt); > + } > + return true; > + } > + else if (STMT_VINFO_STRIDE_LOAD_P (stmt_info)) > + { > + gimple_stmt_iterator incr_gsi; > + bool insert_after; > + gimple incr; > + tree offvar; > + tree ref = DR_REF (dr); > + tree ivstep; > + tree running_off; > + VEC(constructor_elt, gc) *v = NULL; > + gimple_seq stmts = NULL; > + > + gcc_assert (stride_base && stride_step); > + > + /* For a load with loop-invariant (but other than power-of-2) > + stride (i.e. not a grouped access) like so: > + > + for (i = 0; i < n; i += stride) > + ... = array[i]; > + > + we generate a new induction variable and new accesses to > + form a new vector (or vectors, depending on ncopies): > + > + for (j = 0; ; j += VF*stride) > + tmp1 = array[j]; > + tmp2 = array[j + stride]; > + ... > + vectemp = {tmp1, tmp2, ...} > + */ > + > + ivstep = stride_step; > + ivstep = fold_build2 (MULT_EXPR, TREE_TYPE (ivstep), ivstep, > + build_int_cst_type (TREE_TYPE (ivstep), vf)); Use build_int_cst. 
> + > + standard_iv_increment_position (loop, &incr_gsi, &insert_after); > + > + create_iv (stride_base, ivstep, NULL, > + loop, &incr_gsi, insert_after, > + &offvar, NULL); > + incr = gsi_stmt (incr_gsi); > + set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL)); > + > + stride_step = force_gimple_operand (stride_step, &stmts, true, > NULL_TREE); > + if (stmts) > + gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts); > + > + prev_stmt_info = NULL; > + running_off = offvar; > + for (j = 0; j < ncopies; j++) > + { > + tree vec_inv; > + > + v = VEC_alloc (constructor_elt, gc, nunits); > + for (i = 0; i < nunits; i++) > + { > + tree newref, newoff; > + gimple incr; > + if (TREE_CODE (ref) == ARRAY_REF) > + newref = build4 (ARRAY_REF, TREE_TYPE (ref), > + unshare_expr (TREE_OPERAND (ref, 0)), > + running_off, > + NULL_TREE, NULL_TREE); > + else > + newref = build2 (MEM_REF, TREE_TYPE (ref), > + running_off, > + TREE_OPERAND (ref, 1)); > + > + newref = force_gimple_operand_gsi (gsi, newref, true, > + NULL_TREE, true, > + GSI_SAME_STMT); > + CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, newref); > + newoff = SSA_NAME_VAR (running_off); > + if (POINTER_TYPE_P (TREE_TYPE (newoff))) > + incr = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, > newoff, > + running_off, > stride_step); > + else > + incr = gimple_build_assign_with_ops (PLUS_EXPR, newoff, > + running_off, > stride_step); > + newoff = make_ssa_name (newoff, incr); > + gimple_assign_set_lhs (incr, newoff); > + vect_finish_stmt_generation (stmt, incr, gsi); > + > + running_off = newoff; > + } > + > + vec_inv = build_constructor (vectype, v); > + new_temp = vect_init_vector (stmt, vec_inv, vectype, gsi); > + new_stmt = SSA_NAME_DEF_STMT (new_temp); > + mark_symbols_for_renaming (new_stmt); This should not be necessary - in fact please manually set the proper VUSE here. Thanks, Richard. 
> + > + if (j == 0) > + STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt; > + else > + STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; > prev_stmt_info = vinfo_for_stmt (new_stmt); > } > return true; > Index: testsuite/gfortran.dg/vect/rnflow-trs2a2.f90 > =================================================================== > --- testsuite/gfortran.dg/vect/rnflow-trs2a2.f90 (revision 0) > +++ testsuite/gfortran.dg/vect/rnflow-trs2a2.f90 (revision 0) > @@ -0,0 +1,33 @@ > +! { dg-do compile } > +! { dg-require-effective-target vect_double } > + > + function trs2a2 (j, k, u, d, m) > +! matrice de transition intermediaire, partant de k sans descendre > +! sous j. R = IjU(I-Ik)DIj, avec Ii = deltajj, j >= i. > +! alternative: trs2a2 = 0 > +! trs2a2 (j:k-1, j:k-1) = matmul (utrsft (j:k-1,j:k-1), > +! dtrsft (j:k-1,j:k-1)) > +! > + real, dimension (1:m,1:m) :: trs2a2 ! resultat > + real, dimension (1:m,1:m) :: u, d ! matrices utrsft, dtrsft > + integer, intent (in) :: j, k, m ! niveaux vallee pic > +! > +!##### following line replaced by Prentice to make less system dependent > +! real (kind = kind (1.0d0)) :: dtmp > + real (kind = selected_real_kind (10,50)) :: dtmp > +! > + trs2a2 = 0.0 > + do iclw1 = j, k - 1 > + do iclw2 = j, k - 1 > + dtmp = 0.0d0 > + do iclww = j, k - 1 > + dtmp = dtmp + u (iclw1, iclww) * d (iclww, iclw2) > + enddo > + trs2a2 (iclw1, iclw2) = dtmp > + enddo > + enddo > + return > + end function trs2a2 > + > +! { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } > +! { dg-final { cleanup-tree-dump "vect" } }