On Tue, Apr 10, 2012 at 3:46 PM, Michael Matz <[email protected]> wrote:
> Hi,
>
> and this implements general strided loads, where the stride is
> loop-invariant (a constant or an SSA name). We only do so if the load can't
> be handled by interleaving groups. The implementation is fairly
> straightforward:
>
> for (i = 0; i < n; i += stride)
> ... = array[i];
>
> is transformed into:
>
> for (j = 0; ; j += VF*stride)
> tmp1 = array[j];
> tmp2 = array[j + stride];
> ...
> vectemp = {tmp1, tmp2, ...}
>
> (of course variously adjusted for component number)
>
> The nice thing is that with such separate loads we don't need to care about
> alignment (if the old access was aligned, so is the new one, as the access
> size doesn't change).
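>
> For concreteness, with VF == 4 and a float array this would look roughly
> like the following (just a sketch, not the generated gimple; v4sf stands
> for the 4-element float vector type):
>
>   for (j = 0; j < n; j += 4 * stride)
>     {
>       float tmp0 = array[j];
>       float tmp1 = array[j + stride];
>       float tmp2 = array[j + 2 * stride];
>       float tmp3 = array[j + 3 * stride];
>       vectemp = (v4sf) { tmp0, tmp1, tmp2, tmp3 };
>       ...
>     }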
>
> This is one more step towards vectorizing the same loops as ICC. Polyhedron
> has such a case in rnflow, where it helps quite a lot:
>
> Without patch:
>
> Benchmark Compile Executable Ave Run Number Estim
> Name (secs) (bytes) (secs) Repeats Err %
> --------- ------- ---------- ------- ------- ------
> ac 3.71 3960337 10.06 2 0.0368
> aermod 75.30 5594185 19.30 5 0.7102
> air 10.30 4832222 6.59 2 0.0653
> capacita 7.23 4185227 57.25 2 0.0968
> channel 2.05 4658532 2.70 5 4.4701
> doduc 14.55 4237850 23.31 2 0.0768
> fatigue 6.64 4127554 7.48 5 0.3063
> gas_dyn 13.25 4113557 2.99 5 4.5063
> induct 7.23 4338356 8.85 5 0.9927
> linpk 1.24 3927350 10.37 5 5.1174
> mdbx 5.51 4053841 12.95 5 0.0956
> nf 10.69 4062276 11.44 5 0.2727
> protein 33.04 5011924 39.53 5 0.2328
> rnflow 25.35 4238651 31.78 2 0.0673
> test_fpu 12.24 4184939 9.00 5 0.1157
> tfft 1.15 3976710 3.95 3 0.0736
>
> Geometric Mean Execution Time = 11.21 seconds
>
> With patch:
>
> Benchmark Compile Executable Ave Run Number Estim
> Name (secs) (bytes) (secs) Repeats Err %
> --------- ------- ---------- ------- ------- ------
> ac 3.91 3960337 10.34 5 0.3661
> aermod 76.44 5598281 19.03 5 1.3394
> air 11.52 4832222 6.75 5 0.9347
> capacita 7.78 4189323 56.33 2 0.0976
> channel 2.14 4658532 2.59 5 0.6640
> doduc 14.61 4237850 23.41 5 0.2974
> fatigue 6.62 4127554 7.44 5 0.1082
> gas_dyn 13.14 4113557 2.82 5 0.5253
> induct 7.26 4338356 8.49 2 0.0082
> linpk 1.23 3927350 9.78 2 0.0705
> mdbx 5.47 4053841 12.90 2 0.0601
> nf 10.67 4062276 11.33 2 0.0004
> protein 32.81 5011924 39.48 2 0.0893
> rnflow 26.57 4246843 26.70 5 0.0915
> test_fpu 13.29 4193131 8.82 5 0.2136
> tfft 1.14 3976710 3.95 5 0.1753
>
> Geometric Mean Execution Time = 10.95 seconds
>
> I.e. rnflow goes from 31.78 to 26.70 seconds, nearly 20% better.
>
> Regstrapped together with [1/2] on x86_64-linux, all langs+Ada, no
> regressions. Okay for trunk?
Ok with ...
>
> Ciao,
> Michael.
>
> PR tree-optimization/18437
>
> * tree-vectorizer.h (_stmt_vec_info.stride_load_p): New member.
> (STMT_VINFO_STRIDE_LOAD_P): New accessor.
> (vect_check_strided_load): Declare.
> * tree-vect-data-refs.c (vect_check_strided_load): New function.
> (vect_analyze_data_refs): Use it to accept strided loads.
> * tree-vect-stmts.c (vectorizable_load): Ditto and handle them.
>
> testsuite/
> * gfortran.dg/vect/rnflow-trs2a2.f90: New test.
>
> Index: tree-vectorizer.h
> ===================================================================
> --- tree-vectorizer.h.orig 2012-04-10 13:19:18.000000000 +0200
> +++ tree-vectorizer.h 2012-04-10 13:21:19.000000000 +0200
> @@ -545,6 +545,7 @@ typedef struct _stmt_vec_info {
>
> /* For loads only, true if this is a gather load. */
> bool gather_p;
> + bool stride_load_p;
> } *stmt_vec_info;
>
> /* Access Functions. */
> @@ -559,6 +560,7 @@ typedef struct _stmt_vec_info {
> #define STMT_VINFO_VECTORIZABLE(S) (S)->vectorizable
> #define STMT_VINFO_DATA_REF(S) (S)->data_ref_info
> #define STMT_VINFO_GATHER_P(S) (S)->gather_p
> +#define STMT_VINFO_STRIDE_LOAD_P(S) (S)->stride_load_p
>
> #define STMT_VINFO_DR_BASE_ADDRESS(S) (S)->dr_base_address
> #define STMT_VINFO_DR_INIT(S) (S)->dr_init
> @@ -875,6 +877,7 @@ extern bool vect_analyze_data_ref_access
> extern bool vect_prune_runtime_alias_test_list (loop_vec_info);
> extern tree vect_check_gather (gimple, loop_vec_info, tree *, tree *,
> int *);
> +extern bool vect_check_strided_load (gimple, loop_vec_info, tree *, tree *);
> extern bool vect_analyze_data_refs (loop_vec_info, bb_vec_info, int *);
> extern tree vect_create_data_ref_ptr (gimple, tree, struct loop *, tree,
> tree *, gimple_stmt_iterator *,
> Index: tree-vect-data-refs.c
> ===================================================================
> --- tree-vect-data-refs.c.orig 2012-04-10 13:18:35.000000000 +0200
> +++ tree-vect-data-refs.c 2012-04-10 13:21:19.000000000 +0200
> @@ -2690,6 +2690,53 @@ vect_check_gather (gimple stmt, loop_vec
> return decl;
> }
>
> +/* Check whether a non-affine load in STMT (being in the loop referred to
> + in LOOP_VINFO) is suitable for handling as a strided load. That is the
> + case if its address is a simple induction variable. If so return the base
> + of that induction variable in *BASEP and the (loop-invariant) step
> + in *STEPP, each only if the corresponding pointer is non-NULL.
> +
> + This handles ARRAY_REFs (with variant index) and MEM_REFs (with variant
> + base pointer) only. */
> +
> +bool
> +vect_check_strided_load (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
> + tree *stepp)
> +{
> + struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
> + stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
> + struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
> + tree base, off;
> + affine_iv iv;
> +
> + base = DR_REF (dr);
> +
> + if (TREE_CODE (base) == ARRAY_REF)
> + {
> + off = TREE_OPERAND (base, 1);
> + base = TREE_OPERAND (base, 0);
> + }
> + else if (TREE_CODE (base) == MEM_REF)
> + {
> + off = TREE_OPERAND (base, 0);
> + base = TREE_OPERAND (base, 1);
> + }
> + else
> + return false;
> +
> + if (TREE_CODE (off) != SSA_NAME)
> + return false;
> +
> + if (!expr_invariant_in_loop_p (loop, base)
> + || !simple_iv (loop, loop_containing_stmt (stmt), off, &iv, true))
> + return false;
> +
> + if (basep)
> + *basep = iv.base;
> + if (stepp)
> + *stepp = iv.step;
> + return true;
> +}
>
> /* Function vect_analyze_data_refs.
>
> @@ -3090,16 +3137,21 @@ vect_analyze_data_refs (loop_vec_info lo
> VEC (ddr_p, heap) *ddrs = LOOP_VINFO_DDRS (loop_vinfo);
> struct data_dependence_relation *ddr, *newddr;
> bool bad = false;
> + bool strided_load = false;
> tree off;
> VEC (loop_p, heap) *nest = LOOP_VINFO_LOOP_NEST (loop_vinfo);
>
> - if (!vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL)
> - || get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
> + strided_load = vect_check_strided_load (stmt, loop_vinfo, NULL, NULL);
> + gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
> + if (gather
> + && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
> + gather = false;
> + if (!gather && !strided_load)
> {
> if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
> {
> fprintf (vect_dump,
> - "not vectorized: not suitable for gather ");
> + "not vectorized: not suitable for gather/strided
> load ");
> print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
> }
> return false;
> @@ -3152,13 +3204,16 @@ vect_analyze_data_refs (loop_vec_info lo
> {
> fprintf (vect_dump,
> "not vectorized: data dependence conflict"
> - " prevents gather");
> + " prevents gather/strided load");
> print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
> }
> return false;
> }
>
> - STMT_VINFO_GATHER_P (stmt_info) = true;
> + if (gather)
> + STMT_VINFO_GATHER_P (stmt_info) = true;
> + else if (strided_load)
> + STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
> }
> }
>
> Index: tree-vect-stmts.c
> ===================================================================
> --- tree-vect-stmts.c.orig 2012-04-10 13:18:35.000000000 +0200
> +++ tree-vect-stmts.c 2012-04-10 13:21:19.000000000 +0200
> @@ -4224,6 +4224,7 @@ vectorizable_load (gimple stmt, gimple_s
> tree aggr_type;
> tree gather_base = NULL_TREE, gather_off = NULL_TREE;
> tree gather_off_vectype = NULL_TREE, gather_decl = NULL_TREE;
> + tree stride_base, stride_step;
> int gather_scale = 1;
> enum vect_def_type gather_dt = vect_unknown_def_type;
>
> @@ -4357,6 +4358,10 @@ vectorizable_load (gimple stmt, gimple_s
> return false;
> }
> }
> + else if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
> + {
> + vect_check_strided_load (stmt, loop_vinfo, &stride_base, &stride_step);
> + }
>
> if (!vec_stmt) /* transformation not required. */
> {
> @@ -4520,6 +4525,104 @@ vectorizable_load (gimple stmt, gimple_s
> STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
> else
> STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
> + prev_stmt_info = vinfo_for_stmt (new_stmt);
> + }
> + return true;
> + }
> + else if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
> + {
> + gimple_stmt_iterator incr_gsi;
> + bool insert_after;
> + gimple incr;
> + tree offvar;
> + tree ref = DR_REF (dr);
> + tree ivstep;
> + tree running_off;
> + VEC(constructor_elt, gc) *v = NULL;
> + gimple_seq stmts = NULL;
> +
> + gcc_assert (stride_base && stride_step);
> +
> + /* For a load with loop-invariant (but other than power-of-2)
> + stride (i.e. not a grouped access) like so:
> +
> + for (i = 0; i < n; i += stride)
> + ... = array[i];
> +
> + we generate a new induction variable and new accesses to
> + form a new vector (or vectors, depending on ncopies):
> +
> + for (j = 0; ; j += VF*stride)
> + tmp1 = array[j];
> + tmp2 = array[j + stride];
> + ...
> + vectemp = {tmp1, tmp2, ...}
> + */
> +
> + ivstep = stride_step;
> + ivstep = fold_build2 (MULT_EXPR, TREE_TYPE (ivstep), ivstep,
> + build_int_cst_type (TREE_TYPE (ivstep), vf));
Use build_int_cst.
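I.e. something like the following (just a sketch; vf is a plain int here,
so it fits a HOST_WIDE_INT):

  ivstep = fold_build2 (MULT_EXPR, TREE_TYPE (ivstep), ivstep,
                        build_int_cst (TREE_TYPE (ivstep), vf));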
> +
> + standard_iv_increment_position (loop, &incr_gsi, &insert_after);
> +
> + create_iv (stride_base, ivstep, NULL,
> + loop, &incr_gsi, insert_after,
> + &offvar, NULL);
> + incr = gsi_stmt (incr_gsi);
> + set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
> +
> + stride_step = force_gimple_operand (stride_step, &stmts, true, NULL_TREE);
> + if (stmts)
> + gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
> +
> + prev_stmt_info = NULL;
> + running_off = offvar;
> + for (j = 0; j < ncopies; j++)
> + {
> + tree vec_inv;
> +
> + v = VEC_alloc (constructor_elt, gc, nunits);
> + for (i = 0; i < nunits; i++)
> + {
> + tree newref, newoff;
> + gimple incr;
> + if (TREE_CODE (ref) == ARRAY_REF)
> + newref = build4 (ARRAY_REF, TREE_TYPE (ref),
> + unshare_expr (TREE_OPERAND (ref, 0)),
> + running_off,
> + NULL_TREE, NULL_TREE);
> + else
> + newref = build2 (MEM_REF, TREE_TYPE (ref),
> + running_off,
> + TREE_OPERAND (ref, 1));
> +
> + newref = force_gimple_operand_gsi (gsi, newref, true,
> + NULL_TREE, true,
> + GSI_SAME_STMT);
> + CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, newref);
> + newoff = SSA_NAME_VAR (running_off);
> + if (POINTER_TYPE_P (TREE_TYPE (newoff)))
> + incr = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, newoff,
> + running_off, stride_step);
> + else
> + incr = gimple_build_assign_with_ops (PLUS_EXPR, newoff,
> + running_off, stride_step);
> + newoff = make_ssa_name (newoff, incr);
> + gimple_assign_set_lhs (incr, newoff);
> + vect_finish_stmt_generation (stmt, incr, gsi);
> +
> + running_off = newoff;
> + }
> +
> + vec_inv = build_constructor (vectype, v);
> + new_temp = vect_init_vector (stmt, vec_inv, vectype, gsi);
> + new_stmt = SSA_NAME_DEF_STMT (new_temp);
> + mark_symbols_for_renaming (new_stmt);
This should not be necessary - in fact please manually set the proper
VUSE here.
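Something along the lines of (untested sketch, with load_stmt standing for
the scalar loads you emit)

  gimple_set_vuse (load_stmt, gimple_vuse (stmt));

or similar.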
Thanks,
Richard.
> +
> + if (j == 0)
> + STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
> + else
> + STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
> prev_stmt_info = vinfo_for_stmt (new_stmt);
> }
> return true;
> Index: testsuite/gfortran.dg/vect/rnflow-trs2a2.f90
> ===================================================================
> --- testsuite/gfortran.dg/vect/rnflow-trs2a2.f90 (revision 0)
> +++ testsuite/gfortran.dg/vect/rnflow-trs2a2.f90 (revision 0)
> @@ -0,0 +1,33 @@
> +! { dg-do compile }
> +! { dg-require-effective-target vect_double }
> +
> + function trs2a2 (j, k, u, d, m)
> +! intermediate transition matrix, starting from k without going
> +! below j. R = IjU(I-Ik)DIj, with Ii = deltajj, j >= i.
> +! alternative: trs2a2 = 0
> +! trs2a2 (j:k-1, j:k-1) = matmul (utrsft (j:k-1,j:k-1),
> +! dtrsft (j:k-1,j:k-1))
> +!
> + real, dimension (1:m,1:m) :: trs2a2 ! result
> + real, dimension (1:m,1:m) :: u, d ! utrsft, dtrsft matrices
> + integer, intent (in) :: j, k, m ! valley/peak levels
> +!
> +!##### following line replaced by Prentice to make less system dependent
> +! real (kind = kind (1.0d0)) :: dtmp
> + real (kind = selected_real_kind (10,50)) :: dtmp
> +!
> + trs2a2 = 0.0
> + do iclw1 = j, k - 1
> + do iclw2 = j, k - 1
> + dtmp = 0.0d0
> + do iclww = j, k - 1
> + dtmp = dtmp + u (iclw1, iclww) * d (iclww, iclw2)
> + enddo
> + trs2a2 (iclw1, iclw2) = dtmp
> + enddo
> + enddo
> + return
> + end function trs2a2
> +
> +! { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } }
> +! { dg-final { cleanup-tree-dump "vect" } }