On Tue, Apr 10, 2012 at 3:46 PM, Michael Matz <m...@suse.de> wrote:
> Hi,
>
> and this implements general strided loads where the stride is
> loop-invariant (a constant or an SSA name).  We only do so if the load
> can't be handled by interleaving groups.  The implementation is fairly
> straightforward:
>
>         for (i = 0; i < n; i += stride)
>           ... = array[i];
>
> is transformed into:
>
>         for (j = 0; ; j += VF*stride)
>           tmp1 = array[j];
>           tmp2 = array[j + stride];
>           ...
>           vectemp = {tmp1, tmp2, ...}
>
> (adjusted, of course, for the number of vector components)
>
> The nice thing is that with such separate scalar loads we don't need to
> care about alignment (if the old access was aligned, so is the new one,
> since the access size doesn't change).
>
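For illustration, a minimal C source with this shape might look as
follows (names made up; a declared array keeps the access an ARRAY_REF):

      float a[4096];

      float
      sum_strided (int n, int stride)
      {
        float s = 0.f;
        /* stride is loop-invariant but unknown at compile time, so the
           loads cannot form an interleaving group.  */
        for (int i = 0; i < n; i += stride)
          s += a[i];
        return s;
      }
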
> This is one more step towards vectorizing the same loops as ICC.
> Polyhedron has such a case in rnflow, where it helps quite a lot:
>
> Without patch:
>
>   Benchmark   Compile  Executable   Ave Run  Number   Estim
>        Name    (secs)     (bytes)    (secs) Repeats   Err %
>   ---------   -------  ----------   ------- -------  ------
>          ac      3.71     3960337     10.06       2  0.0368
>      aermod     75.30     5594185     19.30       5  0.7102
>         air     10.30     4832222      6.59       2  0.0653
>    capacita      7.23     4185227     57.25       2  0.0968
>     channel      2.05     4658532      2.70       5  4.4701
>       doduc     14.55     4237850     23.31       2  0.0768
>     fatigue      6.64     4127554      7.48       5  0.3063
>     gas_dyn     13.25     4113557      2.99       5  4.5063
>      induct      7.23     4338356      8.85       5  0.9927
>       linpk      1.24     3927350     10.37       5  5.1174
>        mdbx      5.51     4053841     12.95       5  0.0956
>          nf     10.69     4062276     11.44       5  0.2727
>     protein     33.04     5011924     39.53       5  0.2328
>      rnflow     25.35     4238651     31.78       2  0.0673
>    test_fpu     12.24     4184939      9.00       5  0.1157
>        tfft      1.15     3976710      3.95       3  0.0736
>
> Geometric Mean Execution Time =      11.21 seconds
>
> With patch:
>
>   Benchmark   Compile  Executable   Ave Run  Number   Estim
>        Name    (secs)     (bytes)    (secs) Repeats   Err %
>   ---------   -------  ----------   ------- -------  ------
>          ac      3.91     3960337     10.34       5  0.3661
>      aermod     76.44     5598281     19.03       5  1.3394
>         air     11.52     4832222      6.75       5  0.9347
>    capacita      7.78     4189323     56.33       2  0.0976
>     channel      2.14     4658532      2.59       5  0.6640
>       doduc     14.61     4237850     23.41       5  0.2974
>     fatigue      6.62     4127554      7.44       5  0.1082
>     gas_dyn     13.14     4113557      2.82       5  0.5253
>      induct      7.26     4338356      8.49       2  0.0082
>       linpk      1.23     3927350      9.78       2  0.0705
>        mdbx      5.47     4053841     12.90       2  0.0601
>          nf     10.67     4062276     11.33       2  0.0004
>     protein     32.81     5011924     39.48       2  0.0893
>      rnflow     26.57     4246843     26.70       5  0.0915
>    test_fpu     13.29     4193131      8.82       5  0.2136
>        tfft      1.14     3976710      3.95       5  0.1753
>
> Geometric Mean Execution Time =      10.95 seconds
>
> I.e., rnflow improves from 31.78 to 26.70 seconds, nearly 20% faster.
>
> Regstrapped together with [1/2] on x86_64-linux, all langs+Ada, no
> regressions.  Okay for trunk?

Ok with ...

>
> Ciao,
> Michael.
>
>        PR tree-optimization/18437
>
>        * tree-vectorizer.h (_stmt_vec_info.stride_load_p): New member.
>        (STMT_VINFO_STRIDE_LOAD_P): New accessor.
>        (vect_check_strided_load): Declare.
>        * tree-vect-data-refs.c (vect_check_strided_load): New function.
>        (vect_analyze_data_refs): Use it to accept strided loads.
>        * tree-vect-stmts.c (vectorizable_load): Ditto and handle them.
>
> testsuite/
>        * gfortran.dg/vect/rnflow-trs2a2.f90: New test.
>
> Index: tree-vectorizer.h
> ===================================================================
> --- tree-vectorizer.h.orig      2012-04-10 13:19:18.000000000 +0200
> +++ tree-vectorizer.h   2012-04-10 13:21:19.000000000 +0200
> @@ -545,6 +545,7 @@ typedef struct _stmt_vec_info {
>
>   /* For loads only, true if this is a gather load.  */
>   bool gather_p;
> +  bool stride_load_p;
>  } *stmt_vec_info;
>
>  /* Access Functions.  */
> @@ -559,6 +560,7 @@ typedef struct _stmt_vec_info {
>  #define STMT_VINFO_VECTORIZABLE(S)         (S)->vectorizable
>  #define STMT_VINFO_DATA_REF(S)             (S)->data_ref_info
>  #define STMT_VINFO_GATHER_P(S)            (S)->gather_p
> +#define STMT_VINFO_STRIDE_LOAD_P(S)       (S)->stride_load_p
>
>  #define STMT_VINFO_DR_BASE_ADDRESS(S)      (S)->dr_base_address
>  #define STMT_VINFO_DR_INIT(S)              (S)->dr_init
> @@ -875,6 +877,7 @@ extern bool vect_analyze_data_ref_access
>  extern bool vect_prune_runtime_alias_test_list (loop_vec_info);
>  extern tree vect_check_gather (gimple, loop_vec_info, tree *, tree *,
>                               int *);
> +extern bool vect_check_strided_load (gimple, loop_vec_info, tree *, tree *);
>  extern bool vect_analyze_data_refs (loop_vec_info, bb_vec_info, int *);
>  extern tree vect_create_data_ref_ptr (gimple, tree, struct loop *, tree,
>                                      tree *, gimple_stmt_iterator *,
> Index: tree-vect-data-refs.c
> ===================================================================
> --- tree-vect-data-refs.c.orig  2012-04-10 13:18:35.000000000 +0200
> +++ tree-vect-data-refs.c       2012-04-10 13:21:19.000000000 +0200
> @@ -2690,6 +2690,53 @@ vect_check_gather (gimple stmt, loop_vec
>   return decl;
>  }
>
> +/* Check whether a non-affine load in STMT (being in the loop referred to
> +   in LOOP_VINFO) is suitable for handling as a strided load.  That is the
> +   case if its address is a simple induction variable.  If so return the
> +   base of that induction variable in *BASEP and the (loop-invariant) step
> +   in *STEPP, each only if the corresponding pointer is non-NULL.
> +
> +   This handles ARRAY_REFs (with variant index) and MEM_REFs (with variant
> +   base pointer) only.  */
> +
> +bool
> +vect_check_strided_load (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
> +                        tree *stepp)
> +{
> +  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
> +  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
> +  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
> +  tree base, off;
> +  affine_iv iv;
> +
> +  base = DR_REF (dr);
> +
> +  if (TREE_CODE (base) == ARRAY_REF)
> +    {
> +      off = TREE_OPERAND (base, 1);
> +      base = TREE_OPERAND (base, 0);
> +    }
> +  else if (TREE_CODE (base) == MEM_REF)
> +    {
> +      off = TREE_OPERAND (base, 0);
> +      base = TREE_OPERAND (base, 1);
> +    }
> +  else
> +    return false;
> +
> +  if (TREE_CODE (off) != SSA_NAME)
> +    return false;
> +
> +  if (!expr_invariant_in_loop_p (loop, base)
> +      || !simple_iv (loop, loop_containing_stmt (stmt), off, &iv, true))
> +    return false;
> +
> +  if (basep)
> +    *basep = iv.base;
> +  if (stepp)
> +    *stepp = iv.step;
> +  return true;
> +}
>
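For illustration, the MEM_REF form the new function's comment mentions
could come from a pointer walk like the one below (hypothetical names).
Here the pointer itself is the induction variable, so off is operand 0
of the MEM_REF and the invariance check applies to its constant offset
operand:

      float
      sum_ptr_strided (const float *p, int n, long stride)
      {
        float s = 0.f;
        /* p advances by a loop-invariant stride on each iteration.  */
        for (int i = 0; i < n; i++, p += stride)
          s += *p;
        return s;
      }
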
>  /* Function vect_analyze_data_refs.
>
> @@ -3090,16 +3137,21 @@ vect_analyze_data_refs (loop_vec_info lo
>          VEC (ddr_p, heap) *ddrs = LOOP_VINFO_DDRS (loop_vinfo);
>          struct data_dependence_relation *ddr, *newddr;
>          bool bad = false;
> +         bool strided_load = false;
>          tree off;
>          VEC (loop_p, heap) *nest = LOOP_VINFO_LOOP_NEST (loop_vinfo);
>
> -         if (!vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL)
> -             || get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
> +         strided_load = vect_check_strided_load (stmt, loop_vinfo, NULL, NULL);
> +         gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
> +         if (gather
> +             && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
> +           gather = false;
> +         if (!gather && !strided_load)
>            {
>              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
>                {
>                  fprintf (vect_dump,
> -                          "not vectorized: not suitable for gather ");
> +                          "not vectorized: not suitable for gather/strided load ");
>                  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
>                }
>              return false;
> @@ -3152,13 +3204,16 @@ vect_analyze_data_refs (loop_vec_info lo
>                {
>                  fprintf (vect_dump,
>                           "not vectorized: data dependence conflict"
> -                          " prevents gather");
> +                          " prevents gather/strided load");
>                  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
>                }
>              return false;
>            }
>
> -         STMT_VINFO_GATHER_P (stmt_info) = true;
> +         if (gather)
> +           STMT_VINFO_GATHER_P (stmt_info) = true;
> +         else if (strided_load)
> +           STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
>        }
>     }
>
> Index: tree-vect-stmts.c
> ===================================================================
> --- tree-vect-stmts.c.orig      2012-04-10 13:18:35.000000000 +0200
> +++ tree-vect-stmts.c   2012-04-10 13:21:19.000000000 +0200
> @@ -4224,6 +4224,7 @@ vectorizable_load (gimple stmt, gimple_s
>   tree aggr_type;
>   tree gather_base = NULL_TREE, gather_off = NULL_TREE;
>   tree gather_off_vectype = NULL_TREE, gather_decl = NULL_TREE;
> +  tree stride_base = NULL_TREE, stride_step = NULL_TREE;
>   int gather_scale = 1;
>   enum vect_def_type gather_dt = vect_unknown_def_type;
>
> @@ -4357,6 +4358,10 @@ vectorizable_load (gimple stmt, gimple_s
>          return false;
>        }
>     }
> +  else if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
> +    {
> +      vect_check_strided_load (stmt, loop_vinfo, &stride_base, &stride_step);
> +    }
>
>   if (!vec_stmt) /* transformation not required.  */
>     {
> @@ -4520,6 +4525,104 @@ vectorizable_load (gimple stmt, gimple_s
>            STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
>          else
>            STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
> +         prev_stmt_info = vinfo_for_stmt (new_stmt);
> +       }
> +      return true;
> +    }
> +  else if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
> +    {
> +      gimple_stmt_iterator incr_gsi;
> +      bool insert_after;
> +      gimple incr;
> +      tree offvar;
> +      tree ref = DR_REF (dr);
> +      tree ivstep;
> +      tree running_off;
> +      VEC(constructor_elt, gc) *v = NULL;
> +      gimple_seq stmts = NULL;
> +
> +      gcc_assert (stride_base && stride_step);
> +
> +      /* For a load with loop-invariant (but other than power-of-2)
> +         stride (i.e. not a grouped access) like so:
> +
> +          for (i = 0; i < n; i += stride)
> +            ... = array[i];
> +
> +        we generate a new induction variable and new accesses to
> +        form a new vector (or vectors, depending on ncopies):
> +
> +          for (j = 0; ; j += VF*stride)
> +            tmp1 = array[j];
> +            tmp2 = array[j + stride];
> +            ...
> +            vectemp = {tmp1, tmp2, ...}
> +         */
> +
> +      ivstep = stride_step;
> +      ivstep = fold_build2 (MULT_EXPR, TREE_TYPE (ivstep), ivstep,
> +                           build_int_cst_type (TREE_TYPE (ivstep), vf));

Use build_int_cst.
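
Presumably that is just

      ivstep = fold_build2 (MULT_EXPR, TREE_TYPE (ivstep), ivstep,
                            build_int_cst (TREE_TYPE (ivstep), vf));

since the constant is created in the step's own type here,
build_int_cst_type's fit-to-type handling isn't needed.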

> +
> +      standard_iv_increment_position (loop, &incr_gsi, &insert_after);
> +
> +      create_iv (stride_base, ivstep, NULL,
> +                loop, &incr_gsi, insert_after,
> +                &offvar, NULL);
> +      incr = gsi_stmt (incr_gsi);
> +      set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
> +
> +      stride_step = force_gimple_operand (stride_step, &stmts, true, NULL_TREE);
> +      if (stmts)
> +       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
> +
> +      prev_stmt_info = NULL;
> +      running_off = offvar;
> +      for (j = 0; j < ncopies; j++)
> +       {
> +         tree vec_inv;
> +
> +         v = VEC_alloc (constructor_elt, gc, nunits);
> +         for (i = 0; i < nunits; i++)
> +           {
> +             tree newref, newoff;
> +             gimple incr;
> +             if (TREE_CODE (ref) == ARRAY_REF)
> +               newref = build4 (ARRAY_REF, TREE_TYPE (ref),
> +                                unshare_expr (TREE_OPERAND (ref, 0)),
> +                                running_off,
> +                                NULL_TREE, NULL_TREE);
> +             else
> +               newref = build2 (MEM_REF, TREE_TYPE (ref),
> +                                running_off,
> +                                TREE_OPERAND (ref, 1));
> +
> +             newref = force_gimple_operand_gsi (gsi, newref, true,
> +                                                NULL_TREE, true,
> +                                                GSI_SAME_STMT);
> +             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, newref);
> +             newoff = SSA_NAME_VAR (running_off);
> +             if (POINTER_TYPE_P (TREE_TYPE (newoff)))
> +               incr = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, newoff,
> +                                                    running_off, stride_step);
> +             else
> +               incr = gimple_build_assign_with_ops (PLUS_EXPR, newoff,
> +                                                    running_off, stride_step);
> +             newoff = make_ssa_name (newoff, incr);
> +             gimple_assign_set_lhs (incr, newoff);
> +             vect_finish_stmt_generation (stmt, incr, gsi);
> +
> +             running_off = newoff;
> +           }
> +
> +         vec_inv = build_constructor (vectype, v);
> +         new_temp = vect_init_vector (stmt, vec_inv, vectype, gsi);
> +         new_stmt = SSA_NAME_DEF_STMT (new_temp);
> +         mark_symbols_for_renaming (new_stmt);

This should not be necessary - in fact please manually set the proper
VUSE here.

Thanks,
Richard.
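
(A sketch of what setting the VUSE manually might look like, assuming it
is done for each scalar load emitted above; load_stmt is hypothetical,
standing for the defining statement of newref:

      /* Reuse the memory state of the original scalar load.  */
      gimple load_stmt = SSA_NAME_DEF_STMT (newref);
      gimple_set_vuse (load_stmt, gimple_vuse (stmt));

This is an illustration, not the committed fix.)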

> +
> +         if (j == 0)
> +           STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
> +         else
> +           STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
>          prev_stmt_info = vinfo_for_stmt (new_stmt);
>        }
>       return true;
> Index: testsuite/gfortran.dg/vect/rnflow-trs2a2.f90
> ===================================================================
> --- testsuite/gfortran.dg/vect/rnflow-trs2a2.f90        (revision 0)
> +++ testsuite/gfortran.dg/vect/rnflow-trs2a2.f90        (revision 0)
> @@ -0,0 +1,33 @@
> +! { dg-do compile }
> +! { dg-require-effective-target vect_double }
> +
> +      function trs2a2 (j, k, u, d, m)
> +!      intermediate transition matrix, starting from k without going
> +!      below j.  R = IjU(I-Ik)DIj, with Ii = deltajj, j >= i.
> +!      alternative: trs2a2 = 0
> +!                   trs2a2 (j:k-1, j:k-1) = matmul (utrsft (j:k-1,j:k-1),
> +!                                                   dtrsft (j:k-1,j:k-1))
> +!
> +      real, dimension (1:m,1:m) :: trs2a2  ! result
> +      real, dimension (1:m,1:m) :: u, d    ! matrices utrsft, dtrsft
> +      integer, intent (in)      :: j, k, m ! valley/peak levels
> +!
> +!##### following line replaced by Prentice to make less system dependent
> +!      real (kind = kind (1.0d0)) :: dtmp
> +      real (kind = selected_real_kind (10,50)) :: dtmp
> +!
> +      trs2a2 = 0.0
> +      do iclw1 = j, k - 1
> +         do iclw2 = j, k - 1
> +            dtmp = 0.0d0
> +            do iclww = j, k - 1
> +               dtmp = dtmp + u (iclw1, iclww) * d (iclww, iclw2)
> +            enddo
> +            trs2a2 (iclw1, iclw2) = dtmp
> +         enddo
> +      enddo
> +      return
> +      end function trs2a2
> +
> +! { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect"  } }
> +! { dg-final { cleanup-tree-dump "vect" } }
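
The strided access in this kernel is u (iclw1, iclww): Fortran arrays
are column-major, so stepping iclww in the innermost loop walks u with
stride m, which is loop-invariant but only known at run time.  A rough
C analogue (hypothetical names, 0-based indexing):

      double
      dot_strided (const float *u, const float *d, int m,
                   int j, int k, int iclw1, int iclw2)
      {
        double dtmp = 0.0;
        for (int iclww = j; iclww < k; iclww++)
          /* u is accessed with stride m, d contiguously.  */
          dtmp += (double) u[iclw1 + iclww * m]
                  * (double) d[iclww + iclw2 * m];
        return dtmp;
      }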
