This re-does the fix for PR53185 in a less intrusive way, allowing both strided load vectorization and peeling for alignment. Testing on my local machine shows that this fix results in a 3% speedup of rnflow.
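For reference, the affected pattern is a load whose stride is loop-invariant but not a compile-time constant (so not a grouped access), as in the inner function of the new testcase below:

    /* Successive iterations read d[0], d[c], d[2*c], ... with runtime
       stride c, so no plain vector load is possible.  With this patch
       such a loop can also be peeled so the store to f is aligned.  */
    void fn1 (int *__restrict f, int *__restrict d, unsigned short a, int c)
    {
      unsigned short e;
      for (e = 0; e < a; ++e)
        f[e] = d[e * c];
    }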
Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.

Richard.

2013-02-05  Richard Biener  <rguent...@suse.de>

	PR tree-optimization/53342
	PR tree-optimization/53185
	* tree-vectorizer.h (vect_check_strided_load): Remove.
	* tree-vect-data-refs.c (vect_enhance_data_refs_alignment): Do not
	disallow peeling for vectorized strided loads.
	(vect_check_strided_load): Make static and simplify.
	(vect_analyze_data_refs): Adjust.
	* tree-vect-stmts.c (vectorizable_load): Handle peeled loops
	correctly when vectorizing strided loads.

	* gcc.dg/vect/pr53185-2.c: New testcase.

Index: gcc/tree-vectorizer.h
===================================================================
*** gcc/tree-vectorizer.h	(revision 195751)
--- gcc/tree-vectorizer.h	(working copy)
*************** extern bool vect_analyze_data_ref_access
*** 923,929 ****
  extern bool vect_prune_runtime_alias_test_list (loop_vec_info);
  extern tree vect_check_gather (gimple, loop_vec_info, tree *, tree *,
  			       int *);
- extern bool vect_check_strided_load (gimple, loop_vec_info, tree *, tree *);
  extern bool vect_analyze_data_refs (loop_vec_info, bb_vec_info, int *);
  extern tree vect_create_data_ref_ptr (gimple, tree, struct loop *, tree,
  				      tree *, gimple_stmt_iterator *,
--- 923,928 ----
Index: gcc/tree-vect-data-refs.c
===================================================================
*** gcc/tree-vect-data-refs.c	(revision 195751)
--- gcc/tree-vect-data-refs.c	(working copy)
*************** vect_enhance_data_refs_alignment (loop_v
*** 1615,1632 ****
  	      && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
  	    continue;
  
- 	  /* FORNOW: Any strided load prevents peeling.  The induction
- 	     variable analysis will fail when the prologue loop is generated,
- 	     and so we can't generate the new base for the pointer.  */
- 	  if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
- 	    {
- 	      if (dump_enabled_p ())
- 		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- 				 "strided load prevents peeling");
- 	      do_peeling = false;
- 	      break;
- 	    }
- 
  	  /* For invariant accesses there is nothing to enhance.  */
  	  if (integer_zerop (DR_STEP (dr)))
  	    continue;
--- 1615,1620 ----
*************** vect_check_gather (gimple stmt, loop_vec
*** 2890,2898 ****
     This handles ARRAY_REFs (with variant index) and MEM_REFs
     (with variant base pointer) only.  */
  
! bool
! vect_check_strided_load (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
! 			 tree *stepp)
  {
    struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
--- 2878,2885 ----
     This handles ARRAY_REFs (with variant index) and MEM_REFs
     (with variant base pointer) only.  */
  
! static bool
! vect_check_strided_load (gimple stmt, loop_vec_info loop_vinfo)
  {
    struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
*************** vect_check_strided_load (gimple stmt, lo
*** 2925,2934 ****
        || !simple_iv (loop, loop_containing_stmt (stmt), off, &iv, true))
      return false;
  
-   if (basep)
-     *basep = iv.base;
-   if (stepp)
-     *stepp = iv.step;
    return true;
  }
  
--- 2912,2917 ----
*************** vect_analyze_data_refs (loop_vec_info lo
*** 3473,3480 ****
  	{
  	  bool strided_load = false;
  	  if (!nested_in_vect_loop_p (loop, stmt))
! 	    strided_load
! 	      = vect_check_strided_load (stmt, loop_vinfo, NULL, NULL);
  	  if (!strided_load)
  	    {
  	      if (dump_enabled_p ())
--- 3456,3462 ----
  	{
  	  bool strided_load = false;
  	  if (!nested_in_vect_loop_p (loop, stmt))
! 	    strided_load = vect_check_strided_load (stmt, loop_vinfo);
  	  if (!strided_load)
  	    {
  	      if (dump_enabled_p ())
Index: gcc/tree-vect-stmts.c
===================================================================
*** gcc/tree-vect-stmts.c	(revision 195751)
--- gcc/tree-vect-stmts.c	(working copy)
*************** vectorizable_load (gimple stmt, gimple_s
*** 4353,4359 ****
    tree aggr_type;
    tree gather_base = NULL_TREE, gather_off = NULL_TREE;
    tree gather_off_vectype = NULL_TREE, gather_decl = NULL_TREE;
-   tree stride_base, stride_step;
    int gather_scale = 1;
    enum vect_def_type gather_dt = vect_unknown_def_type;
  
--- 4353,4358 ----
*************** vectorizable_load (gimple stmt, gimple_s
*** 4462,4472 ****
  	}
      }
    else if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
!     {
!       if (!vect_check_strided_load (stmt, loop_vinfo,
! 				    &stride_base, &stride_step))
! 	return false;
!     }
    else
      {
        negative = tree_int_cst_compare (nested_in_vect_loop
--- 4461,4467 ----
  	}
      }
    else if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
!     ;
    else
      {
        negative = tree_int_cst_compare (nested_in_vect_loop
*************** vectorizable_load (gimple stmt, gimple_s
*** 4674,4686 ****
        bool insert_after;
        gimple incr;
        tree offvar;
-       tree ref = DR_REF (dr);
        tree ivstep;
        tree running_off;
        vec<constructor_elt, va_gc> *v = NULL;
        gimple_seq stmts = NULL;
  
!       gcc_assert (stride_base && stride_step);
  
        /* For a load with loop-invariant (but other than power-of-2)
           stride (i.e. not a grouped access) like so:
--- 4669,4689 ----
        bool insert_after;
        gimple incr;
        tree offvar;
        tree ivstep;
        tree running_off;
        vec<constructor_elt, va_gc> *v = NULL;
        gimple_seq stmts = NULL;
+       tree stride_base, stride_step, alias_off;
+ 
+       gcc_assert (!nested_in_vect_loop);
  
!       stride_base
! 	= fold_build_pointer_plus
! 	    (unshare_expr (DR_BASE_ADDRESS (dr)),
! 	     size_binop (PLUS_EXPR,
! 			 convert_to_ptrofftype (unshare_expr (DR_OFFSET (dr))),
! 			 convert_to_ptrofftype (DR_INIT (dr))));
!       stride_step = fold_convert (sizetype, unshare_expr (DR_STEP (dr)));
  
        /* For a load with loop-invariant (but other than power-of-2)
           stride (i.e. not a grouped access) like so:
*************** vectorizable_load (gimple stmt, gimple_s
*** 4716,4721 ****
--- 4719,4725 ----
  
        prev_stmt_info = NULL;
        running_off = offvar;
+       alias_off = build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0);
        for (j = 0; j < ncopies; j++)
  	{
  	  tree vec_inv;
*************** vectorizable_load (gimple stmt, gimple_s
*** 4725,4757 ****
  	    {
  	      tree newref, newoff;
  	      gimple incr;
! 	      if (TREE_CODE (ref) == ARRAY_REF)
! 		{
! 		  newref = build4 (ARRAY_REF, TREE_TYPE (ref),
! 				   unshare_expr (TREE_OPERAND (ref, 0)),
! 				   running_off,
! 				   NULL_TREE, NULL_TREE);
! 		  if (!useless_type_conversion_p (TREE_TYPE (vectype),
! 						  TREE_TYPE (newref)))
! 		    newref = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vectype),
! 				     newref);
! 		}
! 	      else
! 		newref = build2 (MEM_REF, TREE_TYPE (vectype),
! 				 running_off,
! 				 TREE_OPERAND (ref, 1));
  
  	      newref = force_gimple_operand_gsi (gsi, newref, true,
  						 NULL_TREE, true,
  						 GSI_SAME_STMT);
  	      CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, newref);
  	      newoff = copy_ssa_name (running_off, NULL);
! 	      if (POINTER_TYPE_P (TREE_TYPE (newoff)))
! 		incr = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, newoff,
! 						     running_off, stride_step);
! 	      else
! 		incr = gimple_build_assign_with_ops (PLUS_EXPR, newoff,
! 						     running_off, stride_step);
  	      vect_finish_stmt_generation (stmt, incr, gsi);
  	      running_off = newoff;
--- 4729,4744 ----
  	    {
  	      tree newref, newoff;
  	      gimple incr;
! 	      newref = build2 (MEM_REF, TREE_TYPE (vectype),
! 			       running_off, alias_off);
  
  	      newref = force_gimple_operand_gsi (gsi, newref, true,
  						 NULL_TREE, true,
  						 GSI_SAME_STMT);
  	      CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, newref);
  	      newoff = copy_ssa_name (running_off, NULL);
! 	      incr = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, newoff,
! 						   running_off, stride_step);
  	      vect_finish_stmt_generation (stmt, incr, gsi);
  	      running_off = newoff;
Index: gcc/testsuite/gcc.dg/vect/pr53185-2.c
===================================================================
*** gcc/testsuite/gcc.dg/vect/pr53185-2.c	(revision 0)
--- gcc/testsuite/gcc.dg/vect/pr53185-2.c	(working copy)
***************
*** 0 ****
--- 1,27 ----
+ void __attribute__((noinline,noclone))
+ fn1 (int * __restrict f, int * __restrict d, unsigned short a, int c)
+ {
+   unsigned short e;
+   for (e = 0; e < a; ++e)
+     f[e] = d[e * c];
+ }
+ 
+ extern void abort (void);
+ 
+ int main ()
+ {
+   int a[32], b[3 * 32];
+   int i, off;
+   for (i = 0; i < 3 * 32; ++i)
+     b[i] = i;
+   for (off = 0; off < 8; ++off)
+     {
+       fn1 (&a[off], &b[off], 32 - off, 3);
+       for (i = 0; i < 32 - off; ++i)
+ 	if (a[off+i] != b[off+i*3])
+ 	  abort ();
+     }
+   return 0;
+ }
+ 
+ /* { dg-final { cleanup-tree-dump "vect" } } */
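P.S. For readers following along: the vectorizable_load hunks above drop the old ARRAY_REF/MEM_REF rewriting in favour of a uniform scheme that recomputes the start address from DR_BASE_ADDRESS/DR_OFFSET/DR_INIT of the data reference, which remains valid after the prologue loop created by peeling (the old simple_iv-derived base did not). Roughly, each vector is then assembled from scalar loads at a running pointer; a C sketch of the emitted shape, assuming four ints per vector, with illustrative names (base, step, out), not the actual GIMPLE:

    #include <string.h>

    static void
    load_one_vector (int out[4], const char *base, long step)
    {
      const char *off = base;   /* running_off starts at stride_base */
      int i;
      for (i = 0; i < 4; i++)
        {
          memcpy (&out[i], off, sizeof (int));  /* scalar MEM_REF load */
          off += step;          /* POINTER_PLUS_EXPR by stride_step */
        }
      /* out[] then feeds the CONSTRUCTOR forming the vector value.  */
    }

Peeling only changes where base points, which is why recomputing it from the data reference is sufficient.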