Elide loops over ncopies.

gcc/ChangeLog:

	* tree-vect-stmts.cc (vectorizable_load): Remove the degenerate
	single-iteration loops over ncopies and simplify the
	vec_num * j + i indexing to plain i.
---
 gcc/tree-vect-stmts.cc | 54 ++++++++++++++++++++++----------------------------
 1 file changed, 20 insertions(+), 34 deletions(-)
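For reference (a note for review, not part of the patch itself): the j loops
touched below already iterate exactly once, so removing them is a pure
simplification.  A minimal sketch of the before/after shape, reusing the
vec_num/vec_masks names from the hunks; use () is a hypothetical stand-in
for the per-vector work:

  /* Before: a degenerate outer loop that runs exactly once and only
     obscures the indexing.  */
  for (j = 0; j < 1; j++)
    for (i = 0; i < vec_num; i++)
      use (vec_masks[vec_num * j + i]);

  /* After: the loop is elided and every vec_num * j + i index collapses
     to plain i.  */
  for (i = 0; i < vec_num; i++)
    use (vec_masks[i]);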
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 2efa000034c..717d4694b88 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -11007,10 +11007,9 @@ vectorizable_load (vec_info *vinfo,
       gcc_assert (!grouped_load && !slp_perm);
 
       unsigned int inside_cost = 0, prologue_cost = 0;
-      for (j = 0; j < 1; j++)
         {
           /* 1. Create the vector or array pointer update chain.  */
-          if (j == 0 && !costing_p)
+          if (!costing_p)
             {
               if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
                 vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info,
@@ -11022,13 +11021,6 @@ vectorizable_load (vec_info *vinfo,
                                               at_loop, offset, &dummy, gsi,
                                               &ptr_incr, false, bump);
             }
-          else if (!costing_p)
-            {
-              gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
-              if (!STMT_VINFO_GATHER_SCATTER_P (stmt_info))
-                dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
-                                               gsi, stmt_info, bump);
-            }
 
           gimple *new_stmt = NULL;
           for (i = 0; i < vec_num; i++)
@@ -11039,12 +11031,11 @@ vectorizable_load (vec_info *vinfo,
               if (!costing_p)
                 {
                   if (mask)
-                    vec_mask = vec_masks[vec_num * j + i];
+                    vec_mask = vec_masks[i];
                   if (loop_masks)
                     final_mask
                       = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
-                                            vec_num, vectype,
-                                            vec_num * j + i);
+                                            vec_num, vectype, i);
                   if (vec_mask)
                     final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
                                                    final_mask, vec_mask, gsi);
@@ -11076,8 +11067,7 @@ vectorizable_load (vec_info *vinfo,
                   if (loop_lens)
                     final_len
                       = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
-                                           vec_num, vectype,
-                                           vec_num * j + i, 1);
+                                           vec_num, vectype, i, 1);
                   else
                     final_len
                       = build_int_cst (sizetype,
@@ -11148,7 +11138,7 @@ vectorizable_load (vec_info *vinfo,
                 {
                   new_stmt = vect_build_one_gather_load_call
                       (vinfo, stmt_info, gsi, &gs_info,
-                       dataref_ptr, vec_offsets[vec_num * j + i],
+                       dataref_ptr, vec_offsets[i],
                        final_mask);
                   data_ref = NULL_TREE;
                 }
@@ -11159,7 +11149,7 @@ vectorizable_load (vec_info *vinfo,
                      data with just the lower lanes filled.  */
                   new_stmt = vect_build_one_gather_load_call
                       (vinfo, stmt_info, gsi, &gs_info,
-                       dataref_ptr, vec_offsets[2 * vec_num * j + 2 * i],
+                       dataref_ptr, vec_offsets[2 * i],
                        final_mask);
                   tree low = make_ssa_name (vectype);
                   gimple_set_lhs (new_stmt, low);
@@ -11204,7 +11194,7 @@ vectorizable_load (vec_info *vinfo,
 
                   new_stmt = vect_build_one_gather_load_call
                       (vinfo, stmt_info, gsi, &gs_info, dataref_ptr,
-                       vec_offsets[2 * vec_num * j + 2 * i + 1],
+                       vec_offsets[2 * i + 1],
                        final_mask);
                   tree high = make_ssa_name (vectype);
                   gimple_set_lhs (new_stmt, high);
@@ -11229,8 +11219,8 @@ vectorizable_load (vec_info *vinfo,
                 {
                   /* We have a offset vector with double the number of
                      lanes.  Select the low/high part accordingly.  */
-                  vec_offset = vec_offsets[(vec_num * j + i) / 2];
-                  if ((vec_num * j + i) & 1)
+                  vec_offset = vec_offsets[i / 2];
+                  if (i & 1)
                     {
                       int count = offset_nunits.to_constant ();
                       vec_perm_builder sel (count, count, 1);
@@ -11290,9 +11280,8 @@ vectorizable_load (vec_info *vinfo,
                      than the data vector for now.  */
                   unsigned HOST_WIDE_INT factor
                     = const_offset_nunits / const_nunits;
-                  vec_offset = vec_offsets[(vec_num * j + i) / factor];
-                  unsigned elt_offset
-                    = ((vec_num * j + i) % factor) * const_nunits;
+                  vec_offset = vec_offsets[i / factor];
+                  unsigned elt_offset = (i % factor) * const_nunits;
                   tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
                   tree scale = size_int (gs_info.scale);
                   align = get_object_alignment (DR_REF (first_dr_info->dr));
@@ -11372,10 +11361,9 @@ vectorizable_load (vec_info *vinfo,
   /* For costing some adjacent vector loads, we'd like to cost with
      the total number of them once instead of cost each one by one.  */
   unsigned int n_adjacent_loads = 0;
-  for (j = 0; j < 1; j++)
     {
       /* 1. Create the vector or array pointer update chain.  */
-      if (j == 0 && !costing_p)
+      if (!costing_p)
         {
           bool simd_lane_access_p
             = STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) != 0;
@@ -11450,11 +11438,10 @@ vectorizable_load (vec_info *vinfo,
           if (!costing_p)
             {
               if (mask)
-                vec_mask = vec_masks[vec_num * j + i];
+                vec_mask = vec_masks[i];
               if (loop_masks)
                 final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
-                                                 vec_num, vectype,
-                                                 vec_num * j + i);
+                                                 vec_num, vectype, i);
               if (vec_mask)
                 final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
                                                final_mask, vec_mask, gsi);
@@ -11504,8 +11491,7 @@ vectorizable_load (vec_info *vinfo,
                     unsigned factor
                       = (new_ovmode == vmode) ? 1 : GET_MODE_UNIT_SIZE (vmode);
                     final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
-                                                   vec_num, vectype,
-                                                   vec_num * j + i, factor);
+                                                   vec_num, vectype, i, factor);
                   }
                 else if (final_mask)
                   {
@@ -11612,12 +11598,12 @@ vectorizable_load (vec_info *vinfo,
                   /* Try to use a single smaller load when we are about
                      to load excess elements compared to the unrolled
                      scalar loop.  */
-                  if (known_gt ((vec_num * j + i + 1) * nunits,
+                  if (known_gt ((i + 1) * nunits,
                                 (group_size * vf - gap)))
                     {
                       poly_uint64 remain = ((group_size * vf - gap)
-                                            - (vec_num * j + i) * nunits);
-                      if (known_ge ((vec_num * j + i + 1) * nunits
+                                            - i * nunits);
+                      if (known_ge ((i + 1) * nunits
                                     - (group_size * vf - gap), nunits))
                         /* DR will be unused.  */
                         ltype = NULL_TREE;
@@ -11920,7 +11906,7 @@ vectorizable_load (vec_info *vinfo,
           if (alignment_support_scheme == dr_explicit_realign_optimized)
             {
               gcc_assert (phi);
-              if (i == vec_num - 1 && j == 0)
+              if (i == vec_num - 1)
                 add_phi_arg (phi, lsq, loop_latch_edge (containing_loop),
                              UNKNOWN_LOCATION);
               msq = lsq;
-- 
2.43.0