Propagate out ncopies == 1.

With SLP, vectorizable_load always operates on a single copy
(ncopies == 1), so substitute the constant for the remaining uses,
remove the now-dead ncopies > 1 paths, and declare ncopies locally
in the two places that still compute a per-group copy count.

	* tree-vect-stmts.cc (vectorizable_load): Step 3.
---
 gcc/tree-vect-stmts.cc | 46 +++++++++++-------------------------------
 1 file changed, 12 insertions(+), 34 deletions(-)
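(Illustrative sketch only, not taken from the sources: with ncopies
known to be 1 for SLP, the per-copy loops in the function degenerate,
e.g.

    /* Before: one iteration per vector copy.  */
    for (j = 0; j < ncopies; j++)
      generate_load_for_copy (j);

    /* After this step: the constant is substituted; a later step can
       delete the now-trivial loop.  */
    for (j = 0; j < 1; j++)
      generate_load_for_copy (j);

where generate_load_for_copy is a hypothetical placeholder for the
per-copy code generation in vectorizable_load.)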
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index eca7e70adf4..2efa000034c 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -9836,7 +9836,6 @@ vectorizable_load (vec_info *vinfo,
   tree dataref_ptr = NULL_TREE;
   tree dataref_offset = NULL_TREE;
   gimple *ptr_incr = NULL;
-  int ncopies;
   int i, j;
   unsigned int group_size;
   poly_uint64 group_gap_adj;
@@ -9938,16 +9937,9 @@ vectorizable_load (vec_info *vinfo,
   else
     vf = 1;
 
-  /* Multiple types in SLP are handled by creating the appropriate number of
-     vectorized stmts for each SLP node.  Hence, NCOPIES is always 1 in
-     case of SLP.  */
-  ncopies = 1;
-
-  gcc_assert (ncopies >= 1);
-
   /* FORNOW.  This restriction should be relaxed.  */
   if (nested_in_vect_loop
-      && (ncopies > 1 || SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) > 1))
+      && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) > 1)
     {
       if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -9955,20 +9947,6 @@ vectorizable_load (vec_info *vinfo,
       return false;
     }
 
-  /* Invalidate assumptions made by dependence analysis when vectorization
-     on the unrolled body effectively re-orders stmts.  */
-  if (ncopies > 1
-      && STMT_VINFO_MIN_NEG_DIST (stmt_info) != 0
-      && maybe_gt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
-		   STMT_VINFO_MIN_NEG_DIST (stmt_info)))
-    {
-      if (dump_enabled_p ())
-	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-			 "cannot perform implicit CSE when unrolling "
-			 "with negative dependence distance\n");
-      return false;
-    }
-
   elem_type = TREE_TYPE (vectype);
   mode = TYPE_MODE (vectype);
 
@@ -10018,7 +9996,7 @@ vectorizable_load (vec_info *vinfo,
   int maskload_elsval = 0;
   bool need_zeroing = false;
   if (!get_load_store_type (vinfo, stmt_info, vectype, slp_node, mask, VLS_LOAD,
-			    ncopies, &memory_access_type, &poffset,
+			    1, &memory_access_type, &poffset,
			    &alignment_support_scheme, &misalignment, &gs_info,
			    &lanes_ifn, &elsvals))
     return false;
@@ -10194,8 +10172,7 @@ vectorizable_load (vec_info *vinfo,
   gcc_assert (memory_access_type == SLP_TREE_MEMORY_ACCESS_TYPE (slp_node));
 
   if (dump_enabled_p () && !costing_p)
-    dump_printf_loc (MSG_NOTE, vect_location,
-		     "transform load. ncopies = %d\n", ncopies);
+    dump_printf_loc (MSG_NOTE, vect_location, "transform load.\n");
 
   /* Transform.  */
@@ -10443,6 +10420,7 @@ vectorizable_load (vec_info *vinfo,
       /* For SLP permutation support we need to load the whole group,
	 not only the number of vector stmts the permutation result
	 fits in.  */
+      int ncopies;
       if (slp_perm)
	{
	  /* We don't yet generate SLP_TREE_LOAD_PERMUTATIONs for
@@ -10869,7 +10847,7 @@ vectorizable_load (vec_info *vinfo,
       /* For costing some adjacent vector loads, we'd like to cost with
	 the total number of them once instead of cost each one by one.  */
       unsigned int n_adjacent_loads = 0;
-      ncopies = slp_node->vec_stmts_size / group_size;
+      int ncopies = slp_node->vec_stmts_size / group_size;
       for (j = 0; j < ncopies; j++)
	{
	  if (costing_p)
@@ -11029,7 +11007,7 @@ vectorizable_load (vec_info *vinfo,
       gcc_assert (!grouped_load && !slp_perm);
 
       unsigned int inside_cost = 0, prologue_cost = 0;
-      for (j = 0; j < ncopies; j++)
+      for (j = 0; j < 1; j++)
	{
	  /* 1. Create the vector or array pointer update chain.  */
	  if (j == 0 && !costing_p)
@@ -11065,7 +11043,7 @@ vectorizable_load (vec_info *vinfo,
	      if (loop_masks)
		final_mask
		  = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
-					vec_num * ncopies, vectype,
+					vec_num, vectype,
					vec_num * j + i);
	      if (vec_mask)
		final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
@@ -11098,7 +11076,7 @@ vectorizable_load (vec_info *vinfo,
	      if (loop_lens)
		final_len
		  = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
-				       vec_num * ncopies, vectype,
+				       vec_num, vectype,
				       vec_num * j + i, 1);
	      else
		final_len
@@ -11394,7 +11372,7 @@ vectorizable_load (vec_info *vinfo,
       /* For costing some adjacent vector loads, we'd like to cost with
	 the total number of them once instead of cost each one by one.  */
       unsigned int n_adjacent_loads = 0;
-      for (j = 0; j < ncopies; j++)
+      for (j = 0; j < 1; j++)
	{
	  /* 1. Create the vector or array pointer update chain.  */
	  if (j == 0 && !costing_p)
@@ -11475,7 +11453,7 @@ vectorizable_load (vec_info *vinfo,
	    vec_mask = vec_masks[vec_num * j + i];
	  if (loop_masks)
	    final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
-					     vec_num * ncopies, vectype,
+					     vec_num, vectype,
					     vec_num * j + i);
	  if (vec_mask)
	    final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
@@ -11526,7 +11504,7 @@ vectorizable_load (vec_info *vinfo,
	      unsigned factor
		= (new_ovmode == vmode) ? 1 : GET_MODE_UNIT_SIZE (vmode);
	      final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
-					     vec_num * ncopies, vectype,
+					     vec_num, vectype,
					     vec_num * j + i, factor);
	    }
	  else if (final_mask)
@@ -11942,7 +11920,7 @@ vectorizable_load (vec_info *vinfo,
	  if (alignment_support_scheme == dr_explicit_realign_optimized)
	    {
	      gcc_assert (phi);
-	      if (i == vec_num - 1 && j == ncopies - 1)
+	      if (i == vec_num - 1 && j == 0)
		add_phi_arg (phi, lsq, loop_latch_edge (containing_loop),
			     UNKNOWN_LOCATION);
	      msq = lsq;
-- 
2.43.0