On Tue, 5 Apr 2022, Richard Sandiford wrote: > check_load_store_for_partial_vectors predates the support for SLP > gathers and so had a hard-coded assumption that gathers/scatters > (and load/stores lanes) would be non-SLP operations. This patch > passes down the slp_node so that the routine can work out how > many vectors are needed in both the SLP and non-SLP cases. > > Tested on aarch64-linux-gnu and x86_64-linux-gnu. OK to install?
OK. Richard. > Richard > > > gcc/ > PR tree-optimization/103761 > * tree-vect-stmts.cc (check_load_store_for_partial_vectors): Replace > the ncopies parameter with an slp_node parameter. Calculate the > number of vectors based on it and vectype. Rename lambda to > group_memory_nvectors. > (vectorizable_store, vectorizable_load): Update calls accordingly. > > gcc/testsuite/ > PR tree-optimization/103761 > * gcc.dg/vect/pr103761.c: New test. > * gcc.target/aarch64/sve/pr103761.c: Likewise. > --- > gcc/testsuite/gcc.dg/vect/pr103761.c | 13 +++++++ > .../gcc.target/aarch64/sve/pr103761.c | 13 +++++++ > gcc/tree-vect-stmts.cc | 37 ++++++++++++------- > 3 files changed, 50 insertions(+), 13 deletions(-) > create mode 100644 gcc/testsuite/gcc.dg/vect/pr103761.c > create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/pr103761.c > > diff --git a/gcc/testsuite/gcc.dg/vect/pr103761.c > b/gcc/testsuite/gcc.dg/vect/pr103761.c > new file mode 100644 > index 00000000000..0982a63eb6a > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/pr103761.c > @@ -0,0 +1,13 @@ > +/* { dg-do compile } */ > + > +void f(long *restrict x, int *restrict y, short *restrict z, int *restrict a) > +{ > + for (int i = 0; i < 100; i += 4) > + { > + x[i] = (long) y[z[i]] + 1; > + x[i + 1] = (long) y[z[i + 1]] + 2; > + x[i + 2] = (long) y[z[i + 2]] + 3; > + x[i + 3] = (long) y[z[i + 3]] + 4; > + a[i] += 1; > + } > +} > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr103761.c > b/gcc/testsuite/gcc.target/aarch64/sve/pr103761.c > new file mode 100644 > index 00000000000..001b4d407ab > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve/pr103761.c > @@ -0,0 +1,13 @@ > +/* { dg-options "-O3" } */ > + > +void f(long *restrict x, int *restrict y, short *restrict z, int *restrict a) > +{ > + for (int i = 0; i < 100; i += 4) > + { > + x[i] = (long) y[z[i]] + 1; > + x[i + 1] = (long) y[z[i + 1]] + 2; > + x[i + 2] = (long) y[z[i + 2]] + 3; > + x[i + 3] = (long) y[z[i + 3]] + 4; > + a[i] += 1; > + } > +} > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc > index f6fc7e1fcdd..c0107c8c489 100644 > --- a/gcc/tree-vect-stmts.cc > +++ b/gcc/tree-vect-stmts.cc > @@ -1690,7 +1690,8 @@ static tree permute_vec_elements (vec_info *, tree, > tree, tree, stmt_vec_info, > as well as whether the target does. > > VLS_TYPE says whether the statement is a load or store and VECTYPE > - is the type of the vector being loaded or stored. MEMORY_ACCESS_TYPE > + is the type of the vector being loaded or stored. SLP_NODE is the SLP > + node that contains the statement, or null if none. MEMORY_ACCESS_TYPE > says how the load or store is going to be implemented and GROUP_SIZE > is the number of load or store statements in the containing group. > If the access is a gather load or scatter store, GS_INFO describes > @@ -1703,11 +1704,11 @@ static tree permute_vec_elements (vec_info *, tree, > tree, tree, stmt_vec_info, > > static void > check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype, > + slp_tree slp_node, > vec_load_store_type vls_type, > int group_size, > vect_memory_access_type > memory_access_type, > - unsigned int ncopies, > gather_scatter_info *gs_info, > tree scalar_mask) > { > @@ -1715,6 +1716,12 @@ check_load_store_for_partial_vectors (loop_vec_info > loop_vinfo, tree vectype, > if (memory_access_type == VMAT_INVARIANT) > return; > > + unsigned int nvectors; > + if (slp_node) > + nvectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); > + else > + nvectors = vect_get_num_copies (loop_vinfo, vectype); > + > vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); > machine_mode vecmode = TYPE_MODE (vectype); > bool is_load = (vls_type == VLS_LOAD); > @@ -1732,7 +1739,8 @@ check_load_store_for_partial_vectors (loop_vec_info > loop_vinfo, tree vectype, > LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; > return; > } > - vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, > scalar_mask); > + vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype, > + scalar_mask); > return; > } > > @@ -1754,7 +1762,8 @@ check_load_store_for_partial_vectors (loop_vec_info > loop_vinfo, tree vectype, > LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false; > return; > } > - vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, > scalar_mask); > + vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype, > + scalar_mask); > return; > } > > @@ -1784,7 +1793,7 @@ check_load_store_for_partial_vectors (loop_vec_info > loop_vinfo, tree vectype, > /* We might load more scalars than we need for permuting SLP loads. > We checked in get_group_load_store_type that the extra elements > don't leak into a new vector. */ > - auto get_valid_nvectors = [] (poly_uint64 size, poly_uint64 nunits) > + auto group_memory_nvectors = [](poly_uint64 size, poly_uint64 nunits) > { > unsigned int nvectors; > if (can_div_away_from_zero_p (size, nunits, &nvectors)) > @@ -1799,7 +1808,7 @@ check_load_store_for_partial_vectors (loop_vec_info > loop_vinfo, tree vectype, > if (targetm.vectorize.get_mask_mode (vecmode).exists (&mask_mode) > && can_vec_mask_load_store_p (vecmode, mask_mode, is_load)) > { > - unsigned int nvectors = get_valid_nvectors (group_size * vf, nunits); > + nvectors = group_memory_nvectors (group_size * vf, nunits); > vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype, > scalar_mask); > using_partial_vectors_p = true; > } > @@ -1807,7 +1816,7 @@ check_load_store_for_partial_vectors (loop_vec_info > loop_vinfo, tree vectype, > machine_mode vmode; > if (get_len_load_store_mode (vecmode, is_load).exists (&vmode)) > { > - unsigned int nvectors = get_valid_nvectors (group_size * vf, nunits); > + nvectors = group_memory_nvectors (group_size * vf, nunits); > vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo); > unsigned factor = (vecmode == vmode) ? 1 : GET_MODE_UNIT_SIZE > (vecmode); > vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, factor); > @@ -7571,9 +7580,10 @@ vectorizable_store (vec_info *vinfo, > > if (loop_vinfo > && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)) > - check_load_store_for_partial_vectors (loop_vinfo, vectype, vls_type, > - group_size, memory_access_type, > - ncopies, &gs_info, mask); > + check_load_store_for_partial_vectors (loop_vinfo, vectype, slp_node, > + vls_type, group_size, > + memory_access_type, &gs_info, > + mask); > > if (slp_node > && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0], > @@ -8921,9 +8931,10 @@ vectorizable_load (vec_info *vinfo, > > if (loop_vinfo > && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)) > - check_load_store_for_partial_vectors (loop_vinfo, vectype, VLS_LOAD, > - group_size, memory_access_type, > - ncopies, &gs_info, mask); > + check_load_store_for_partial_vectors (loop_vinfo, vectype, slp_node, > + VLS_LOAD, group_size, > + memory_access_type, &gs_info, > + mask); > > if (dump_enabled_p () > && memory_access_type != VMAT_ELEMENTWISE > -- Richard Biener <rguent...@suse.de> SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg, Germany; GF: Ivo Totev; HRB 36809 (AG Nuernberg)