Hi!

On 2023-10-19T11:47:14+0000, Richard Biener <rguent...@suse.de> wrote:
> The following implements SLP vectorization support for gathers
> without relying on IFNs being pattern detected (and supported by
> the target). That includes support for emulated gathers but also
> the legacy x86 builtin path.
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu, will push.

For GCN (tested '-march=gfx90a'), I see:

     PASS: gcc.dg/vect/vect-gather-2.c (test for excess errors)
    +FAIL: gcc.dg/vect/vect-gather-2.c scan-tree-dump vect "different gather base"
    +FAIL: gcc.dg/vect/vect-gather-2.c scan-tree-dump vect "different gather scale"
    +PASS: gcc.dg/vect/vect-gather-2.c scan-tree-dump-not vect "Loop contains only SLP stmts"


Grüße
 Thomas


> PR tree-optimization/111131
> * tree-vect-loop.cc (update_epilogue_loop_vinfo): Make
> sure to update all gather/scatter stmt DRs, not only those
> that eventually got VMAT_GATHER_SCATTER set.
> * tree-vect-slp.cc (_slp_oprnd_info::first_gs_info): Add.
> (vect_get_and_check_slp_defs): Handle gathers/scatters,
> adding the offset as SLP operand and comparing base and scale.
> (vect_build_slp_tree_1): Handle gathers.
> (vect_build_slp_tree_2): Likewise.
>
> * gcc.dg/vect/vect-gather-1.c: Now expected to vectorize
> everywhere.
> * gcc.dg/vect/vect-gather-2.c: Expected to not SLP anywhere.
> Massage the scale case to more reliably produce a different
> one. Scan for the specific messages.
> * gcc.dg/vect/vect-gather-3.c: Masked gather is also supported
> for AVX2, but not emulated.
> * gcc.dg/vect/vect-gather-4.c: Expected to not SLP anywhere.
> Massage to more properly ensure this.
> * gcc.dg/vect/tsvc/vect-tsvc-s353.c: Expect to vectorize
> everywhere.
> --- > .../gcc.dg/vect/tsvc/vect-tsvc-s353.c | 2 +- > gcc/testsuite/gcc.dg/vect/vect-gather-1.c | 2 +- > gcc/testsuite/gcc.dg/vect/vect-gather-2.c | 13 ++++-- > gcc/testsuite/gcc.dg/vect/vect-gather-3.c | 2 +- > gcc/testsuite/gcc.dg/vect/vect-gather-4.c | 6 +-- > gcc/tree-vect-loop.cc | 6 ++- > gcc/tree-vect-slp.cc | 45 +++++++++++++++++-- > 7 files changed, 61 insertions(+), 15 deletions(-) > > diff --git a/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-s353.c > b/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-s353.c > index 98ba7522471..2c4fa3f5991 100644 > --- a/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-s353.c > +++ b/gcc/testsuite/gcc.dg/vect/tsvc/vect-tsvc-s353.c > @@ -44,4 +44,4 @@ int main (int argc, char **argv) > return 0; > } > > -/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" { xfail { ! > riscv_v } } } } */ > +/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */ > diff --git a/gcc/testsuite/gcc.dg/vect/vect-gather-1.c > b/gcc/testsuite/gcc.dg/vect/vect-gather-1.c > index e3bbf5c0bf8..5f6640d9ab6 100644 > --- a/gcc/testsuite/gcc.dg/vect/vect-gather-1.c > +++ b/gcc/testsuite/gcc.dg/vect/vect-gather-1.c > @@ -58,4 +58,4 @@ main (void) > return 0; > } > > -/* { dg-final { scan-tree-dump "Loop contains only SLP stmts" vect { target > vect_gather_load_ifn } } } */ > +/* { dg-final { scan-tree-dump "Loop contains only SLP stmts" vect } } */ > diff --git a/gcc/testsuite/gcc.dg/vect/vect-gather-2.c > b/gcc/testsuite/gcc.dg/vect/vect-gather-2.c > index a1f6ba458a9..4c23b808333 100644 > --- a/gcc/testsuite/gcc.dg/vect/vect-gather-2.c > +++ b/gcc/testsuite/gcc.dg/vect/vect-gather-2.c > @@ -8,6 +8,7 @@ f1 (int *restrict y, int *restrict x1, int *restrict x2, > { > for (int i = 0; i < N; ++i) > { > + /* Different base. 
*/ > y[i * 2] = x1[indices[i * 2]] + 1; > y[i * 2 + 1] = x2[indices[i * 2 + 1]] + 2; > } > @@ -18,8 +19,9 @@ f2 (int *restrict y, int *restrict x, int *restrict indices) > { > for (int i = 0; i < N; ++i) > { > - y[i * 2] = x[indices[i * 2]] + 1; > - y[i * 2 + 1] = x[indices[i * 2 + 1] * 2] + 2; > + /* Different scale. */ > + y[i * 2] = *(int *)((char *)x + (__UINTPTR_TYPE__)indices[i * 2] * 4) > + 1; > + y[i * 2 + 1] = *(int *)((char *)x + (__UINTPTR_TYPE__)indices[i * 2 + > 1] * 2) + 2; > } > } > > @@ -28,9 +30,12 @@ f3 (int *restrict y, int *restrict x, int *restrict > indices) > { > for (int i = 0; i < N; ++i) > { > + /* Different type. */ > y[i * 2] = x[indices[i * 2]] + 1; > - y[i * 2 + 1] = x[(unsigned int) indices[i * 2 + 1]] + 2; > + y[i * 2 + 1] = x[((unsigned int *) indices)[i * 2 + 1]] + 2; > } > } > > -/* { dg-final { scan-tree-dump-not "Loop contains only SLP stmts" vect { > target vect_gather_load_ifn } } } */ > +/* { dg-final { scan-tree-dump-not "Loop contains only SLP stmts" vect } } */ > +/* { dg-final { scan-tree-dump "different gather base" vect { target { ! > vect_gather_load_ifn } } } } */ > +/* { dg-final { scan-tree-dump "different gather scale" vect { target { ! 
> vect_gather_load_ifn } } } } */ > diff --git a/gcc/testsuite/gcc.dg/vect/vect-gather-3.c > b/gcc/testsuite/gcc.dg/vect/vect-gather-3.c > index adfef3bf407..30ba6789e03 100644 > --- a/gcc/testsuite/gcc.dg/vect/vect-gather-3.c > +++ b/gcc/testsuite/gcc.dg/vect/vect-gather-3.c > @@ -62,4 +62,4 @@ main (void) > return 0; > } > > -/* { dg-final { scan-tree-dump "Loop contains only SLP stmts" vect { target > { vect_gather_load_ifn && vect_masked_load } } } } */ > +/* { dg-final { scan-tree-dump "Loop contains only SLP stmts" vect { target > { { vect_gather_load_ifn || avx2 } && vect_masked_load } } } } */ > diff --git a/gcc/testsuite/gcc.dg/vect/vect-gather-4.c > b/gcc/testsuite/gcc.dg/vect/vect-gather-4.c > index ee2e4e4999a..1ce63e69199 100644 > --- a/gcc/testsuite/gcc.dg/vect/vect-gather-4.c > +++ b/gcc/testsuite/gcc.dg/vect/vect-gather-4.c > @@ -39,10 +39,10 @@ f3 (int *restrict y, int *restrict x, int *restrict > indices) > y[i * 2] = (indices[i * 2] < N * 2 > ? x[indices[i * 2]] + 1 > : 1); > - y[i * 2 + 1] = (indices[i * 2 + 1] < N * 2 > - ? x[(unsigned int) indices[i * 2 + 1]] + 2 > + y[i * 2 + 1] = (((unsigned int *)indices)[i * 2 + 1] < N * 2 > + ? x[((unsigned int *) indices)[i * 2 + 1]] + 2 > : 2); > } > } > > -/* { dg-final { scan-tree-dump-not "Loop contains only SLP stmts" vect { > target vect_gather_load_ifn } } } */ > +/* { dg-final { scan-tree-dump-not "Loop contains only SLP stmts" vect } } */ > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc > index ebab1953b9c..8877ebde246 100644 > --- a/gcc/tree-vect-loop.cc > +++ b/gcc/tree-vect-loop.cc > @@ -11362,8 +11362,7 @@ update_epilogue_loop_vinfo (class loop *epilogue, > tree advance) > updated offset we set using ADVANCE. Instead we have to make sure the > reference in the data references point to the corresponding copy of > the original in the epilogue. 
*/ > - if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo)) > - == VMAT_GATHER_SCATTER) > + if (STMT_VINFO_GATHER_SCATTER_P (vect_stmt_to_vectorize (stmt_vinfo))) > { > DR_REF (dr) > = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE, > @@ -11372,6 +11371,9 @@ update_epilogue_loop_vinfo (class loop *epilogue, > tree advance) > = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE, > &find_in_mapping, &mapping); > } > + else > + gcc_assert (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize > (stmt_vinfo)) > + != VMAT_GATHER_SCATTER); > DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo); > stmt_vinfo->dr_aux.stmt = stmt_vinfo; > /* The vector size of the epilogue is smaller than that of the main > loop > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc > index d081999a763..8efff2e912d 100644 > --- a/gcc/tree-vect-slp.cc > +++ b/gcc/tree-vect-slp.cc > @@ -283,10 +283,11 @@ typedef struct _slp_oprnd_info > vec<tree> ops; > /* Information about the first statement, its vector def-type, type, the > operand itself in case it's constant, and an indication if it's a > pattern > - stmt. */ > + stmt and gather/scatter info. */ > tree first_op_type; > enum vect_def_type first_dt; > bool any_pattern; > + gather_scatter_info first_gs_info; > } *slp_oprnd_info; > > > @@ -609,6 +610,7 @@ vect_get_and_check_slp_defs (vec_info *vinfo, unsigned > char swap, > unsigned int i, number_of_oprnds; > enum vect_def_type dt = vect_uninitialized_def; > slp_oprnd_info oprnd_info; > + gather_scatter_info gs_info; > unsigned int commutative_op = -1U; > bool first = stmt_num == 0; > > @@ -660,6 +662,19 @@ vect_get_and_check_slp_defs (vec_info *vinfo, unsigned > char swap, > > oprnd_info = (*oprnds_info)[i]; > > + if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) > + { > + gcc_assert (number_of_oprnds == 1); > + if (!is_a <loop_vec_info> (vinfo) > + || !vect_check_gather_scatter (stmt_info, > + as_a <loop_vec_info> (vinfo), > + first ? 
&oprnd_info->first_gs_info > + : &gs_info)) > + return -1; > + > + oprnd = first ? oprnd_info->first_gs_info.offset : gs_info.offset; > + } > + > stmt_vec_info def_stmt_info; > if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info)) > { > @@ -792,6 +807,25 @@ vect_get_and_check_slp_defs (vec_info *vinfo, unsigned > char swap, > return 1; > } > > + if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) > + { > + if (!operand_equal_p (oprnd_info->first_gs_info.base, > + gs_info.base)) > + { > + if (dump_enabled_p ()) > + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > + "Build SLP failed: different gather base\n"); > + return 1; > + } > + if (oprnd_info->first_gs_info.scale != gs_info.scale) > + { > + if (dump_enabled_p ()) > + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > + "Build SLP failed: different gather scale\n"); > + return 1; > + } > + } > + > /* Not first stmt of the group, check that the def-stmt/s match > the def-stmt/s of the first stmt. Allow different definition > types for reduction chains: the first stmt must be a > @@ -1235,6 +1269,9 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char > *swap, > || rhs_code == INDIRECT_REF > || rhs_code == COMPONENT_REF > || rhs_code == MEM_REF))) > + || (ldst_p > + && (STMT_VINFO_GATHER_SCATTER_P (stmt_info) > + != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info))) > || first_stmt_ldst_p != ldst_p > || first_stmt_phi_p != phi_p) > { > @@ -1357,12 +1394,12 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char > *swap, > if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)) > && rhs_code != CFN_GATHER_LOAD > && rhs_code != CFN_MASK_GATHER_LOAD > + && !STMT_VINFO_GATHER_SCATTER_P (stmt_info) > /* Not grouped loads are handled as externals for BB > vectorization. For loop vectorization we can handle > splats the same we handle single element interleaving. 
*/ > && (is_a <bb_vec_info> (vinfo) > - || stmt_info != first_stmt_info > - || STMT_VINFO_GATHER_SCATTER_P (stmt_info))) > + || stmt_info != first_stmt_info)) > { > /* Not grouped load. */ > if (dump_enabled_p ()) > @@ -1858,6 +1895,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node, > gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD) > || gimple_call_internal_p (stmt, IFN_GATHER_LOAD) > || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)); > + else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) > + gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))); > else > { > *max_nunits = this_max_nunits; > -- > 2.35.3 ----------------- Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955