Hi,

this implements support for strided grouped stores in the non-SLP case (the SLP case existed already). Before, we were ignoring all but the last store in a group, which led to a miscompile of GemsFDTD; the testcase reflects that situation.
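For illustration (a minimal sketch, not part of the patch; the helper name is made up, and the attached testcase covers the real situation): storing a _Complex double with a runtime stride, as below, becomes a group of two adjacent scalar stores per iteration, and the old non-SLP strided-store path emitted only one store of each such group.

  /* Illustrative sketch only: each iteration is a group of two adjacent
     double stores with a runtime stride between groups.  Before this fix
     the non-SLP strided path kept only the last store of the group.  */
  void
  example_store (_Complex double *a, double re, double im, int stride, int n)
  {
    int i;
    for (i = 0; i < n; i++)
      {
        __real__ a[i*stride] = re;      /* group element 0 */
        __imag__ a[i*stride] = im;      /* group element 1 */
      }
  }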
Also, since r224511 yesterday, grouped strided non-SLP loads have been broken: all loads in a group were using the same base address, which is okay only for the SLP case as the code is structured right now (only the SLP case uses the permutation path; non-SLP emits scalar loads directly).

Regstrapping on x86-64-linux is in progress; okay if that passes?

Ciao,
Michael.

        PR middle-end/66253
        * tree-vect-stmts.c (vectorizable_store): Implement non-SLP
        grouped strided stores.
        (vectorizable_load): Don't use the DR from first_stmt in
        the non-SLP grouped strided case.

testsuite/
        * gcc.dg/vect/pr66253.c: New testcase.

Index: tree-vect-stmts.c
===================================================================
--- tree-vect-stmts.c   (revision 224562)
+++ tree-vect-stmts.c   (working copy)
@@ -5262,16 +5262,17 @@ vectorizable_store (gimple stmt, gimple_
       gimple_seq stmts = NULL;
       tree stride_base, stride_step, alias_off;
       tree vec_oprnd;
+      unsigned int g;
 
       gcc_assert (!nested_in_vect_loop_p (loop, stmt));
 
       stride_base
        = fold_build_pointer_plus
-           (unshare_expr (DR_BASE_ADDRESS (dr)),
+           (unshare_expr (DR_BASE_ADDRESS (first_dr)),
            size_binop (PLUS_EXPR,
-                       convert_to_ptrofftype (unshare_expr (DR_OFFSET (dr))),
-                       convert_to_ptrofftype (DR_INIT(dr))));
-      stride_step = fold_convert (sizetype, unshare_expr (DR_STEP (dr)));
+                       convert_to_ptrofftype (unshare_expr (DR_OFFSET (first_dr))),
+                       convert_to_ptrofftype (DR_INIT(first_dr))));
+      stride_step = fold_convert (sizetype, unshare_expr (DR_STEP (first_dr)));
 
       /* For a store with loop-invariant (but other than power-of-2)
         stride (i.e. not a grouped access) like so:
@@ -5302,6 +5303,7 @@ vectorizable_store (gimple stmt, gimple_
           ltype = vectype;
           ltype = build_aligned_type (ltype, TYPE_ALIGN (elem_type));
           ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
+          group_size = 1;
         }
 
       ivstep = stride_step;
@@ -5322,65 +5324,89 @@ vectorizable_store (gimple stmt, gimple_
         gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
 
       prev_stmt_info = NULL;
-      running_off = offvar;
-      alias_off = build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0);
-      for (j = 0; j < ncopies; j++)
+      alias_off = build_int_cst (reference_alias_ptr_type (DR_REF (first_dr)), 0);
+      next_stmt = first_stmt;
+      for (g = 0; g < group_size; g++)
         {
-          /* We've set op and dt above, from gimple_assign_rhs1(stmt),
-             and first_stmt == stmt.  */
-          if (j == 0)
-            {
-              if (slp)
-                {
-                  vect_get_vec_defs (op, NULL_TREE, stmt, &vec_oprnds, NULL,
-                                     slp_node, -1);
-                  vec_oprnd = vec_oprnds[0];
-                }
-              else
-                vec_oprnd = vect_get_vec_def_for_operand (op, first_stmt, NULL);
-            }
-          else
+          running_off = offvar;
+          if (g)
            {
-              if (slp)
-                vec_oprnd = vec_oprnds[j];
-              else
-                vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, vec_oprnd);
-            }
-
-          for (i = 0; i < nstores; i++)
-            {
-              tree newref, newoff;
-              gimple incr, assign;
-              tree size = TYPE_SIZE (ltype);
-              /* Extract the i'th component.  */
-              tree pos = fold_build2 (MULT_EXPR, bitsizetype, bitsize_int (i),
+              tree size = TYPE_SIZE_UNIT (ltype);
+              tree pos = fold_build2 (MULT_EXPR, sizetype, size_int (g),
                                       size);
-              tree elem = fold_build3 (BIT_FIELD_REF, ltype, vec_oprnd,
-                                       size, pos);
-
-              elem = force_gimple_operand_gsi (gsi, elem, true,
-                                               NULL_TREE, true,
-                                               GSI_SAME_STMT);
-
-              newref = build2 (MEM_REF, ltype,
-                               running_off, alias_off);
-
-              /* And store it to *running_off.  */
-              assign = gimple_build_assign (newref, elem);
-              vect_finish_stmt_generation (stmt, assign, gsi);
-
-              newoff = copy_ssa_name (running_off, NULL);
+              tree newoff = copy_ssa_name (running_off, NULL);
               incr = gimple_build_assign (newoff, POINTER_PLUS_EXPR,
-                                          running_off, stride_step);
+                                          running_off, pos);
               vect_finish_stmt_generation (stmt, incr, gsi);
-              running_off = newoff;
-              if (j == 0 && i == 0)
-                STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = assign;
+            }
+          for (j = 0; j < ncopies; j++)
+            {
+              /* We've set op and dt above, from gimple_assign_rhs1(stmt),
+                 and first_stmt == stmt.  */
+              if (j == 0)
+                {
+                  if (slp)
+                    {
+                      vect_get_vec_defs (op, NULL_TREE, stmt, &vec_oprnds, NULL,
+                                         slp_node, -1);
+                      vec_oprnd = vec_oprnds[0];
+                    }
+                  else
+                    {
+                      gcc_assert (gimple_assign_single_p (next_stmt));
+                      op = gimple_assign_rhs1 (next_stmt);
+                      vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt,
+                                                                NULL);
+                    }
+                }
               else
-                STMT_VINFO_RELATED_STMT (prev_stmt_info) = assign;
-              prev_stmt_info = vinfo_for_stmt (assign);
+                {
+                  if (slp)
+                    vec_oprnd = vec_oprnds[j];
+                  else
+                    vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, vec_oprnd);
+                }
+
+              for (i = 0; i < nstores; i++)
+                {
+                  tree newref, newoff;
+                  gimple incr, assign;
+                  tree size = TYPE_SIZE (ltype);
+                  /* Extract the i'th component.  */
+                  tree pos = fold_build2 (MULT_EXPR, bitsizetype,
+                                          bitsize_int (i), size);
+                  tree elem = fold_build3 (BIT_FIELD_REF, ltype, vec_oprnd,
+                                           size, pos);
+
+                  elem = force_gimple_operand_gsi (gsi, elem, true,
+                                                   NULL_TREE, true,
+                                                   GSI_SAME_STMT);
+
+                  newref = build2 (MEM_REF, ltype,
+                                   running_off, alias_off);
+
+                  /* And store it to *running_off.  */
+                  assign = gimple_build_assign (newref, elem);
+                  vect_finish_stmt_generation (stmt, assign, gsi);
+
+                  newoff = copy_ssa_name (running_off, NULL);
+                  incr = gimple_build_assign (newoff, POINTER_PLUS_EXPR,
+                                              running_off, stride_step);
+                  vect_finish_stmt_generation (stmt, incr, gsi);
+
+                  running_off = newoff;
+                  if (g == group_size - 1)
+                    {
+                      if (j == 0 && i == 0)
+                        STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = assign;
+                      else
+                        STMT_VINFO_RELATED_STMT (prev_stmt_info) = assign;
+                      prev_stmt_info = vinfo_for_stmt (assign);
+                    }
+                }
             }
+          next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
         }
       return true;
     }
@@ -6265,7 +6291,7 @@ vectorizable_load (gimple stmt, gimple_s
 
       gcc_assert (!nested_in_vect_loop);
 
-      if (grouped_load)
+      if (slp && grouped_load)
        first_dr = STMT_VINFO_DATA_REF
            (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
       else
Index: testsuite/gcc.dg/vect/pr66253.c
===================================================================
--- testsuite/gcc.dg/vect/pr66253.c     (revision 0)
+++ testsuite/gcc.dg/vect/pr66253.c     (working copy)
@@ -0,0 +1,51 @@
+/* { dg-require-effective-target vect_double } */
+/* { dg-require-effective-target vect_hw_misalign } */
+
+#include "tree-vect.h"
+
+void __attribute__((noinline,noclone))
+test1(_Complex double * __restrict__ a, _Complex double * __restrict__ b,
+      double * __restrict__ c, int stride, int n)
+{
+  int i;
+  for (i = 0; i < n; i++)
+    {
+      a[i*stride] = 0.5 * b[i*stride] * c[i*stride];
+    }
+}
+
+double ca[256];
+_Complex double ia[256];
+_Complex double da[256];
+
+extern void abort (void);
+
+int main ()
+{
+  int i;
+  int stride;
+
+  check_vect ();
+
+  for (stride = 1; stride < 15; stride++)
+    {
+      for (i = 0; i < 256; i++)
+        {
+          __real__ ia[i] = (i + stride) % 19;
+          __imag__ ia[i] = (i + stride) % 23;
+          ca[i] = (i + stride) % 29;
+          __asm__ volatile ("");
+        }
+
+      test1(da, ia, ca, stride, 256/stride);
+
+      for (i = 0; i < 256/stride; i++)
+        {
+          if (da[i*stride] != 0.5 * ia[i*stride] * ca[i*stride])
+            abort ();
+        }
+    }
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
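For clarity, and purely as an illustration (nothing below is part of the patch; the helper name and the two-lane factor are made up): for a group of two doubles the scalarized stores that the new non-SLP path emits behave roughly like this, walking the group elements in the outer dimension and advancing by the full group stride per scalar lane.

  /* Illustrative sketch only: scalar-equivalent shape of the emitted
     stores for a two-element group with a runtime stride, assuming two
     group instances (lanes) per vector copy.  */
  static void
  scalarized_group_store (double *base, long step, /* group stride in doubles */
                          const double v0[2], const double v1[2])
  {
    int lane;
    /* Group element 0: starts at base, steps by the full group stride.  */
    for (lane = 0; lane < 2; lane++)
      base[lane * step + 0] = v0[lane];
    /* Group element 1: same walk, offset by one element.  */
    for (lane = 0; lane < 2; lane++)
      base[lane * step + 1] = v1[lane];
  }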