On Wed, Aug 17, 2016 at 12:52 PM, Alan Hayward <alan.hayw...@arm.com> wrote: > > > On 16/08/2016 10:01, "Alan Hayward" <alan.hayw...@arm.com> wrote: > >> >> >>On 16/08/2016 09:33, "Richard Biener" <richard.guent...@gmail.com> wrote: >> >>>On Mon, Aug 15, 2016 at 4:16 PM, Alan Hayward <alan.hayw...@arm.com> >>>wrote: >>>> >>>> >>>> On 15/08/2016 12:17, "Richard Biener" <richard.guent...@gmail.com> >>>>wrote: >>>> >>>>>On Mon, Aug 15, 2016 at 11:48 AM, Alan Hayward <alan.hayw...@arm.com> >>>>>wrote: >>>>>> The testcase pr71752.c was failing because the SLP code was >>>>>>generating >>>>>>an >>>>>> SLP >>>>>> vector using arguments from the SLP scalar stmts, but was using the >>>>>>wrong >>>>>> argument number. >>>>>> >>>>>> vect_get_slp_defs() is given a vector of operands. When calling down >>>>>>to >>>>>> vect_get_constant_vectors it uses i as op_num - making the assumption >>>>>>that >>>>>> the >>>>>> first op in the vector refers to the first argument in the SLP scalar >>>>>> statement, the second op refers to the second arg and so on. >>>>>> >>>>>> However, previously in vectorizable_reduction, the call to >>>>>> vect_get_vec_defs >>>>>> destroyed this ordering by potentially only passing op1. >>>>>> >>>>>> The solution is in vectorizable_reduction to create a vector of >>>>>>operands >>>>>> equal >>>>>> in size to the number of arguments in the SLP statements. We maintain >>>>>>the >>>>>> argument ordering and if we don't require defs for that argument we >>>>>>instead >>>>>> push NULL into the vector. In vect_get_slp_defs we need to handle >>>>>>cases >>>>>> where >>>>>> an op might be NULL. >>>>>> >>>>>> Tested with a check run on X86 and AArch64. >>>>>> Ok to commit? >>>>>> >>>>> >>>>>Ugh. Note the logic in vect_get_slp_defs is incredibly fragile - I >>>>>think you can't >>>>>simply "skip" ops the way you do as you need to still increment >>>>>child_index >>>>>accordingly for ignored ops. >>>> >>>> Agreed, I should be maintaining the child_index. >>>> Looking at the code, I need a valid oprnd for that code to work. >>>> >>>> Given that the size of the ops vector is never more than 3, it probably >>>> makes >>>> sense to reset child_index each time and do a quick for loop through >>>>all >>>> the >>>> children. >>> >>>Hmm, yes - that should work. It should also remove the need to keep >>>operand order in sync? So it might no longer require the >>>vectorizable_reduction change. >> >>We still need to pass the correct index down to >>vect_get_constant_vectors(), >>which requires the operands in the original order for i to be correct. >>(We can’t use child_index for that). >> >>Another option would be to pass down an additional vector containing the >>indexes >>of the operands. But I think that’d be an even worse option. >> > > Updated with a loop for calculating child_index. > Tested with a check run for x86 and aarch64.
LGTM. Thanks, Richard. > > diff --git a/gcc/testsuite/gcc.dg/vect/pr71752.c > b/gcc/testsuite/gcc.dg/vect/pr71752.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..8d26754b4fedf8b104caae8742a445dff > bf23f0a > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/pr71752.c > @@ -0,0 +1,19 @@ > +/* { dg-do compile } */ > + > +unsigned int q4, yg; > + > +unsigned int > +w6 (unsigned int z5, unsigned int jv) > +{ > + unsigned int *f2 = &jv; > + > + while (*f2 < 21) > + { > + q4 -= jv; > + z5 -= jv; > + f2 = &yg; > + ++(*f2); > + } > + return z5; > +} > + > diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c > index > 2a7e0c6661bc1ba82c9f03720e550749f2252a7c..826481af3d1d8b29bcdbd7d81c0fd5a85 > 9ecd9b0 100644 > --- a/gcc/tree-vect-loop.c > +++ b/gcc/tree-vect-loop.c > @@ -5364,7 +5364,7 @@ vectorizable_reduction (gimple *stmt, > gimple_stmt_iterator *gsi, > auto_vec<tree> vect_defs; > auto_vec<gimple *> phis; > int vec_num; > - tree def0, def1, tem, op0, op1 = NULL_TREE; > + tree def0, def1, tem, op1 = NULL_TREE; > bool first_p = true; > tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE; > gimple *cond_expr_induction_def_stmt = NULL; > @@ -5964,29 +5964,36 @@ vectorizable_reduction (gimple *stmt, > gimple_stmt_iterator *gsi, > /* Handle uses. */ > if (j == 0) > { > - op0 = ops[!reduc_index]; > - if (op_type == ternary_op) > - { > - if (reduc_index == 0) > - op1 = ops[2]; > - else > - op1 = ops[1]; > - } > + if (slp_node) > + { > + /* Get vec defs for all the operands except the reduction index, > + ensuring the ordering of the ops in the vector is kept. */ > + auto_vec<tree, 3> slp_ops; > + auto_vec<vec<tree>, 3> vec_defs; > > - if (slp_node) > - vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1, > - slp_node, -1); > + slp_ops.quick_push ((reduc_index == 0) ? NULL : ops[0]); > + slp_ops.quick_push ((reduc_index == 1) ? NULL : ops[1]); > + if (op_type == ternary_op) > + slp_ops.quick_push ((reduc_index == 2) ? NULL : ops[2]); > + > + vect_get_slp_defs (slp_ops, slp_node, &vec_defs, -1); > + > + vec_oprnds0.safe_splice (vec_defs[(reduc_index == 0) ? 1 : 0]); > + if (op_type == ternary_op) > + vec_oprnds1.safe_splice (vec_defs[(reduc_index == 2) ? 1 : > 2]); > + } > else > - { > + { > loop_vec_def0 = vect_get_vec_def_for_operand > (ops[!reduc_index], > stmt); > vec_oprnds0.quick_push (loop_vec_def0); > if (op_type == ternary_op) > { > + op1 = (reduc_index == 0) ? ops[2] : ops[1]; > loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt); > vec_oprnds1.quick_push (loop_vec_def1); > } > - } > + } > } > else > { > diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c > index > fb325d54f1084461d44cd54a98e5b7f99541a188..5a611e42556e0931a372a6c626cc24846 > f11029d 100644 > --- a/gcc/tree-vect-slp.c > +++ b/gcc/tree-vect-slp.c > @@ -3194,24 +3194,32 @@ vect_get_slp_defs (vec<tree> ops, slp_tree > slp_node, > { > gimple *first_stmt; > int number_of_vects = 0, i; > - unsigned int child_index = 0; > HOST_WIDE_INT lhs_size_unit, rhs_size_unit; > slp_tree child = NULL; > vec<tree> vec_defs; > tree oprnd; > - bool vectorized_defs; > + bool first_iteration = true; > > first_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[0]; > FOR_EACH_VEC_ELT (ops, i, oprnd) > { > + bool vectorized_defs = false; > + > + if (oprnd == NULL) > + { > + vec_defs = vNULL; > + vec_defs.create (0); > + vec_oprnds->quick_push (vec_defs); > + continue; > + } > + > /* For each operand we check if it has vectorized definitions in a > child > node or we need to create them (for invariants and constants). We > check if the LHS of the first stmt of the next child matches OPRND. > If it does, we found the correct child. Otherwise, we call > - vect_get_constant_vectors (), and not advance CHILD_INDEX in order > - to check this child node for the next operand. */ > - vectorized_defs = false; > - if (SLP_TREE_CHILDREN (slp_node).length () > child_index) > + vect_get_constant_vectors (). */ > + for (unsigned int child_index = 0; > + child_index < SLP_TREE_CHILDREN (slp_node).length (); > child_index++) > { > child = SLP_TREE_CHILDREN (slp_node)[child_index]; > > @@ -3231,30 +3239,25 @@ vect_get_slp_defs (vec<tree> ops, slp_tree > slp_node, > statements. */ > number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child); > vectorized_defs = true; > - child_index++; > + break; > } > } > - else > - child_index++; > } > > - if (!vectorized_defs) > - { > - if (i == 0) > - { > - number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); > - /* Number of vector stmts was calculated according to LHS in > - vect_schedule_slp_instance (), fix it by replacing LHS > with > - RHS, if necessary. See vect_get_smallest_scalar_type () > for > - details. */ > - vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit, > - &rhs_size_unit); > - if (rhs_size_unit != lhs_size_unit) > - { > - number_of_vects *= rhs_size_unit; > - number_of_vects /= lhs_size_unit; > - } > - } > + if (!vectorized_defs && first_iteration) > + { > + number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); > + /* Number of vector stmts was calculated according to LHS in > + vect_schedule_slp_instance (), fix it by replacing LHS with > + RHS, if necessary. See vect_get_smallest_scalar_type () for > + details. */ > + vect_get_smallest_scalar_type (first_stmt, &lhs_size_unit, > + &rhs_size_unit); > + if (rhs_size_unit != lhs_size_unit) > + { > + number_of_vects *= rhs_size_unit; > + number_of_vects /= lhs_size_unit; > + } > } > > /* Allocate memory for vectorized defs. */ > @@ -3276,6 +3279,8 @@ vect_get_slp_defs (vec<tree> ops, slp_tree slp_node, > /* For reductions, we only need initial values. */ > if (reduc_index != -1) > return; > + > + first_iteration = false; > } > } > > >