On Fri, 24 Nov 2023, Tamar Christina wrote:

> Good morning,
> 
> This is a respun patch with a fix for VLA.
> 
> This adds support to vectorizable_live_operation to handle multiple exits by
> searching for the exit in which the live value should be materialized.
> 
> Additionally, which value in the index we're after depends on whether the exit
> it's materialized in is an early exit or whether the loop's main exit is
> different from the loop's natural one (i.e. the one with the same src block as
> the latch).
> 
> In those two cases we want the first rather than the last value as we're going
> to restart the iteration in the scalar loop.  For VLA this means we need to
> reverse both the mask and vector since there's only a way to get the last
> active element and not the first.
> 
> For inductions and multiple exits:
>   - we test if the target will support vectorizing the induction
>   - mark all inductions in the loop as relevant
>   - we perform codegen for non-live inductions during vect_transform_loop
>   - an induction live on an early exit gets the first element rather than
>     the last.
> 
> For reductions and multiple exits:
>   - Reductions for early exits reduce the reduction definition statement
>     rather than the reduction step.  This allows us to get the value at the
>     start of the iteration.
>   - The peeling layout means that we just have to update one block, the merge
>     block.  We expect all the reductions to be the same but we leave it up to
>     the value numbering to clean up any duplicate code as we iterate over all
>     edges.
> 
> These two changes fix the reduction codegen reported before; the testcase has
> been added to the testsuite for early vect.
> 
> Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
>       * tree-vect-loop.cc (vectorizable_live_operation): Support early exits.
>       (vect_analyze_loop_operations): Check if target supports vectorizing IV.
>       (vect_transform_loop): Call vectorizable_live_operation for non-live
>       inductions or reductions.
>       (find_connected_edge, vectorizable_live_operation_1): New.
>       (vect_create_epilog_for_reduction): Support reductions in early break.
>       * tree-vect-stmts.cc (perm_mask_for_reverse): Expose.
>       (vect_stmt_relevant_p): Mark all inductions when early break as being
>       relevant.
>       * tree-vectorizer.h (perm_mask_for_reverse): Expose.
>       (vect_iv_increment_position): New.
>       * tree-vect-loop-manip.cc (vect_iv_increment_position): Expose.
> 
> --- inline copy of patch ---
> 
> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> index 476be8a0bb6da2d06c4ca7052cb07bacecca60b1..1a4ba349fb6ae39c79401aecd4e7eaaaa9e2b8a0 100644
> --- a/gcc/tree-vect-loop-manip.cc
> +++ b/gcc/tree-vect-loop-manip.cc
> @@ -453,7 +453,7 @@ vect_adjust_loop_lens_control (tree iv_type, gimple_seq *seq,
>     INSERT_AFTER is set to true if the increment should be inserted after
>     *BSI.  */
>  
> -static void
> +void
>  vect_iv_increment_position (edge loop_exit, gimple_stmt_iterator *bsi,
>                           bool *insert_after)
>  {
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 8a50380de49bc12105be47ea1d8ee3cf1f2bdab4..b42318b2999e6a27e6983382190792602cb25af1 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -2163,6 +2163,15 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
>           ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
>                                             -1, false, &cost_vec);
>  
> +       /* Check if we can perform the operation for early break if we force
> +          the live operation.  */
> +       if (ok
> +           && LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
> +           && !STMT_VINFO_LIVE_P (stmt_info)
> +           && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
> +         ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
> +                                           -1, false, &cost_vec);

can you add && !PURE_SLP_STMT?
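
I.e. something along these lines (an untested sketch; it just adds the
requested check to the condition quoted above):

  if (ok
      && LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
      && !PURE_SLP_STMT (stmt_info)
      && !STMT_VINFO_LIVE_P (stmt_info)
      && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
    ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
                                      -1, false, &cost_vec);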

> +
>            if (!ok)
>           return opt_result::failure_at (phi,
>                                          "not vectorized: relevant phi not "
> @@ -5842,6 +5851,10 @@ vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
>     SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
>     REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
>       (counting from 0)
> +   LOOP_EXIT is the edge to update in the merge block.  In the case of a
> +     single exit this edge is always the main loop exit.
> +   MAIN_EXIT_P indicates whether we are updating the main exit or an
> +     alternate exit.  This determines whether we use the final or original
> +     value.
>  
>     This function:
>     1. Completes the reduction def-use cycles.
> @@ -5882,7 +5895,9 @@ static void
>  vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>                                 stmt_vec_info stmt_info,
>                                 slp_tree slp_node,
> -                               slp_instance slp_node_instance)
> +                               slp_instance slp_node_instance,
> +                               edge loop_exit,
> +                               bool main_exit_p = true)

isn't main_exit_p computable from 'loop_exit' by comparing that to
the one recorded in loop_vinfo?  If so please do that instead of passing
in another argument.
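
I.e. something like (a sketch, assuming LOOP_VINFO_IV_EXIT is the exit
recorded in loop_vinfo):

  bool main_exit_p = loop_exit == LOOP_VINFO_IV_EXIT (loop_vinfo);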

>  {
>    stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
>    gcc_assert (reduc_info->is_reduc_info);
> @@ -6053,7 +6068,7 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>        /* Create an induction variable.  */
>        gimple_stmt_iterator incr_gsi;
>        bool insert_after;
> -      standard_iv_increment_position (loop, &incr_gsi, &insert_after);
> +      vect_iv_increment_position (loop_exit, &incr_gsi, &insert_after);
>        create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
>                insert_after, &indx_before_incr, &indx_after_incr);
>  
> @@ -6132,23 +6147,30 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>           Store them in NEW_PHIS.  */
>    if (double_reduc)
>      loop = outer_loop;
> -  exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
> +  /* We need to reduce values in all exits.  */
> +  exit_bb = loop_exit->dest;
>    exit_gsi = gsi_after_labels (exit_bb);
>    reduc_inputs.create (slp_node ? vec_num : ncopies);
> +  vec <gimple *> vec_stmts;
> +  if (main_exit_p)
> +    vec_stmts = STMT_VINFO_VEC_STMTS (rdef_info);
> +  else
> +    vec_stmts = STMT_VINFO_VEC_STMTS (STMT_VINFO_REDUC_DEF (rdef_info));

Both would be wrong for SLP; also I think you need to look at
STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))?  For SLP the
PHI SLP node is reached via slp_node_instance->reduc_phis.

I think an overall better structure would be to add a

vect_get_vect_def (stmt_vec_info, slp_tree, unsigned);

abstracting SLP and non-SLP and doing

  for (unsigned i = 0; i < vec_num * ncopies; ++i)
    {
      def = vect_get_vect_def (stmt_info, slp_node, i);
...
    }

and then adjusting stmt_info/slp_node according to main_exit_p?
(would be nice to transition stmt_info->vec_stmts to stmt_info->vec_defs)
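
A rough sketch of the helper I have in mind (untested; the parameter names
are just for illustration, and the SLP path reuses the existing
vect_get_slp_vect_def):

  static tree
  vect_get_vect_def (stmt_vec_info reduc_info, slp_tree slp_node, unsigned i)
  {
    if (slp_node)
      return vect_get_slp_vect_def (slp_node, i);
    else
      return gimple_get_lhs (STMT_VINFO_VEC_STMTS (reduc_info)[i]);
  }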

That said, wherever possible please think of SLP ;)

> +
>    for (unsigned i = 0; i < vec_num; i++)
>      {
>        gimple_seq stmts = NULL;
>        if (slp_node)
>       def = vect_get_slp_vect_def (slp_node, i);
>        else
> -     def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
> +     def = gimple_get_lhs (vec_stmts[0]);
>        for (j = 0; j < ncopies; j++)
>       {
>         tree new_def = copy_ssa_name (def);
>         phi = create_phi_node (new_def, exit_bb);
>         if (j)
> -         def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
> -       SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, def);
> +         def = gimple_get_lhs (vec_stmts[j]);
> +       SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
>         new_def = gimple_convert (&stmts, vectype, new_def);
>         reduc_inputs.quick_push (new_def);
>       }
> @@ -6885,7 +6907,20 @@ vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
>            FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
>           {
>             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
> -             SET_USE (use_p, scalar_result);
> +             {
> +               gimple *stmt = USE_STMT (use_p);
> +               if (main_exit_p)
> +                 SET_USE (use_p, scalar_result);
> +               else if (is_a <gphi *> (stmt))
> +                 {
> +                   /* If an early exit only update usages in the merge
> +                      block.  */

Shouldn't that be the only use at this point anyway?  You only
update uses in PHI nodes btw., and you can use SET_USE; maybe
you wanted to check that
gimple_phi_arg_edge (stmt, phi_arg_index_from_use (use_p)) == merge_e
instead?
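
I.e. roughly (untested sketch, using the names from the hunk above):

  gphi *use_phi = as_a <gphi *> (stmt);
  if (gimple_phi_arg_edge (use_phi,
                           phi_arg_index_from_use (use_p)) == merge_e)
    SET_USE (use_p, scalar_result);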

That said, the comment could be more precise

Are we calling vect_create_epilog_for_reduction for each early exit?
I suppose not?

> +                   edge merge_e = single_succ_edge (loop_exit->dest);
> +                   if (gimple_bb (stmt) != merge_e->dest)
> +                     continue;
> +                   SET_PHI_ARG_DEF (stmt, merge_e->dest_idx, scalar_result);
> +                 }
> +             }
>             update_stmt (use_stmt);
>           }
>          }
> @@ -10481,6 +10516,156 @@ vectorizable_induction (loop_vec_info loop_vinfo,
>    return true;
>  }
>  
> +/* Function vectorizable_live_operation_1.
> +
> +   Helper function for vectorizable_live_operation.  */
> +
> +tree
> +vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
> +                            stmt_vec_info stmt_info, edge exit_e,
> +                            tree vectype, int ncopies, slp_tree slp_node,
> +                            tree bitsize, tree bitstart, tree vec_lhs,
> +                            tree lhs_type, bool restart_loop,
> +                            gimple_stmt_iterator *exit_gsi)
> +{
> +  basic_block exit_bb = exit_e->dest;
> +  gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
> +
> +  tree vec_lhs_phi = copy_ssa_name (vec_lhs);
> +  gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
> +  for (unsigned i = 0; i < gimple_phi_num_args (phi); i++)
> +    SET_PHI_ARG_DEF (phi, i, vec_lhs);
> +
> +  gimple_seq stmts = NULL;
> +  tree new_tree;
> +  if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> +    {
> +      /* Emit:
> +
> +      SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
> +
> +      where VEC_LHS is the vectorized live-out result and MASK is
> +      the loop mask for the final iteration.  */
> +      gcc_assert (ncopies == 1 && !slp_node);
> +      gimple_seq tem = NULL;
> +      gimple_stmt_iterator gsi = gsi_last (tem);
> +      tree len = vect_get_loop_len (loop_vinfo, &gsi,
> +                                 &LOOP_VINFO_LENS (loop_vinfo),
> +                                 1, vectype, 0, 0);
> +
> +      /* BIAS - 1.  */
> +      signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +      tree bias_minus_one
> +     = int_const_binop (MINUS_EXPR,
> +                        build_int_cst (TREE_TYPE (len), biasval),
> +                        build_one_cst (TREE_TYPE (len)));
> +
> +      /* LAST_INDEX = LEN + (BIAS - 1).  */
> +      tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
> +                                  len, bias_minus_one);
> +
> +      /* This needs to implement extraction of the first index, but it's not
> +      clear how the LEN stuff works.  At the moment we shouldn't get here
> +      since there's no LEN support for early breaks.  But guard this so
> +      there's no incorrect codegen.  */
> +      gcc_assert (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
> +
> +      /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
> +      tree scalar_res
> +     = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
> +                     vec_lhs_phi, last_index);
> +
> +      /* Convert the extracted vector element to the scalar type.  */
> +      new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> +    }
> +  else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> +    {
> +      /* Emit:
> +
> +      SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
> +
> +      where VEC_LHS is the vectorized live-out result and MASK is
> +      the loop mask for the final iteration.  */
> +      gcc_assert (!slp_node);
> +      tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
> +      gimple_seq tem = NULL;
> +      gimple_stmt_iterator gsi = gsi_last (tem);
> +      tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
> +                                   &LOOP_VINFO_MASKS (loop_vinfo),
> +                                   1, vectype, 0);
> +      tree scalar_res;
> +
> +      /* For an inverted control flow with early breaks we want EXTRACT_FIRST
> +      instead of EXTRACT_LAST.  Emulate by reversing the vector and mask. */
> +      if (restart_loop && LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> +     {
> +       /* First create the permuted mask.  */
> +       tree perm_mask = perm_mask_for_reverse (TREE_TYPE (mask));
> +       tree perm_dest = copy_ssa_name (mask);
> +       gimple *perm_stmt
> +             = gimple_build_assign (perm_dest, VEC_PERM_EXPR, mask,
> +                                    mask, perm_mask);
> +       vect_finish_stmt_generation (loop_vinfo, stmt_info, perm_stmt,
> +                                    &gsi);
> +       mask = perm_dest;
> +
> +       /* Then permute the vector contents.  */
> +       tree perm_elem = perm_mask_for_reverse (vectype);
> +       perm_dest = copy_ssa_name (vec_lhs_phi);
> +       perm_stmt
> +             = gimple_build_assign (perm_dest, VEC_PERM_EXPR, vec_lhs_phi,
> +                                    vec_lhs_phi, perm_elem);
> +       vect_finish_stmt_generation (loop_vinfo, stmt_info, perm_stmt,
> +                                    &gsi);
> +       vec_lhs_phi = perm_dest;
> +     }
> +
> +      gimple_seq_add_seq (&stmts, tem);
> +
> +      scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
> +                              mask, vec_lhs_phi);
> +
> +      /* Convert the extracted vector element to the scalar type.  */
> +      new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> +    }
> +  else
> +    {
> +      tree bftype = TREE_TYPE (vectype);
> +      if (VECTOR_BOOLEAN_TYPE_P (vectype))
> +     bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
> +      new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
> +      new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
> +                                    &stmts, true, NULL_TREE);
> +    }
> +
> +  *exit_gsi = gsi_after_labels (exit_bb);
> +  if (stmts)
> +    gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);
> +
> +  return new_tree;
> +}
> +
> +/* Find the edge that's the final one in the path from SRC to DEST and
> +   return it.  At most one forwarder edge may separate SRC and DEST.  */
> +
> +static edge
> +find_connected_edge (edge src, basic_block dest)
> +{
> +   if (src->dest == dest)
> +     return src;
> +
> +  edge e;
> +  edge_iterator ei;
> +
> +  FOR_EACH_EDGE (e, ei, dest->preds)
> +    {
> +      if (src->dest == e->src)
> +     return e;
> +    }

isn't that just find_edge (src->dest, dest)?
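
I.e. the body could simply be (sketch):

  if (src->dest == dest)
    return src;
  return find_edge (src->dest, dest);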

> +  return NULL;
> +}
> +
>  /* Function vectorizable_live_operation.
>  
>     STMT_INFO computes a value that is used outside the loop.  Check if
> @@ -10505,7 +10690,8 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>    int vec_entry = 0;
>    poly_uint64 vec_index = 0;
>  
> -  gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
> +  gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
> +           || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
>  
>    /* If a stmt of a reduction is live, vectorize it via
>       vect_create_epilog_for_reduction.  vectorizable_reduction assessed
> @@ -10530,8 +10716,22 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>        if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
>         || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
>       return true;
> +
> +      /* For an early break we only have to materialize the reduction on the
> +      merge block, but we have to find an alternate exit first.  */
> +      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
> +     {
> +       for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
> +         if (exit != LOOP_VINFO_IV_EXIT (loop_vinfo))
> +           vect_create_epilog_for_reduction (loop_vinfo, stmt_info,
> +                                             slp_node, slp_node_instance,
> +                                             exit, false);

Hmm, for each one.  But we only need a single reduction epilogue, no?
In the merge block?

> +     }
> +
>        vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
> -                                     slp_node_instance);
> +                                     slp_node_instance,
> +                                     LOOP_VINFO_IV_EXIT (loop_vinfo));
> +
>        return true;
>      }
>  
> @@ -10683,103 +10883,63 @@ vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
>          lhs' = new_tree;  */
>  
>        class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
> -      basic_block exit_bb = LOOP_VINFO_IV_EXIT (loop_vinfo)->dest;
> -      gcc_assert (single_pred_p (exit_bb));
> -
> -      tree vec_lhs_phi = copy_ssa_name (vec_lhs);
> -      gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
> -      SET_PHI_ARG_DEF (phi, LOOP_VINFO_IV_EXIT (loop_vinfo)->dest_idx, vec_lhs);
> -
> -      gimple_seq stmts = NULL;
> -      tree new_tree;
> -      if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
> -     {
> -       /* Emit:
> -
> -            SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
> -
> -          where VEC_LHS is the vectorized live-out result and MASK is
> -          the loop mask for the final iteration.  */
> -       gcc_assert (ncopies == 1 && !slp_node);
> -       gimple_seq tem = NULL;
> -       gimple_stmt_iterator gsi = gsi_last (tem);
> -       tree len
> -         = vect_get_loop_len (loop_vinfo, &gsi,
> -                              &LOOP_VINFO_LENS (loop_vinfo),
> -                              1, vectype, 0, 0);
> -
> -       /* BIAS - 1.  */
> -       signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> -       tree bias_minus_one
> -         = int_const_binop (MINUS_EXPR,
> -                            build_int_cst (TREE_TYPE (len), biasval),
> -                            build_one_cst (TREE_TYPE (len)));
> -
> -       /* LAST_INDEX = LEN + (BIAS - 1).  */
> -       tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
> -                                       len, bias_minus_one);
> -
> -       /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
> -       tree scalar_res
> -         = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
> -                         vec_lhs_phi, last_index);
> -
> -       /* Convert the extracted vector element to the scalar type.  */
> -       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> -     }
> -      else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
> -     {
> -       /* Emit:
> -
> -            SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
> -
> -          where VEC_LHS is the vectorized live-out result and MASK is
> -          the loop mask for the final iteration.  */
> -       gcc_assert (ncopies == 1 && !slp_node);
> -       tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
> -       gimple_seq tem = NULL;
> -       gimple_stmt_iterator gsi = gsi_last (tem);
> -       tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
> -                                       &LOOP_VINFO_MASKS (loop_vinfo),
> -                                       1, vectype, 0);
> -       gimple_seq_add_seq (&stmts, tem);
> -       tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
> -                                       mask, vec_lhs_phi);
> -
> -       /* Convert the extracted vector element to the scalar type.  */
> -       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
> -     }
> -      else
> -     {
> -       tree bftype = TREE_TYPE (vectype);
> -       if (VECTOR_BOOLEAN_TYPE_P (vectype))
> -         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
> -       new_tree = build3 (BIT_FIELD_REF, bftype,
> -                          vec_lhs_phi, bitsize, bitstart);
> -       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
> -                                        &stmts, true, NULL_TREE);
> -     }
> +      /* Check if we have a loop where the chosen exit is not the main exit.
> +      In these cases, for an early break we restart the iteration the vector
> +      code was executing.  For the live values we want the value at the start
> +      of the iteration rather than at the end.  */
> +      edge main_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
> +      bool restart_loop = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
> +      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
> +     if (!is_gimple_debug (use_stmt)
> +         && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
> +       {
> +         basic_block use_bb = gimple_bb (use_stmt);
> +         if (!is_a <gphi *> (use_stmt))
> +           continue;
> +         for (auto exit_e : get_loop_exit_edges (loop))
> +           {
> +             /* See if this exit leads to the value.  */
> +             edge dest_e = find_connected_edge (exit_e, use_bb);
> +             if (!dest_e || PHI_ARG_DEF_FROM_EDGE (use_stmt, dest_e) != lhs)
> +               continue;
>  
> -      gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
> -      if (stmts)
> -     gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
> +             gimple *tmp_vec_stmt = vec_stmt;
> +             tree tmp_vec_lhs = vec_lhs;
> +             tree tmp_bitstart = bitstart;
> +             /* For an early exit where the exit is not in the BB that leads
> +                to the latch, we're restarting the iteration in the scalar
> +                loop.  So get the first live value.  */
> +             restart_loop = restart_loop || exit_e != main_e;
> +             if (restart_loop)
> +               {
> +                 tmp_vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
> +                 tmp_vec_lhs = gimple_get_lhs (tmp_vec_stmt);
> +                 tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
> +               }
>  
> -      /* Remove existing phis that copy from lhs and create copies
> -      from new_tree.  */
> -      gimple_stmt_iterator gsi;
> -      for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi);)
> -     {
> -       gimple *phi = gsi_stmt (gsi);
> -       if ((gimple_phi_arg_def (phi, 0) == lhs))
> -         {
> -           remove_phi_node (&gsi, false);
> -           tree lhs_phi = gimple_phi_result (phi);
> -           gimple *copy = gimple_build_assign (lhs_phi, new_tree);
> -           gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
> -         }
> -       else
> -         gsi_next (&gsi);
> -     }
> +             gimple_stmt_iterator exit_gsi;
> +             tree new_tree
> +               = vectorizable_live_operation_1 (loop_vinfo, stmt_info,
> +                                                exit_e, vectype, ncopies,
> +                                                slp_node, bitsize,
> +                                                tmp_bitstart, tmp_vec_lhs,
> +                                                lhs_type, restart_loop,
> +                                                &exit_gsi);
> +
> +             /* Use the empty block on the exit to materialize the new stmts
> +                so we can update the PHI here.  */
> +             if (gimple_phi_num_args (use_stmt) == 1)
> +               {
> +                 auto gsi = gsi_for_stmt (use_stmt);
> +                 remove_phi_node (&gsi, false);
> +                 tree lhs_phi = gimple_phi_result (use_stmt);
> +                 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
> +                 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
> +               }
> +             else
> +               SET_PHI_ARG_DEF (use_stmt, dest_e->dest_idx, new_tree);
> +           }
> +       }

Difficult to see what changed due to the split out, guess it'll be ok.

>        /* There a no further out-of-loop uses of lhs by LC-SSA construction.  */
>        FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
> @@ -11797,6 +11957,21 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
>             if (dump_enabled_p ())
>               dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
>             vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
> +           /* If vectorizing early break we must also vectorize the use of
> +              the PHIs as a live operation.  */
> +           if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
> +               && !STMT_VINFO_LIVE_P (stmt_info)
> +               && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
> +             {
> +               if (dump_enabled_p ())
> +                 dump_printf_loc (MSG_NOTE, vect_location,
> +                      "----> vectorizing early break reduc or induc phi: %G",
> +                      (gimple *) phi);
> +               bool done
> +                 = vectorizable_live_operation (loop_vinfo, stmt_info, NULL,
> +                                                NULL, -1, true, NULL);

you should be able to amend can_vectorize_live_stmts instead by
adding || (LOOP_VINFO_EARLY_BREAKS (loop_vinfo) && vect_induction_def),
then we keep it at one place also where we'd handle the SLP case.
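
I.e. the guard around the vectorizable_live_operation call in
can_vectorize_live_stmts would become something like (untested sketch;
loop_vinfo would need to be obtained from vinfo there):

  if (STMT_VINFO_LIVE_P (stmt_info)
      || (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
          && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def))
    {
      /* ... call vectorizable_live_operation as before ...  */
    }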

> +               gcc_assert (done);
> +             }
>           }
>       }
>  
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index fe38beb4fa1d9f8593445354f56ba52e10a040cd..f1b6a13395f286f9997530bbe57cda3a00502f8f 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -342,6 +342,7 @@ is_simple_and_all_uses_invariant (stmt_vec_info stmt_info,
>     - it has uses outside the loop.
>     - it has vdefs (it alters memory).
>     - control stmts in the loop (except for the exit condition).
> +   - it is an induction and we have multiple exits.
>  
>     CHECKME: what other side effects would the vectorizer allow?  */
>  
> @@ -399,6 +400,19 @@ vect_stmt_relevant_p (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
>       }
>      }
>  
> +  /* Check if it's an induction and we have multiple exits.  In this case
> +     there will be a use after peeling which is needed for the alternate
> +     exit.  */
> +  if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
> +      && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
> +    {
> +      if (dump_enabled_p ())
> +       dump_printf_loc (MSG_NOTE, vect_location,
> +                        "vec_stmt_relevant_p: induction forced for "
> +                        "early break.\n");
> +      *relevant = vect_used_in_scope;
> +

I think you should instead set *live_p?

> +    }
> +
>    if (*live_p && *relevant == vect_unused_in_scope
>        && !is_simple_and_all_uses_invariant (stmt_info, loop_vinfo))
>      {
> @@ -1774,7 +1788,7 @@ compare_step_with_zero (vec_info *vinfo, stmt_vec_info stmt_info)
>  /* If the target supports a permute mask that reverses the elements in
>     a vector of type VECTYPE, return that mask, otherwise return null.  */
>  
> -static tree
> +tree
>  perm_mask_for_reverse (tree vectype)
>  {
>    poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 076a698eb4288f68e81f91923f7e3e8d181ad685..de673ae56eac455c9560a29d7f3792b6c3c49f3b 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -2227,6 +2227,7 @@ extern bool vect_can_advance_ivs_p (loop_vec_info);
>  extern void vect_update_inits_of_drs (loop_vec_info, tree, tree_code);
>  extern edge vec_init_loop_exit_info (class loop *);
>  extern bool vect_is_loop_exit_latch_pred (edge, class loop *);
> +extern void vect_iv_increment_position (edge, gimple_stmt_iterator *, bool *);
>  
>  /* In tree-vect-stmts.cc.  */
>  extern tree get_related_vectype_for_scalar_type (machine_mode, tree,
> @@ -2248,6 +2249,7 @@ extern bool vect_is_simple_use (vec_info *, stmt_vec_info, slp_tree,
>                               enum vect_def_type *,
>                               tree *, stmt_vec_info * = NULL);
>  extern bool vect_maybe_update_slp_op_vectype (slp_tree, tree);
> +extern tree perm_mask_for_reverse (tree);
>  extern bool supportable_widening_operation (vec_info*, code_helper,
>                                           stmt_vec_info, tree, tree,
>                                           code_helper*, code_helper*,
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
