On Tue, 15 Aug 2023, Juzhe-Zhong wrote:

> Hi, Richard and Richi.
> 
> This patch adds MASK_LEN_{LOAD_LANES,STORE_LANES} support to the
> vectorizer.
> 
> Consider this simple case:
> 
> void __attribute__ ((noinline, noclone))
> foo (int *__restrict a, int *__restrict b, int *__restrict c,
>         int *__restrict d, int *__restrict e, int *__restrict f,
>         int *__restrict g, int *__restrict h, int *__restrict j, int n)
> {
>   for (int i = 0; i < n; ++i)
>     {
>       a[i] = j[i * 8];
>       b[i] = j[i * 8 + 1];
>       c[i] = j[i * 8 + 2];
>       d[i] = j[i * 8 + 3];
>       e[i] = j[i * 8 + 4];
>       f[i] = j[i * 8 + 5];
>       g[i] = j[i * 8 + 6];
>       h[i] = j[i * 8 + 7];
>     }
> }
> 
> RVV Gimple IR:
> 
>   _79 = .SELECT_VL (ivtmp_81, POLY_INT_CST [4, 4]);
>   ivtmp_125 = _79 * 32;
>   vect_array.8 = .MASK_LEN_LOAD_LANES (vectp_j.6_124, 32B, { -1, ... }, _79, 0);
>   vect__8.9_122 = vect_array.8[0];
>   vect__8.10_121 = vect_array.8[1];
>   vect__8.11_120 = vect_array.8[2];
>   vect__8.12_119 = vect_array.8[3];
>   vect__8.13_118 = vect_array.8[4];
>   vect__8.14_117 = vect_array.8[5];
>   vect__8.15_116 = vect_array.8[6];
>   vect__8.16_115 = vect_array.8[7];
>   vect_array.8 ={v} {CLOBBER};
>   ivtmp_114 = _79 * 4;
>   .MASK_LEN_STORE (vectp_a.17_113, 32B, { -1, ... }, _79, 0, vect__8.9_122);
>   .MASK_LEN_STORE (vectp_b.19_109, 32B, { -1, ... }, _79, 0, vect__8.10_121);
>   .MASK_LEN_STORE (vectp_c.21_105, 32B, { -1, ... }, _79, 0, vect__8.11_120);
>   .MASK_LEN_STORE (vectp_d.23_101, 32B, { -1, ... }, _79, 0, vect__8.12_119);
>   .MASK_LEN_STORE (vectp_e.25_97, 32B, { -1, ... }, _79, 0, vect__8.13_118);
>   .MASK_LEN_STORE (vectp_f.27_93, 32B, { -1, ... }, _79, 0, vect__8.14_117);
>   .MASK_LEN_STORE (vectp_g.29_89, 32B, { -1, ... }, _79, 0, vect__8.15_116);
>   .MASK_LEN_STORE (vectp_h.31_85, 32B, { -1, ... }, _79, 0, vect__8.16_115);
> 
> ASM:
> 
> foo:
>       lw      t4,8(sp)
>       ld      t5,0(sp)
>       ble     t4,zero,.L5
> .L3:
>       vsetvli t1,t4,e8,mf4,ta,ma
>       vlseg8e32.v     v8,(t5)
>       slli    t3,t1,2
>       slli    t6,t1,5
>       vse32.v v8,0(a0)
>       vse32.v v9,0(a1)
>       vse32.v v10,0(a2)
>       vse32.v v11,0(a3)
>       vse32.v v12,0(a4)
>       vse32.v v13,0(a5)
>       vse32.v v14,0(a6)
>       vse32.v v15,0(a7)
>       sub     t4,t4,t1
>       add     t5,t5,t6
>       add     a0,a0,t3
>       add     a1,a1,t3
>       add     a2,a2,t3
>       add     a3,a3,t3
>       add     a4,a4,t3
>       add     a5,a5,t3
>       add     a6,a6,t3
>       add     a7,a7,t3
>       bne     t4,zero,.L3
> .L5:
>       ret
> 
> The details of the approach:
> 
> Step 1 - Modify the LANES LOAD/STORE support functions
> (vect_load_lanes_supported/vect_store_lanes_supported):
> 
> +/* Return FN if vec_{masked_,mask_len,}load_lanes is available for COUNT
> +   vectors of type VECTYPE.  MASKED_P says whether the masked form is needed. */
>  
> -bool
> +internal_fn
>  vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
>                          bool masked_p)
>  {
> -  if (masked_p)
> -    return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
> -                                      vec_mask_load_lanes_optab,
> -                                      vectype, count);
> +  if (vect_lanes_optab_supported_p ("vec_mask_len_load_lanes",
> +                                 vec_mask_len_load_lanes_optab,
> +                                 vectype, count))
> +    return IFN_MASK_LEN_LOAD_LANES;
> +  else if (masked_p)
> +    {
> +      if (vect_lanes_optab_supported_p ("vec_mask_load_lanes",
> +                                     vec_mask_load_lanes_optab,
> +                                     vectype, count))
> +     return IFN_MASK_LOAD_LANES;
> +    }
>    else
> -    return vect_lanes_optab_supported_p ("vec_load_lanes",
> -                                      vec_load_lanes_optab,
> -                                      vectype, count);
> +    {
> +      if (vect_lanes_optab_supported_p ("vec_load_lanes",
> +                                     vec_load_lanes_optab,
> +                                     vectype, count))
> +     return IFN_LOAD_LANES;
> +    }
> +  return IFN_LAST;
>  }
>  
> Instead of returning TRUE or FALSE to indicate whether the target supports
> LANES LOAD/STORE, the function now returns the internal_fn of the LANES
> LOAD/STORE variant that the target supports, or IFN_LAST if the target
> supports none of the LANES LOAD/STORE optabs.
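> 
> For example, a caller now checks the returned IFN against IFN_LAST instead
> of a boolean (a minimal sketch, simplified from the tree-vect-loop.cc and
> tree-vect-slp.cc hunks below):
> 
>   internal_fn ifn
>     = vect_load_lanes_supported (vectype, group_size, masked_p);
>   if (ifn != IFN_LAST)
>     {
>       /* Some LOAD_LANES variant is available: IFN_LOAD_LANES,
>          IFN_MASK_LOAD_LANES or IFN_MASK_LEN_LOAD_LANES.  */
>     }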
> 
> Step 2 - Compute the IFN for LANES LOAD/STORE (computed only once).
> 
>       if (!STMT_VINFO_STRIDED_P (first_stmt_info)
>         && (can_overrun_p || !would_overrun_p)
>         && compare_step_with_zero (vinfo, stmt_info) > 0)
>       {
>         /* First cope with the degenerate case of a single-element
>            vector.  */
>         if (known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U))
>           ;
> 
>         else
>           {
>             /* Otherwise try using LOAD/STORE_LANES.  */
>             *lanes_ifn
>               = vls_type == VLS_LOAD
>                   ? vect_load_lanes_supported (vectype, group_size, masked_p)
>                   : vect_store_lanes_supported (vectype, group_size,
>                                                 masked_p);
>             if (*lanes_ifn != IFN_LAST)
>               {
>                 *memory_access_type = VMAT_LOAD_STORE_LANES;
>                 overrun_p = would_overrun_p;
>               }
> 
>             /* If that fails, try using permuting loads.  */
>             else if (vls_type == VLS_LOAD
>                        ? vect_grouped_load_supported (vectype,
>                                                       single_element_p,
>                                                       group_size)
>                        : vect_grouped_store_supported (vectype, group_size))
>               {
>                 *memory_access_type = VMAT_CONTIGUOUS_PERMUTE;
>                 overrun_p = would_overrun_p;
>               }
>           }
>       }
> 
> Step 3 - Build MASK_LEN_{LOAD_LANES,STORE_LANES} Gimple IR:
> 
> +       if (lanes_ifn == IFN_MASK_LEN_STORE_LANES)
> +         {
> +           if (loop_lens)
> +             final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> +                                            ncopies, vectype, j, 1);
> +           else
> +             final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
> +           signed char biasval
> +             = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +           bias = build_int_cst (intQI_type_node, biasval);
> +           if (!final_mask)
> +             {
> +               mask_vectype = truth_type_for (vectype);
> +               final_mask = build_minus_one_cst (mask_vectype);
> +             }
> +         }
> +
>         gcall *call;
> -       if (final_mask)
> +       if (final_len && final_mask)
> +         {
> +           /* Emit:
> +                MASK_LEN_STORE_LANES (DATAREF_PTR, ALIAS_PTR, VEC_MASK,
> +                                      LEN, BIAS, VEC_ARRAY).  */
> +           unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
> +           tree alias_ptr = build_int_cst (ref_type, align);
> +           call = gimple_build_call_internal (IFN_MASK_LEN_STORE_LANES, 6,
> +                                              dataref_ptr, alias_ptr,
> +                                              final_mask, final_len, bias,
> +                                              vec_array);
> +         }
> +       else if (final_mask)
> 
> The LEN and MASK handling follows the same flow as the other MASK_LEN_*
> loads/stores.
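> 
> With this in place, a masked+length store group is emitted as a single
> six-argument call per vector iteration (a sketch with illustrative SSA
> names, following the DATAREF_PTR, ALIAS_PTR, VEC_MASK, LEN, BIAS, VEC_ARRAY
> order from the comment above):
> 
>   .MASK_LEN_STORE_LANES (vectp_a, 32B, { -1, ... }, _len, 0, vect_array);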

OK.

Thanks,
Richard.

> gcc/ChangeLog:
> 
>         * internal-fn.cc (internal_load_fn_p): Apply
>         MASK_LEN_{LOAD_LANES,STORE_LANES} into vectorizer.
>         (internal_store_fn_p): Ditto.
>         (internal_fn_len_index): Ditto.
>         (internal_fn_mask_index): Ditto.
>         (internal_fn_stored_value_index): Ditto.
>         * tree-vect-data-refs.cc (vect_store_lanes_supported): Ditto.
>         (vect_load_lanes_supported): Ditto.
>         * tree-vect-loop.cc: Ditto.
>         * tree-vect-slp.cc (vect_slp_prefer_store_lanes_p): Ditto.
>         * tree-vect-stmts.cc (check_load_store_for_partial_vectors): Ditto.
>         (get_group_load_store_type): Ditto.
>         (vectorizable_store): Ditto.
>         (vectorizable_load): Ditto.
>         * tree-vectorizer.h (vect_store_lanes_supported): Ditto.
>         (vect_load_lanes_supported): Ditto.
> 
> ---
>  gcc/internal-fn.cc         |   7 ++
>  gcc/tree-vect-data-refs.cc |  61 ++++++++++------
>  gcc/tree-vect-loop.cc      |  11 +--
>  gcc/tree-vect-slp.cc       |   2 +-
>  gcc/tree-vect-stmts.cc     | 141 ++++++++++++++++++++++++++++---------
>  gcc/tree-vectorizer.h      |   4 +-
>  6 files changed, 163 insertions(+), 63 deletions(-)
> 
> diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
> index 4f2b20a79e5..cc1ede58799 100644
> --- a/gcc/internal-fn.cc
> +++ b/gcc/internal-fn.cc
> @@ -4578,6 +4578,7 @@ internal_load_fn_p (internal_fn fn)
>      case IFN_MASK_LOAD:
>      case IFN_LOAD_LANES:
>      case IFN_MASK_LOAD_LANES:
> +    case IFN_MASK_LEN_LOAD_LANES:
>      case IFN_GATHER_LOAD:
>      case IFN_MASK_GATHER_LOAD:
>      case IFN_MASK_LEN_GATHER_LOAD:
> @@ -4600,6 +4601,7 @@ internal_store_fn_p (internal_fn fn)
>      case IFN_MASK_STORE:
>      case IFN_STORE_LANES:
>      case IFN_MASK_STORE_LANES:
> +    case IFN_MASK_LEN_STORE_LANES:
>      case IFN_SCATTER_STORE:
>      case IFN_MASK_SCATTER_STORE:
>      case IFN_MASK_LEN_SCATTER_STORE:
> @@ -4672,6 +4674,8 @@ internal_fn_len_index (internal_fn fn)
>      case IFN_COND_LEN_NEG:
>      case IFN_MASK_LEN_LOAD:
>      case IFN_MASK_LEN_STORE:
> +    case IFN_MASK_LEN_LOAD_LANES:
> +    case IFN_MASK_LEN_STORE_LANES:
>        return 3;
>  
>      default:
> @@ -4689,8 +4693,10 @@ internal_fn_mask_index (internal_fn fn)
>      {
>      case IFN_MASK_LOAD:
>      case IFN_MASK_LOAD_LANES:
> +    case IFN_MASK_LEN_LOAD_LANES:
>      case IFN_MASK_STORE:
>      case IFN_MASK_STORE_LANES:
> +    case IFN_MASK_LEN_STORE_LANES:
>      case IFN_MASK_LEN_LOAD:
>      case IFN_MASK_LEN_STORE:
>        return 2;
> @@ -4726,6 +4732,7 @@ internal_fn_stored_value_index (internal_fn fn)
>        return 4;
>  
>      case IFN_MASK_LEN_STORE:
> +    case IFN_MASK_LEN_STORE_LANES:
>        return 5;
>  
>      default:
> diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
> index a3570c45b52..3e9a284666c 100644
> --- a/gcc/tree-vect-data-refs.cc
> +++ b/gcc/tree-vect-data-refs.cc
> @@ -5438,22 +5438,31 @@ vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
>    return false;
>  }
>  
> +/* Return FN if vec_{mask_,mask_len_}store_lanes is available for COUNT vectors
> +   of type VECTYPE.  MASKED_P says whether the masked form is needed.  */
>  
> -/* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
> -   type VECTYPE.  MASKED_P says whether the masked form is needed.  */
> -
> -bool
> +internal_fn
>  vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
>                           bool masked_p)
>  {
> -  if (masked_p)
> -    return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
> -                                      vec_mask_store_lanes_optab,
> -                                      vectype, count);
> +  if (vect_lanes_optab_supported_p ("vec_mask_len_store_lanes",
> +                                 vec_mask_len_store_lanes_optab, vectype,
> +                                 count))
> +    return IFN_MASK_LEN_STORE_LANES;
> +  else if (masked_p)
> +    {
> +      if (vect_lanes_optab_supported_p ("vec_mask_store_lanes",
> +                                     vec_mask_store_lanes_optab, vectype,
> +                                     count))
> +     return IFN_MASK_STORE_LANES;
> +    }
>    else
> -    return vect_lanes_optab_supported_p ("vec_store_lanes",
> -                                      vec_store_lanes_optab,
> -                                      vectype, count);
> +    {
> +      if (vect_lanes_optab_supported_p ("vec_store_lanes",
> +                                     vec_store_lanes_optab, vectype, count))
> +     return IFN_STORE_LANES;
> +    }
> +  return IFN_LAST;
>  }
>  
>  
> @@ -6056,21 +6065,31 @@ vect_grouped_load_supported (tree vectype, bool single_element_p,
>    return false;
>  }
>  
> -/* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
> -   type VECTYPE.  MASKED_P says whether the masked form is needed.  */
> +/* Return FN if vec_{masked_,mask_len_}load_lanes is available for COUNT vectors
> +   of type VECTYPE.  MASKED_P says whether the masked form is needed.  */
>  
> -bool
> +internal_fn
>  vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
>                          bool masked_p)
>  {
> -  if (masked_p)
> -    return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
> -                                      vec_mask_load_lanes_optab,
> -                                      vectype, count);
> +  if (vect_lanes_optab_supported_p ("vec_mask_len_load_lanes",
> +                                 vec_mask_len_load_lanes_optab, vectype,
> +                                 count))
> +    return IFN_MASK_LEN_LOAD_LANES;
> +  else if (masked_p)
> +    {
> +      if (vect_lanes_optab_supported_p ("vec_mask_load_lanes",
> +                                     vec_mask_load_lanes_optab, vectype,
> +                                     count))
> +     return IFN_MASK_LOAD_LANES;
> +    }
>    else
> -    return vect_lanes_optab_supported_p ("vec_load_lanes",
> -                                      vec_load_lanes_optab,
> -                                      vectype, count);
> +    {
> +      if (vect_lanes_optab_supported_p ("vec_load_lanes", vec_load_lanes_optab,
> +                                     vectype, count))
> +     return IFN_LOAD_LANES;
> +    }
> +  return IFN_LAST;
>  }
>  
>  /* Function vect_permute_load_chain.
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index bc3063c3615..1fcd8d07ea1 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -2839,7 +2839,8 @@ start_over:
>            instructions record it and move on to the next instance.  */
>         if (loads_permuted
>             && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
> -           && vect_store_lanes_supported (vectype, group_size, false))
> +           && vect_store_lanes_supported (vectype, group_size, false)
> +                != IFN_LAST)
>           {
>             FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
>               {
> @@ -2848,9 +2849,9 @@ start_over:
>                 /* Use SLP for strided accesses (or if we can't
>                    load-lanes).  */
>                 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
> -                   || ! vect_load_lanes_supported
> +                   || vect_load_lanes_supported
>                           (STMT_VINFO_VECTYPE (stmt_vinfo),
> -                          DR_GROUP_SIZE (stmt_vinfo), false))
> +                          DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
>                   break;
>               }
>  
> @@ -3153,7 +3154,7 @@ again:
>        vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
>        unsigned int size = DR_GROUP_SIZE (vinfo);
>        tree vectype = STMT_VINFO_VECTYPE (vinfo);
> -      if (! vect_store_lanes_supported (vectype, size, false)
> +      if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
>        && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
>        && ! vect_grouped_store_supported (vectype, size))
>       return opt_result::failure_at (vinfo->stmt,
> @@ -3165,7 +3166,7 @@ again:
>         bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
>         size = DR_GROUP_SIZE (vinfo);
>         vectype = STMT_VINFO_VECTYPE (vinfo);
> -       if (! vect_load_lanes_supported (vectype, size, false)
> +       if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
>             && ! vect_grouped_load_supported (vectype, single_element_p,
>                                               size))
>           return opt_result::failure_at (vinfo->stmt,
> diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> index cf91b21cf7d..9ad2634762e 100644
> --- a/gcc/tree-vect-slp.cc
> +++ b/gcc/tree-vect-slp.cc
> @@ -3094,7 +3094,7 @@ vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
>    if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
>        || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
>      return false;
> -  return vect_store_lanes_supported (vectype, group_size, false);
> +  return vect_store_lanes_supported (vectype, group_size, false) != IFN_LAST;
>  }
>  
>  /* Analyze an SLP instance starting from a group of grouped stores.  Call
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 86d033aa60c..cd8e0a76374 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -1610,9 +1610,15 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
>    bool is_load = (vls_type == VLS_LOAD);
>    if (memory_access_type == VMAT_LOAD_STORE_LANES)
>      {
> -      if (is_load
> -       ? !vect_load_lanes_supported (vectype, group_size, true)
> -       : !vect_store_lanes_supported (vectype, group_size, true))
> +      internal_fn ifn
> +     = (is_load ? vect_load_lanes_supported (vectype, group_size, true)
> +                : vect_store_lanes_supported (vectype, group_size, true));
> +      if (ifn == IFN_MASK_LEN_LOAD_LANES || ifn == IFN_MASK_LEN_STORE_LANES)
> +     vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
> +      else if (ifn == IFN_MASK_LOAD_LANES || ifn == IFN_MASK_STORE_LANES)
> +     vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype,
> +                            scalar_mask);
> +      else
>       {
>         if (dump_enabled_p ())
>           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> @@ -1620,10 +1626,7 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
>                            " the target doesn't have an appropriate"
>                            " load/store-lanes instruction.\n");
>         LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
> -       return;
>       }
> -      vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype,
> -                          scalar_mask);
>        return;
>      }
>  
> @@ -2074,7 +2077,8 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
>                          poly_int64 *poffset,
>                          dr_alignment_support *alignment_support_scheme,
>                          int *misalignment,
> -                        gather_scatter_info *gs_info)
> +                        gather_scatter_info *gs_info,
> +                        internal_fn *lanes_ifn)
>  {
>    loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
>    class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
> @@ -2272,24 +2276,30 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
>         if (known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U))
>           ;
>  
> -       /* Otherwise try using LOAD/STORE_LANES.  */
> -       else if (vls_type == VLS_LOAD
> -                ? vect_load_lanes_supported (vectype, group_size, masked_p)
> -                : vect_store_lanes_supported (vectype, group_size,
> -                                              masked_p))
> +       else
>           {
> -           *memory_access_type = VMAT_LOAD_STORE_LANES;
> -           overrun_p = would_overrun_p;
> -         }
> +           /* Otherwise try using LOAD/STORE_LANES.  */
> +           *lanes_ifn
> +             = vls_type == VLS_LOAD
> +                 ? vect_load_lanes_supported (vectype, group_size, masked_p)
> +                 : vect_store_lanes_supported (vectype, group_size,
> +                                               masked_p);
> +           if (*lanes_ifn != IFN_LAST)
> +             {
> +               *memory_access_type = VMAT_LOAD_STORE_LANES;
> +               overrun_p = would_overrun_p;
> +             }
>  
> -       /* If that fails, try using permuting loads.  */
> -       else if (vls_type == VLS_LOAD
> -                ? vect_grouped_load_supported (vectype, single_element_p,
> -                                               group_size)
> -                : vect_grouped_store_supported (vectype, group_size))
> -         {
> -           *memory_access_type = VMAT_CONTIGUOUS_PERMUTE;
> -           overrun_p = would_overrun_p;
> +           /* If that fails, try using permuting loads.  */
> +           else if (vls_type == VLS_LOAD
> +                      ? vect_grouped_load_supported (vectype,
> +                                                     single_element_p,
> +                                                     group_size)
> +                      : vect_grouped_store_supported (vectype, group_size))
> +             {
> +               *memory_access_type = VMAT_CONTIGUOUS_PERMUTE;
> +               overrun_p = would_overrun_p;
> +             }
>           }
>       }
>  
> @@ -2378,7 +2388,8 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
>                    poly_int64 *poffset,
>                    dr_alignment_support *alignment_support_scheme,
>                    int *misalignment,
> -                  gather_scatter_info *gs_info)
> +                  gather_scatter_info *gs_info,
> +                  internal_fn *lanes_ifn)
>  {
>    loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
>    poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> @@ -2441,7 +2452,7 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
>                                     masked_p,
>                                     vls_type, memory_access_type, poffset,
>                                     alignment_support_scheme,
> -                                   misalignment, gs_info))
> +                                   misalignment, gs_info, lanes_ifn))
>       return false;
>      }
>    else if (STMT_VINFO_STRIDED_P (stmt_info))
> @@ -3087,11 +3098,8 @@ vect_get_loop_variant_data_ptr_increment (
>    loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo);
>    tree step = vect_dr_behavior (vinfo, dr_info)->step;
>  
> -  /* TODO: We don't support gather/scatter or load_lanes/store_lanes for pointer
> -     IVs are updated by variable amount but we will support them in the future.
> -   */
> -  gcc_assert (memory_access_type != VMAT_GATHER_SCATTER
> -           && memory_access_type != VMAT_LOAD_STORE_LANES);
> +  /* gather/scatter never reach here.  */
> +  gcc_assert (memory_access_type != VMAT_GATHER_SCATTER);
>  
>    /* When we support SELECT_VL pattern, we dynamic adjust
>       the memory address by .SELECT_VL result.
> @@ -8094,9 +8102,11 @@ vectorizable_store (vec_info *vinfo,
>    enum dr_alignment_support alignment_support_scheme;
>    int misalignment;
>    poly_int64 poffset;
> +  internal_fn lanes_ifn;
>    if (!get_load_store_type (vinfo, stmt_info, vectype, slp_node, mask, vls_type,
>                           ncopies, &memory_access_type, &poffset,
> -                         &alignment_support_scheme, &misalignment, &gs_info))
> +                         &alignment_support_scheme, &misalignment, &gs_info,
> +                         &lanes_ifn))
>      return false;
>  
>    if (mask)
> @@ -8885,6 +8895,8 @@ vectorizable_store (vec_info *vinfo,
>           }
>  
>         tree final_mask = NULL;
> +       tree final_len = NULL;
> +       tree bias = NULL;
>         if (loop_masks)
>           final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
>                                            ncopies, vectype, j);
> @@ -8892,8 +8904,37 @@ vectorizable_store (vec_info *vinfo,
>           final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
>                                          final_mask, vec_mask, gsi);
>  
> +       if (lanes_ifn == IFN_MASK_LEN_STORE_LANES)
> +         {
> +           if (loop_lens)
> +             final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> +                                            ncopies, vectype, j, 1);
> +           else
> +             final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
> +           signed char biasval
> +             = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +           bias = build_int_cst (intQI_type_node, biasval);
> +           if (!final_mask)
> +             {
> +               mask_vectype = truth_type_for (vectype);
> +               final_mask = build_minus_one_cst (mask_vectype);
> +             }
> +         }
> +
>         gcall *call;
> -       if (final_mask)
> +       if (final_len && final_mask)
> +         {
> +           /* Emit:
> +                MASK_LEN_STORE_LANES (DATAREF_PTR, ALIAS_PTR, VEC_MASK,
> +                                      LEN, BIAS, VEC_ARRAY).  */
> +           unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
> +           tree alias_ptr = build_int_cst (ref_type, align);
> +           call = gimple_build_call_internal (IFN_MASK_LEN_STORE_LANES, 6,
> +                                              dataref_ptr, alias_ptr,
> +                                              final_mask, final_len, bias,
> +                                              vec_array);
> +         }
> +       else if (final_mask)
>           {
>             /* Emit:
>                  MASK_STORE_LANES (DATAREF_PTR, ALIAS_PTR, VEC_MASK,
> @@ -9598,9 +9639,11 @@ vectorizable_load (vec_info *vinfo,
>    enum dr_alignment_support alignment_support_scheme;
>    int misalignment;
>    poly_int64 poffset;
> +  internal_fn lanes_ifn;
>    if (!get_load_store_type (vinfo, stmt_info, vectype, slp_node, mask, VLS_LOAD,
>                           ncopies, &memory_access_type, &poffset,
> -                         &alignment_support_scheme, &misalignment, &gs_info))
> +                         &alignment_support_scheme, &misalignment, &gs_info,
> +                         &lanes_ifn))
>      return false;
>  
>    if (mask)
> @@ -10386,6 +10429,8 @@ vectorizable_load (vec_info *vinfo,
>         tree vec_array = create_vector_array (vectype, vec_num);
>  
>         tree final_mask = NULL_TREE;
> +       tree final_len = NULL_TREE;
> +       tree bias = NULL_TREE;
>         if (loop_masks)
>           final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
>                                            ncopies, vectype, j);
> @@ -10393,8 +10438,36 @@ vectorizable_load (vec_info *vinfo,
>           final_mask = prepare_vec_mask (loop_vinfo, mask_vectype, final_mask,
>                                          vec_mask, gsi);
>  
> +       if (lanes_ifn == IFN_MASK_LEN_LOAD_LANES)
> +         {
> +           if (loop_lens)
> +             final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
> +                                            ncopies, vectype, j, 1);
> +           else
> +             final_len = size_int (TYPE_VECTOR_SUBPARTS (vectype));
> +           signed char biasval
> +             = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
> +           bias = build_int_cst (intQI_type_node, biasval);
> +           if (!final_mask)
> +             {
> +               mask_vectype = truth_type_for (vectype);
> +               final_mask = build_minus_one_cst (mask_vectype);
> +             }
> +         }
> +
>         gcall *call;
> -       if (final_mask)
> +       if (final_len && final_mask)
> +         {
> +           /* Emit:
> +                VEC_ARRAY = MASK_LEN_LOAD_LANES (DATAREF_PTR, ALIAS_PTR,
> +                                                 VEC_MASK, LEN, BIAS).  */
> +           unsigned int align = TYPE_ALIGN (TREE_TYPE (vectype));
> +           tree alias_ptr = build_int_cst (ref_type, align);
> +           call = gimple_build_call_internal (IFN_MASK_LEN_LOAD_LANES, 5,
> +                                              dataref_ptr, alias_ptr,
> +                                              final_mask, final_len, bias);
> +         }
> +       else if (final_mask)
>           {
>             /* Emit:
>                  VEC_ARRAY = MASK_LOAD_LANES (DATAREF_PTR, ALIAS_PTR,
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 1de144988c8..53a3d78d545 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -2297,9 +2297,9 @@ extern tree bump_vector_ptr (vec_info *, tree, gimple *, gimple_stmt_iterator *,
>  extern void vect_copy_ref_info (tree, tree);
>  extern tree vect_create_destination_var (tree, tree);
>  extern bool vect_grouped_store_supported (tree, unsigned HOST_WIDE_INT);
> -extern bool vect_store_lanes_supported (tree, unsigned HOST_WIDE_INT, bool);
> +extern internal_fn vect_store_lanes_supported (tree, unsigned HOST_WIDE_INT, bool);
>  extern bool vect_grouped_load_supported (tree, bool, unsigned HOST_WIDE_INT);
> -extern bool vect_load_lanes_supported (tree, unsigned HOST_WIDE_INT, bool);
> +extern internal_fn vect_load_lanes_supported (tree, unsigned HOST_WIDE_INT, bool);
>  extern void vect_permute_store_chain (vec_info *, vec<tree> &,
>                                     unsigned int, stmt_vec_info,
>                                     gimple_stmt_iterator *, vec<tree> *);
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
