Hi, Richi. Thanks so much. I have addressed the comments and sent the V3 patch: https://gcc.gnu.org/pipermail/gcc-patches/2023-July/623566.html
Thanks. juzhe.zh...@rivai.ai From: Richard Biener Date: 2023-07-04 17:27 To: Ju-Zhe Zhong CC: gcc-patches; richard.sandiford Subject: Re: [PATCH V2] VECT: Apply LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer On Tue, 4 Jul 2023, juzhe.zh...@rivai.ai wrote: > From: Ju-Zhe Zhong <juzhe.zh...@rivai.ai> > > Hi, Richard and Richi. > > The len_mask_gather_load/len_mask_scatter_store patterns have been added. > Now, this patch applies them into vectorizer. > > Here is the example: > > void > f (int *restrict a, > int *restrict b, int n, > int base, int step, > int *restrict cond) > { > for (int i = 0; i < n; ++i) > { > if (cond[i]) > a[i * 4] = b[i]; > } > } > > Gimple IR: > > <bb 3> [local count: 105119324]: > _58 = (unsigned long) n_13(D); > > <bb 4> [local count: 630715945]: > # vectp_cond.7_45 = PHI <vectp_cond.7_46(4), cond_14(D)(3)> > # vectp_b.11_51 = PHI <vectp_b.11_52(4), b_15(D)(3)> > # vectp_a.14_55 = PHI <vectp_a.14_56(4), a_16(D)(3)> > # ivtmp_59 = PHI <ivtmp_60(4), _58(3)> > _61 = .SELECT_VL (ivtmp_59, POLY_INT_CST [2, 2]); > ivtmp_44 = _61 * 4; > vect__4.9_47 = .LEN_MASK_LOAD (vectp_cond.7_45, 32B, _61, 0, { -1, ... }); > mask__24.10_49 = vect__4.9_47 != { 0, ... }; > vect__8.13_53 = .LEN_MASK_LOAD (vectp_b.11_51, 32B, _61, 0, mask__24.10_49); > ivtmp_54 = _61 * 16; > .LEN_MASK_SCATTER_STORE (vectp_a.14_55, { 0, 16, 32, ... }, 1, > vect__8.13_53, _61, 0, mask__24.10_49); > vectp_cond.7_46 = vectp_cond.7_45 + ivtmp_44; > vectp_b.11_52 = vectp_b.11_51 + ivtmp_44; > vectp_a.14_56 = vectp_a.14_55 + ivtmp_54; > ivtmp_60 = ivtmp_59 - _61; > if (ivtmp_60 != 0) > goto <bb 4>; [83.33%] > else > goto <bb 5>; [16.67%] > > gcc/ChangeLog: > > * optabs-query.cc (supports_vec_gather_load_p): Apply > LEN_MASK_GATHER_LOAD/SCATTER_STORE into vectorizer. > (supports_vec_scatter_store_p): Ditto. > * tree-vect-data-refs.cc (vect_gather_scatter_fn_p): Ditto. > * tree-vect-stmts.cc (check_load_store_for_partial_vectors): Ditto. > (vect_get_strided_load_store_ops): Ditto. 
> (vectorizable_store): Ditto. > (vectorizable_load): Ditto. > > --- > gcc/optabs-query.cc | 2 + > gcc/tree-vect-data-refs.cc | 15 +++- > gcc/tree-vect-stmts.cc | 136 ++++++++++++++++++++++++++++++++----- > 3 files changed, 134 insertions(+), 19 deletions(-) > > diff --git a/gcc/optabs-query.cc b/gcc/optabs-query.cc > index 2fdd0d34354..bf1f484e874 100644 > --- a/gcc/optabs-query.cc > +++ b/gcc/optabs-query.cc > @@ -676,6 +676,7 @@ supports_vec_gather_load_p (machine_mode mode) > this_fn_optabs->supports_vec_gather_load[mode] > = (supports_vec_convert_optab_p (gather_load_optab, mode) > || supports_vec_convert_optab_p (mask_gather_load_optab, mode) > + || supports_vec_convert_optab_p (len_mask_gather_load_optab, mode) > ? 1 : -1); > > return this_fn_optabs->supports_vec_gather_load[mode] > 0; > @@ -692,6 +693,7 @@ supports_vec_scatter_store_p (machine_mode mode) > this_fn_optabs->supports_vec_scatter_store[mode] > = (supports_vec_convert_optab_p (scatter_store_optab, mode) > || supports_vec_convert_optab_p (mask_scatter_store_optab, mode) > + || supports_vec_convert_optab_p (len_mask_scatter_store_optab, mode) > ? 1 : -1); > > return this_fn_optabs->supports_vec_scatter_store[mode] > 0; > diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc > index ebe93832b1e..8d32eb3c83b 100644 > --- a/gcc/tree-vect-data-refs.cc > +++ b/gcc/tree-vect-data-refs.cc > @@ -3873,16 +3873,24 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool > read_p, bool masked_p, > return false; > > /* Work out which function we need. */ > - internal_fn ifn, alt_ifn; > + internal_fn ifn, alt_ifn, len_mask_ifn; > if (read_p) > { > ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD; > alt_ifn = IFN_MASK_GATHER_LOAD; > + /* When target supports LEN_MASK_GATHER_LOAD, we always > + use LEN_MASK_GATHER_LOAD regardless whether len and > + mask are valid or not. */ > + len_mask_ifn = IFN_LEN_MASK_GATHER_LOAD; > } > else > { > ifn = masked_p ? 
IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE; > alt_ifn = IFN_MASK_SCATTER_STORE; > + /* When target supports LEN_MASK_SCATTER_STORE, we always > + use LEN_MASK_SCATTER_STORE regardless whether len and > + mask are valid or not. */ > + len_mask_ifn = IFN_LEN_MASK_SCATTER_STORE; > } > > for (;;) > @@ -3893,7 +3901,10 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool > read_p, bool masked_p, > > /* Test whether the target supports this combination. */ > if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type, > - offset_vectype, scale)) > + offset_vectype, scale) > + || internal_gather_scatter_fn_supported_p (len_mask_ifn, vectype, > + memory_type, > + offset_vectype, scale)) > { > *ifn_out = ifn; I think *ifn_out should be len_mask_ifn if that matched, so instead add an else if just like we do for alt_ifn? > *offset_vectype_out = offset_vectype; > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc > index a0c39268bf0..1f607b7102b 100644 > --- a/gcc/tree-vect-stmts.cc > +++ b/gcc/tree-vect-stmts.cc > @@ -1771,6 +1771,18 @@ check_load_store_for_partial_vectors (loop_vec_info > loop_vinfo, tree vectype, > gs_info->offset_vectype, > gs_info->scale)) > { > + ifn = (is_load > + ? 
IFN_LEN_MASK_GATHER_LOAD > + : IFN_LEN_MASK_SCATTER_STORE); > + if (internal_gather_scatter_fn_supported_p (ifn, vectype, > + gs_info->memory_type, > + gs_info->offset_vectype, > + gs_info->scale)) > + { > + vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo); > + vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1); > + return; > + } > if (dump_enabled_p ()) > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > "can't operate on partial vectors because" > @@ -3129,16 +3141,39 @@ vect_get_gather_scatter_ops (loop_vec_info loop_vinfo, > static void > vect_get_strided_load_store_ops (stmt_vec_info stmt_info, > loop_vec_info loop_vinfo, > + gimple_stmt_iterator *gsi, > gather_scatter_info *gs_info, > - tree *dataref_bump, tree *vec_offset) > + tree *dataref_bump, tree *vec_offset, > + vec_loop_lens *loop_lens) > { > struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); > tree vectype = STMT_VINFO_VECTYPE (stmt_info); > > - tree bump = size_binop (MULT_EXPR, > - fold_convert (sizetype, unshare_expr (DR_STEP (dr))), > - size_int (TYPE_VECTOR_SUBPARTS (vectype))); > - *dataref_bump = cse_and_gimplify_to_preheader (loop_vinfo, bump); > + if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo)) > + { > + /* _31 = .SELECT_VL (ivtmp_29, POLY_INT_CST [4, 4]); > + ivtmp_8 = _31 * 16 (step in bytes); > + .LEN_MASK_SCATTER_STORE (vectp_a.9_7, ... 
); > + vectp_a.9_26 = vectp_a.9_7 + ivtmp_8; */ > + tree loop_len > + = vect_get_loop_len (loop_vinfo, gsi, loop_lens, 1, vectype, 0, 0); > + tree tmp > + = fold_build2 (MULT_EXPR, sizetype, > + fold_convert (sizetype, unshare_expr (DR_STEP (dr))), > + loop_len); > + tree bump = make_temp_ssa_name (sizetype, NULL, "ivtmp"); > + gassign *assign = gimple_build_assign (bump, tmp); > + gsi_insert_before (gsi, assign, GSI_SAME_STMT); > + *dataref_bump = bump; > + } > + else > + { > + tree bump > + = size_binop (MULT_EXPR, > + fold_convert (sizetype, unshare_expr (DR_STEP (dr))), > + size_int (TYPE_VECTOR_SUBPARTS (vectype))); > + *dataref_bump = cse_and_gimplify_to_preheader (loop_vinfo, bump); > + } > > /* The offset given in GS_INFO can have pointer type, so use the element > type of the vector instead. */ > @@ -8685,8 +8720,8 @@ vectorizable_store (vec_info *vinfo, > else if (memory_access_type == VMAT_GATHER_SCATTER) > { > aggr_type = elem_type; > - vect_get_strided_load_store_ops (stmt_info, loop_vinfo, &gs_info, > - &bump, &vec_offset); > + vect_get_strided_load_store_ops (stmt_info, loop_vinfo, gsi, &gs_info, > + &bump, &vec_offset, loop_lens); > } > else > { > @@ -8915,6 +8950,8 @@ vectorizable_store (vec_info *vinfo, > unsigned HOST_WIDE_INT align; > > tree final_mask = NULL_TREE; > + tree final_len = NULL_TREE; > + tree bias = NULL_TREE; > if (loop_masks) > final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks, > vec_num * ncopies, > @@ -8929,8 +8966,43 @@ vectorizable_store (vec_info *vinfo, > if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) > vec_offset = vec_offsets[vec_num * j + i]; > tree scale = size_int (gs_info.scale); > + > + if (internal_gather_scatter_fn_supported_p ( > + IFN_LEN_MASK_SCATTER_STORE, vectype, gs_info.memory_type, > + TREE_TYPE (vec_offset), gs_info.scale)) gs_info.ifn should have recorded the appropriate ifn, so you should be able to simplify this. 
> + { > + if (loop_lens) > + { > + final_len > + = vect_get_loop_len (loop_vinfo, gsi, loop_lens, > + vec_num * ncopies, vectype, > + vec_num * j + i, 1); > + } > + else > + { > + tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo); > + final_len > + = build_int_cst (iv_type, > + TYPE_VECTOR_SUBPARTS (vectype)); > + } > + signed char biasval > + = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo); > + bias = build_int_cst (intQI_type_node, biasval); > + if (!final_mask) > + { > + mask_vectype = truth_type_for (vectype); > + final_mask = build_minus_one_cst (mask_vectype); > + } > + } > + > gcall *call; > - if (final_mask) > + if (final_len && final_len) > + call > + = gimple_build_call_internal (IFN_LEN_MASK_SCATTER_STORE, > + 7, dataref_ptr, vec_offset, > + scale, vec_oprnd, final_len, > + bias, final_mask); > + else if (final_mask) > call = gimple_build_call_internal > (IFN_MASK_SCATTER_STORE, 5, dataref_ptr, vec_offset, > scale, vec_oprnd, final_mask); > @@ -9047,9 +9119,6 @@ vectorizable_store (vec_info *vinfo, > machine_mode vmode = TYPE_MODE (vectype); > machine_mode new_vmode = vmode; > internal_fn partial_ifn = IFN_LAST; > - /* Produce 'len' and 'bias' argument. 
*/ > - tree final_len = NULL_TREE; > - tree bias = NULL_TREE; > if (loop_lens) > { > opt_machine_mode new_ovmode > @@ -10177,8 +10246,8 @@ vectorizable_load (vec_info *vinfo, > else if (memory_access_type == VMAT_GATHER_SCATTER) > { > aggr_type = elem_type; > - vect_get_strided_load_store_ops (stmt_info, loop_vinfo, &gs_info, > - &bump, &vec_offset); > + vect_get_strided_load_store_ops (stmt_info, loop_vinfo, gsi, &gs_info, > + &bump, &vec_offset, loop_lens); > } > else > { > @@ -10339,6 +10408,8 @@ vectorizable_load (vec_info *vinfo, > for (i = 0; i < vec_num; i++) > { > tree final_mask = NULL_TREE; > + tree final_len = NULL_TREE; > + tree bias = NULL_TREE; > if (loop_masks > && memory_access_type != VMAT_INVARIANT) > final_mask = vect_get_loop_mask (loop_vinfo, gsi, loop_masks, > @@ -10368,8 +10439,42 @@ vectorizable_load (vec_info *vinfo, > vec_offset = vec_offsets[vec_num * j + i]; > tree zero = build_zero_cst (vectype); > tree scale = size_int (gs_info.scale); > + > + if (internal_gather_scatter_fn_supported_p ( > + IFN_LEN_MASK_GATHER_LOAD, vectype, > + gs_info.memory_type, TREE_TYPE (vec_offset), > + gs_info.scale)) Likewise. 
> + { > + if (loop_lens) > + { > + final_len = vect_get_loop_len ( > + loop_vinfo, gsi, loop_lens, vec_num * ncopies, > + vectype, vec_num * j + i, 1); > + } > + else > + { > + tree iv_type > + = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo); > + final_len = build_int_cst ( > + iv_type, TYPE_VECTOR_SUBPARTS (vectype)); > + } > + signed char biasval > + = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo); > + bias = build_int_cst (intQI_type_node, biasval); > + if (!final_mask) > + { > + mask_vectype = truth_type_for (vectype); > + final_mask = build_minus_one_cst (mask_vectype); > + } > + } > + > gcall *call; > - if (final_mask) > + if (final_len && final_mask) > + call = gimple_build_call_internal ( > + IFN_LEN_MASK_GATHER_LOAD, 7, dataref_ptr, > + vec_offset, scale, zero, final_len, bias, > + final_mask); > + else if (final_mask) > call = gimple_build_call_internal > (IFN_MASK_GATHER_LOAD, 5, dataref_ptr, > vec_offset, scale, zero, final_mask); > @@ -10462,9 +10567,6 @@ vectorizable_load (vec_info *vinfo, > machine_mode vmode = TYPE_MODE (vectype); > machine_mode new_vmode = vmode; > internal_fn partial_ifn = IFN_LAST; > - /* Produce 'len' and 'bias' argument. */ > - tree final_len = NULL_TREE; > - tree bias = NULL_TREE; > if (loop_lens) > { > opt_machine_mode new_ovmode > -- Richard Biener <rguent...@suse.de> SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman; HRB 36809 (AG Nuernberg)