On Tue, Nov 4, 2025 at 4:00 PM Robin Dapp <[email protected]> wrote:
>
> Hi,
>
> Similar to the signed/unsigned patch before, this one relaxes the
> gather/scatter restrictions on scale factors. The basic idea is that a
> natively unsupported scale factor can still be reached by emitting a
> multiplication before the actual gather operation. As before, we need
> to make sure that there is no overflow when multiplying.
>
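> To illustrate (schematic GIMPLE with made-up SSA names; the exact
> internal-function operands are elided), a gather requesting scale 8 on
> a target that only supports scale 4, i.e.
>
>   dest = .MASK_GATHER_LOAD (base, offs, 8, ...);
>
> would instead be emitted with the leftover factor as an explicit
> multiplication of the offset:
>
>   offs2 = offs * 2;
>   dest = .MASK_GATHER_LOAD (base, offs2, 4, ...);
>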
> The approach is similar to before, just that there are two more "passes"
> that check which scale/offset-type combinations the target supports.
>
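> Concretely, a candidate scale passes the new checks iff it is smaller
> than and divides the requested scale.  As a standalone sketch for
> illustration only (scale_replaceable_p is a made-up name, not a
> function in the patch):
>
>   /* A smaller TRY_SCALE can stand in for the requested SCALE iff it
>      divides it; the leftover factor becomes an explicit multiply of
>      the offset vector.  */
>   static bool
>   scale_replaceable_p (int scale, int try_scale, int *multiplier)
>   {
>     if (try_scale >= scale || scale % try_scale != 0)
>       return false;
>     *multiplier = scale / try_scale;
>     return true;
>   }
>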
> Bootstrapped and regtested on x86 and power10. Regtested on aarch64 and
> rv64gcv_zvl512b (with and without target support for scaling).
OK.
Thanks,
Richard.
> Regards
> Robin
>
> gcc/ChangeLog:
>
> * tree-vect-data-refs.cc (struct gather_scatter_config):
> Add scale.
> (vect_gather_scatter_get_configs): Try various scales.
> (vect_gather_scatter_fn_p): Add scale handling.
> (vect_check_gather_scatter): Add scale parameter.
> * tree-vect-stmts.cc (check_load_store_for_partial_vectors):
> Ditto.
> (vect_truncate_gather_scatter_offset): Ditto.
> (vect_use_grouped_gather): Ditto.
> (get_load_store_type): Ditto.
> (vectorizable_store): Scale offset if necessary.
> (vectorizable_load): Ditto.
> * tree-vectorizer.h (struct vect_load_store_data): Add
> supported_scale.
> (vect_gather_scatter_fn_p): Add argument.
> ---
> gcc/tree-vect-data-refs.cc | 181 +++++++++++++++++++++++++++++--------
> gcc/tree-vect-stmts.cc | 71 ++++++++++++---
> gcc/tree-vectorizer.h | 11 ++-
> 3 files changed, 210 insertions(+), 53 deletions(-)
>
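> As a worked example of the precision requirement in the new third pass
> (my reading; needed_precision = min (2 * offset_precision,
> POINTER_SIZE)): a 16-bit offset on a 64-bit target needs a supported
> offset vectype with at least 32-bit elements, which conservatively
> guarantees that the widened offset multiplied by scale / try_scale
> cannot overflow.
>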
> diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
> index fb2450a30c4..9c87cc1de6d 100644
> --- a/gcc/tree-vect-data-refs.cc
> +++ b/gcc/tree-vect-data-refs.cc
> @@ -4431,6 +4431,7 @@ struct gather_scatter_config
> {
> internal_fn ifn;
> tree offset_vectype;
> + int scale;
> vec<int> elsvals;
> };
>
> @@ -4523,38 +4524,62 @@ vect_gather_scatter_get_configs (vec_info *vinfo, bool read_p, bool masked_p,
> if (!offset_vectype)
> continue;
>
> - vec<int> elsvals = vNULL;
> + /* Try multiple scale values. Start with exact match, then try
> +     smaller common scales that a target might support.  */
> + int scales_to_try[] = {scale, 1, 2, 4, 8};
>
> - /* If we haven't determined which IFN is supported yet, try all three
> - to find which one the target supports. */
> - if (ifn == IFN_LAST)
> + for (unsigned int j = 0;
> + j < sizeof (scales_to_try) / sizeof (*scales_to_try);
> + j++)
> {
> - ifn = vect_gather_scatter_which_ifn (read_p, masked_p,
> - vectype, memory_type,
> -                                               offset_vectype, scale, &elsvals);
> - if (ifn != IFN_LAST)
> + int try_scale = scales_to_try[j];
> +
> + /* Skip scales >= requested scale (except for exact match). */
> + if (j > 0 && try_scale >= scale)
> + continue;
> +
> + /* Skip if requested scale is not a multiple of this scale. */
> + if (j > 0 && scale % try_scale != 0)
> + continue;
> +
> + vec<int> elsvals = vNULL;
> +
> +          /* If we haven't determined which IFN is supported yet, try all three
> + to find which one the target supports. */
> + if (ifn == IFN_LAST)
> {
> - /* Found which IFN is supported. Save this configuration. */
> - gather_scatter_config config;
> - config.ifn = ifn;
> - config.offset_vectype = offset_vectype;
> - config.elsvals = elsvals;
> - configs.safe_push (config);
> + ifn = vect_gather_scatter_which_ifn (read_p, masked_p,
> + vectype, memory_type,
> + offset_vectype, try_scale,
> + &elsvals);
> + if (ifn != IFN_LAST)
> + {
> +              /* Found which IFN is supported.  Save this configuration.  */
> + gather_scatter_config config;
> + config.ifn = ifn;
> + config.offset_vectype = offset_vectype;
> + config.scale = try_scale;
> + config.elsvals = elsvals;
> + configs.safe_push (config);
> + }
> }
> - }
> - else
> - {
> - /* We already know which IFN is supported, just check if this
> - offset type works with it. */
> -      if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
> - offset_vectype, scale,
> - &elsvals))
> + else
> {
> - gather_scatter_config config;
> - config.ifn = ifn;
> - config.offset_vectype = offset_vectype;
> - config.elsvals = elsvals;
> - configs.safe_push (config);
> + /* We already know which IFN is supported, just check if this
> + offset type and scale work with it. */
> + if (internal_gather_scatter_fn_supported_p (ifn, vectype,
> + memory_type,
> + offset_vectype,
> + try_scale,
> + &elsvals))
> + {
> + gather_scatter_config config;
> + config.ifn = ifn;
> + config.offset_vectype = offset_vectype;
> + config.scale = try_scale;
> + config.elsvals = elsvals;
> + configs.safe_push (config);
> + }
> }
> }
> }
> @@ -4570,6 +4595,11 @@ vect_gather_scatter_get_configs (vec_info *vinfo, bool read_p, bool masked_p,
> base address. If OFFSET_TYPE is scalar the function chooses an
> appropriate vector type for it. SCALE is the amount by which the
> offset should be multiplied *after* it has been converted to address
> width.
> +   SCALE is the requested scale.  If the target does not support the
> +   requested SCALE, SUPPORTED_SCALE will contain the scale that is
> +   actually supported, which may be smaller and then requires an
> +   additional multiplication of the offset.
> +   Otherwise SUPPORTED_SCALE is 0.
>
> Return true if the function is supported, storing the function id in
> *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT.
> @@ -4582,12 +4612,14 @@ vect_gather_scatter_get_configs (vec_info *vinfo, bool read_p, bool masked_p,
> bool
> vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
> tree vectype, tree memory_type, tree offset_type,
> - int scale, internal_fn *ifn_out,
> + int scale, int *supported_scale,
> + internal_fn *ifn_out,
> tree *offset_vectype_out,
> tree *supported_offset_vectype,
> vec<int> *elsvals)
> {
> *supported_offset_vectype = NULL_TREE;
> + *supported_scale = 0;
> unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
> unsigned int element_bits = vector_element_bits (vectype);
> if (element_bits != memory_bits)
> @@ -4609,11 +4641,19 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
> if (configs.is_empty ())
> return false;
>
> - /* First, try to find a configuration that matches our offset type
> - (no conversion needed). */
> + /* Selection priority:
> + 1 - Exact scale match + offset type match
> + 2 - Exact scale match + sign-swapped offset
> + 3 - Smaller scale + offset type match
> + 4 - Smaller scale + sign-swapped offset
> + Within each category, prefer smaller offset types. */
> +
> + /* First pass: exact scale match with no conversion. */
> for (unsigned int i = 0; i < configs.length (); i++)
> {
> -      if (TYPE_SIGN (configs[i].offset_vectype) == TYPE_SIGN (offset_vectype))
> + if (configs[i].scale == scale
> + && TYPE_SIGN (configs[i].offset_vectype)
> + == TYPE_SIGN (offset_vectype))
> {
> *ifn_out = configs[i].ifn;
> *offset_vectype_out = configs[i].offset_vectype;
> @@ -4623,19 +4663,77 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
> }
> }
>
> - /* No direct match. This means we try to find a sign-swapped offset
> - vectype. */
> +  /* No direct match.  This means we try to find either
> +     - a sign-swapped offset vectype,
> +     - a smaller scale and a 2x larger offset type, or
> +     - a smaller scale and a larger sign-swapped offset vectype.  */
>    unsigned int offset_precision = TYPE_PRECISION (TREE_TYPE (offset_vectype));
> unsigned int needed_precision
> = TYPE_UNSIGNED (offset_vectype) ? offset_precision * 2 : POINTER_SIZE;
> needed_precision = std::min (needed_precision, (unsigned) POINTER_SIZE);
>
> +  /* Second pass: exact scale match with a sign-swapped offset
> +     vectype.  */
> enum tree_code tmp;
> for (unsigned int i = 0; i < configs.length (); i++)
> {
> unsigned int precision
> = TYPE_PRECISION (TREE_TYPE (configs[i].offset_vectype));
> - if (precision >= needed_precision
> + if (configs[i].scale == scale
> + && precision >= needed_precision
> + && (supportable_convert_operation (CONVERT_EXPR,
> + configs[i].offset_vectype,
> + offset_vectype, &tmp)
> + || (needed_precision == offset_precision
> + && tree_nop_conversion_p (configs[i].offset_vectype,
> + offset_vectype))))
> + {
> + *ifn_out = configs[i].ifn;
> + *offset_vectype_out = offset_vectype;
> + *supported_offset_vectype = configs[i].offset_vectype;
> + if (elsvals)
> + *elsvals = configs[i].elsvals;
> + return true;
> + }
> + }
> +
> + /* Third pass: Try a smaller scale with the same signedness. */
> + needed_precision = offset_precision * 2;
> + needed_precision = std::min (needed_precision, (unsigned) POINTER_SIZE);
> +
> + for (unsigned int i = 0; i < configs.length (); i++)
> + {
> + unsigned int precision
> + = TYPE_PRECISION (TREE_TYPE (configs[i].offset_vectype));
> + if (configs[i].scale < scale
> + && precision >= needed_precision
> + && (supportable_convert_operation (CONVERT_EXPR,
> + configs[i].offset_vectype,
> + offset_vectype, &tmp)
> + || (needed_precision == offset_precision
> + && tree_nop_conversion_p (configs[i].offset_vectype,
> + offset_vectype))))
> + {
> + *ifn_out = configs[i].ifn;
> + *offset_vectype_out = configs[i].offset_vectype;
> + *supported_scale = configs[i].scale;
> + if (elsvals)
> + *elsvals = configs[i].elsvals;
> + return true;
> + }
> + }
> +
> + /* Fourth pass: Try a smaller scale and sign-swapped offset vectype. */
> + needed_precision
> + = TYPE_UNSIGNED (offset_vectype) ? offset_precision * 2 : POINTER_SIZE;
> + needed_precision = std::min (needed_precision, (unsigned) POINTER_SIZE);
> +
> + for (unsigned int i = 0; i < configs.length (); i++)
> + {
> + unsigned int precision
> + = TYPE_PRECISION (TREE_TYPE (configs[i].offset_vectype));
> + if (configs[i].scale < scale
> + && precision >= needed_precision
> && (supportable_convert_operation (CONVERT_EXPR,
> configs[i].offset_vectype,
> offset_vectype, &tmp)
> @@ -4646,6 +4744,7 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
> *ifn_out = configs[i].ifn;
> *offset_vectype_out = offset_vectype;
> *supported_offset_vectype = configs[i].offset_vectype;
> + *supported_scale = configs[i].scale;
> if (elsvals)
> *elsvals = configs[i].elsvals;
> return true;
> @@ -4805,6 +4904,7 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, tree vectype,
>
> base = fold_convert (sizetype, base);
> base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
> + int tmp_scale;
> tree tmp_offset_vectype;
>
> /* OFF at this point may be either a SSA_NAME or some tree expression
> @@ -4878,14 +4978,16 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, tree vectype,
> && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
> masked_p, vectype,
> memory_type,
> signed_char_type_node,
> - new_scale, &ifn,
> + new_scale, &tmp_scale,
> + &ifn,
> &offset_vectype,
> &tmp_offset_vectype,
> elsvals)
> && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
> masked_p, vectype,
> memory_type,
> unsigned_char_type_node,
> - new_scale, &ifn,
> + new_scale, &tmp_scale,
> + &ifn,
> &offset_vectype,
> &tmp_offset_vectype,
> elsvals))
> @@ -4910,7 +5012,9 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, tree vectype,
> && !POINTER_TYPE_P (TREE_TYPE (off))
> && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
> masked_p, vectype, memory_type,
> - TREE_TYPE (off), scale, &ifn,
> + TREE_TYPE (off),
> + scale, &tmp_scale,
> + &ifn,
> &offset_vectype,
> &tmp_offset_vectype,
> elsvals))
> @@ -4966,7 +5070,8 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, tree vectype,
> if (use_ifn_p)
> {
> if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
> - vectype, memory_type, offtype, scale,
> + vectype, memory_type, offtype,
> + scale, &tmp_scale,
> &ifn, &offset_vectype,
> &tmp_offset_vectype,
> elsvals))
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 84ba756a042..d153544640a 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -1512,6 +1512,9 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
> we chose a different one use this instead. */
> if (ls->supported_offset_vectype)
> off_vectype = ls->supported_offset_vectype;
> + /* Same for scale. */
> + if (ls->supported_scale)
> + scale = ls->supported_scale;
>
> if (internal_gather_scatter_fn_supported_p (len_ifn, vectype,
> memory_type,
> @@ -1706,8 +1709,10 @@ vect_truncate_gather_scatter_offset (stmt_vec_info stmt_info, tree vectype,
> no narrower than OFFSET_TYPE. */
> tree memory_type = TREE_TYPE (DR_REF (dr));
> tree tmp_offset_vectype;
> + int tmp_scale;
> if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
> - vectype, memory_type, offset_type, scale,
> + vectype, memory_type, offset_type,
> + scale, &tmp_scale,
> &gs_info->ifn, &gs_info->offset_vectype,
> &tmp_offset_vectype, elsvals)
> || gs_info->ifn == IFN_LAST)
> @@ -1789,9 +1794,10 @@ vect_use_grouped_gather (dr_vec_info *dr_info, tree vectype,
> not available we still have a strided load/store. */
> bool ok = false;
> tree tmp_vectype;
> + int tmp_scale;
> if (vect_gather_scatter_fn_p
> (loop_vinfo, DR_IS_READ (dr), masked_p, *pun_vectype,
> - TREE_TYPE (*pun_vectype), *pun_vectype, 1, &ifn,
> + TREE_TYPE (*pun_vectype), *pun_vectype, 1, &tmp_scale, &ifn,
> &offset_vectype, &tmp_vectype, elsvals))
> ok = true;
> else if (internal_strided_fn_supported_p (strided_ifn, *pun_vectype,
> @@ -2091,6 +2097,7 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
> bool *slp_perm = &ls->slp_perm;
> unsigned *n_perms = &ls->n_perms;
> tree *supported_offset_vectype = &ls->supported_offset_vectype;
> + int *supported_scale = &ls->supported_scale;
> loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
> poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
> class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
> @@ -2164,7 +2171,7 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
> tree tem;
> if (vect_gather_scatter_fn_p (loop_vinfo, vls_type == VLS_LOAD,
> masked_p, vectype, memory_type,
> - offset_vectype, scale,
> + offset_vectype, scale, supported_scale,
> &ls->gs.ifn, &tem,
> supported_offset_vectype, elsvals))
> {
> @@ -2179,6 +2186,10 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
> dump_printf_loc (MSG_NOTE, vect_location,
> " target supports offset type %T.\n",
> *supported_offset_vectype);
> + if (*supported_scale)
> + dump_printf_loc (MSG_NOTE, vect_location,
> + " target supports offset scale %d.\n",
> + *supported_scale);
> }
> *memory_access_type = VMAT_GATHER_SCATTER_IFN;
> }
> @@ -2455,7 +2466,7 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
> gcc_assert (vect_gather_scatter_fn_p
> (loop_vinfo, vls_type == VLS_LOAD, masked_p, vectype,
> gs_info.memory_type, TREE_TYPE (gs_info.offset),
> - gs_info.scale, &gs_info.ifn,
> + gs_info.scale, supported_scale, &gs_info.ifn,
> &tmp, supported_offset_vectype, elsvals));
>
> SLP_TREE_GS_SCALE (slp_node) = gs_info.scale;
> @@ -8853,6 +8864,10 @@ vectorizable_store (vec_info *vinfo,
> inside_cost
> += record_stmt_cost (cost_vec, 1, vector_stmt,
> slp_node, 0, vect_body);
> + if (ls.supported_scale)
> + inside_cost
> + += record_stmt_cost (cost_vec, 1, vector_stmt,
> + slp_node, 0, vect_body);
>
> unsigned int cnunits = vect_nunits_for_cost (vectype);
> inside_cost
> @@ -8867,12 +8882,26 @@ vectorizable_store (vec_info *vinfo,
> tree scale = size_int (SLP_TREE_GS_SCALE (slp_node));
> bool strided = !VECTOR_TYPE_P (TREE_TYPE (vec_offset));
>
> - /* Perform the offset conversion if necessary. */
> - if (!strided && ls.supported_offset_vectype)
> + /* Perform the offset conversion and scaling if necessary. */
> + if (!strided
> + && (ls.supported_offset_vectype || ls.supported_scale))
> {
> gimple_seq stmts = NULL;
> - vec_offset = gimple_convert
> - (&stmts, ls.supported_offset_vectype, vec_offset);
> + if (ls.supported_offset_vectype)
> + vec_offset = gimple_convert
> + (&stmts, ls.supported_offset_vectype, vec_offset);
> + if (ls.supported_scale)
> + {
> + tree mult_cst = build_int_cst
> + (TREE_TYPE (TREE_TYPE (vec_offset)),
> + SLP_TREE_GS_SCALE (slp_node) / ls.supported_scale);
> + tree mult = build_vector_from_val
> + (TREE_TYPE (vec_offset), mult_cst);
> + vec_offset = gimple_build
> + (&stmts, MULT_EXPR, TREE_TYPE (vec_offset),
> + vec_offset, mult);
> + scale = size_int (ls.supported_scale);
> + }
> gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> }
>
> @@ -10694,6 +10723,10 @@ vectorizable_load (vec_info *vinfo,
> inside_cost
> += record_stmt_cost (cost_vec, 1, vector_stmt,
> slp_node, 0, vect_body);
> + if (ls.supported_scale)
> + inside_cost
> + += record_stmt_cost (cost_vec, 1, vector_stmt,
> + slp_node, 0, vect_body);
>
> unsigned int cnunits = vect_nunits_for_cost (vectype);
> inside_cost
> @@ -10707,12 +10740,26 @@ vectorizable_load (vec_info *vinfo,
> tree scale = size_int (SLP_TREE_GS_SCALE (slp_node));
> bool strided = !VECTOR_TYPE_P (TREE_TYPE (vec_offset));
>
> - /* Perform the offset conversion if necessary. */
> - if (!strided && ls.supported_offset_vectype)
> + /* Perform the offset conversion and scaling if necessary. */
> + if (!strided
> + && (ls.supported_offset_vectype || ls.supported_scale))
> {
> gimple_seq stmts = NULL;
> - vec_offset = gimple_convert
> - (&stmts, ls.supported_offset_vectype, vec_offset);
> + if (ls.supported_offset_vectype)
> + vec_offset = gimple_convert
> + (&stmts, ls.supported_offset_vectype, vec_offset);
> + if (ls.supported_scale)
> + {
> + tree mult_cst = build_int_cst
> + (TREE_TYPE (TREE_TYPE (vec_offset)),
> + SLP_TREE_GS_SCALE (slp_node) / ls.supported_scale);
> + tree mult = build_vector_from_val
> + (TREE_TYPE (vec_offset), mult_cst);
> + vec_offset = gimple_build
> + (&stmts, MULT_EXPR, TREE_TYPE (vec_offset),
> + vec_offset, mult);
> + scale = size_int (ls.supported_scale);
> + }
> gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
> }
>
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index a49fb9cb1ad..70dea5a6ad6 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -290,9 +290,14 @@ struct vect_load_store_data : vect_data {
> tree strided_offset_vectype; // VMAT_GATHER_SCATTER_IFN, originally strided
> tree ls_type; // VMAT_GATHER_SCATTER_IFN
> /* This is set to a supported offset vector type if we don't support the
> - originally requested offset type. In that case there will be an
> - additional offset conversion before the gather/scatter. */
> + originally requested offset type, otherwise NULL.
> +     If non-NULL there will be an additional offset conversion before
> + the gather/scatter. */
> tree supported_offset_vectype; // VMAT_GATHER_SCATTER_IFN
> + /* Similar for scale. Only nonzero if we don't support the requested
> + scale. Then we need to multiply the offset vector before the
> + gather/scatter. */
> + int supported_scale; // VMAT_GATHER_SCATTER_IFN
> auto_vec<int> elsvals;
> /* True if the load requires a load permutation. */
> bool slp_perm; // SLP_TREE_LOAD_PERMUTATION
> @@ -2596,7 +2601,7 @@ extern bool vect_slp_analyze_instance_alignment (vec_info *, slp_instance);
> extern opt_result vect_analyze_data_ref_accesses (vec_info *, vec<int> *);
> extern opt_result vect_prune_runtime_alias_test_list (loop_vec_info);
> extern bool vect_gather_scatter_fn_p (vec_info *, bool, bool, tree, tree,
> - tree, int, internal_fn *, tree *,
> + tree, int, int *, internal_fn *, tree *,
> tree *, vec<int> * = nullptr);
> extern bool vect_check_gather_scatter (stmt_vec_info, tree,
> loop_vec_info, gather_scatter_info *,
> --
> 2.51.0
>
>